# Import libraries

## Base

In [103]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import scipy as s
import sklearn

import IPython

import os
import random
import time

# Execute the shared config script inside this notebook's namespace so the
# names it defines are available to later cells.
# NOTE(review): setup_paths(), called in the "Set up" section, presumably
# comes from this script — confirm.
%run ../scripts/config.py

## Modelling

### Algorithms

In [104]:
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

### Helpers

In [105]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

## Viz

In [106]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix

In [107]:
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8

# Set up

In [108]:
# Path lookup returned by setup_paths(); the next cell reads its 'data_raw' entry.
PATHS = setup_paths()

In [109]:
# Both raw CSV files sit under the 'data_raw' location from PATHS.
raw_data_dir = PATHS['data_raw']
TRAIN_PATH = os.path.join(raw_data_dir, 'train.csv')
TEST_PATH = os.path.join(raw_data_dir, 'test.csv')


# Import data

In [110]:
# Read the train and test CSV files into DataFrames.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

In [111]:
# Holds references to both frames (not copies), so the cleaning and feature
# engineering loops below can modify df_train and df_test in one pass.
data_cleaner = [df_train, df_test]

# Exploration

Dataset description (Kaggle Titanic competition): https://www.kaggle.com/c/titanic/data

In [112]:
# Peek at five random rows. The fixed seed keeps the displayed sample
# identical across kernel restarts, so the saved output stays reproducible.
df_train.sample(5, random_state=42)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
745,746,0,1,"Crosby, Capt. Edward Gifford",male,70.0,1,1,WE/P 5735,71.0,B22,S
78,79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29.0,,S
255,256,1,3,"Touma, Mrs. Darwis (Hanne Youssef Razi)",female,29.0,0,2,2650,15.25,,C
446,447,1,2,"Mellinger, Miss. Madeleine Violet",female,13.0,0,1,250644,19.5,,S
123,124,1,2,"Webber, Miss. Susan",female,32.5,0,0,27267,13.0,E101,S


## Null values

In [113]:
# Per-column count of missing values, first for the training frame and then
# for the test frame, with a dashed rule between the two reports.
null_reports = [
    ("Null values in training data:", df_train),
    ("Null values in test data:", df_test),
]
for report_index, (header, frame) in enumerate(null_reports):
    if report_index > 0:
        print("-"*50)
    print(header)
    print(frame.isnull().sum())

Null values in training data:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
--------------------------------------------------
Null values in test data:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


# Clean data

In [114]:
def _fill_with_median(values):
    """Return `values` with its missing entries replaced by its own median."""
    return values.fillna(values.median())


for dataset in data_cleaner:
    # Age and Fare: impute inside each (Pclass, Sex) group with that group's median.
    for column in ('Age', 'Fare'):
        dataset[column] = dataset.groupby(['Pclass', 'Sex'])[column].transform(_fill_with_median)

    # Embarked: impute with the most frequent value found in this frame.
    most_common_port = dataset['Embarked'].mode()[0]
    dataset['Embarked'] = dataset['Embarked'].fillna(most_common_port)

drop_columns = ['PassengerId','Cabin', 'Ticket']

# Dropped in place on purpose: data_cleaner holds a reference to this same
# object, and rebinding df_train to a new frame would leave the list pointing
# at the old one. Only the training frame is trimmed here.
df_train.drop(columns=drop_columns, inplace=True)

# Feature Engineering

In [116]:
# Titles that occur fewer than this many times in a frame are grouped as 'Misc'.
MIN_TITLE_COUNT = 10

for dataset in data_cleaner:
    # siblings/spouses + parents/children + the passenger themself
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['IsAlone'] = np.where(dataset['FamilySize'] == 1, 1, 0)

    # Title is the text between the first comma and the following period
    # ("Crosby, Capt. Edward Gifford" -> "Capt"). An explicit regex is used
    # because str.split(". ") treats a multi-character pattern as a regex, in
    # which "." matches any character and truncates titles containing a space.
    dataset['Title'] = dataset['Name'].str.extract(r',\s*([^.]+)\.', expand=False).str.strip()

    # NOTE(review): quartile edges are computed separately for each frame, so
    # the FareBin intervals of train and test can differ — confirm this is intended.
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4)
    # Ten-unit wide brackets: (0, 10], (10, 20], ..., (90, 100].
    dataset['AgeBin'] = pd.cut(dataset['Age'], bins=list(range(0, 101, 10)))

    # Collapse infrequent titles into a single 'Misc' category.
    # NOTE(review): the rare set is also derived per frame.
    title_counts = dataset['Title'].value_counts()
    rare_titles = title_counts[title_counts < MIN_TITLE_COUNT].index
    dataset['Title'] = dataset['Title'].where(~dataset['Title'].isin(rare_titles), 'Misc')

# Fixed seed so the displayed preview is the same on every run.
df_train.sample(5, random_state=42)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Title,FareBin,AgeBin
425,0,3,"Wiseman, Mr. Phillippe",male,25.0,0,0,7.25,S,1,1,Mr,"(-0.001, 7.91]","(20, 30]"
145,0,2,"Nicholls, Mr. Joseph Charles",male,19.0,1,1,36.75,S,3,0,Mr,"(31.0, 512.329]","(10, 20]"
21,1,2,"Beesley, Mr. Lawrence",male,34.0,0,0,13.0,S,1,1,Mr,"(7.91, 14.454]","(30, 40]"
453,1,1,"Goldenberg, Mr. Samuel L",male,49.0,1,0,89.1,C,2,0,Mr,"(31.0, 512.329]","(40, 50]"
539,1,1,"Frolicher, Miss. Hedwig Margaritha",female,22.0,0,2,49.5,C,3,0,Miss,"(31.0, 512.329]","(20, 30]"


In [None]:
# LabelEncoder instance (imported from sklearn.preprocessing above); how it is
# applied is not visible in this part of the notebook.
label = LabelEncoder()