In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
%matplotlib inline

In [2]:
#Train and test data are merged together since it is a kaggle competition.  This eliminates possibilities that will not be detected

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [8]:
train['data_source'] = 'Train'
test['data_source'] = 'Test'

In [9]:
data = pd.concat([train, test])

In [10]:
data.isnull().sum()

Age             263
Cabin          1014
Embarked          2
Fare              1
Name              0
Parch             0
PassengerId       0
Pclass            0
Sex               0
SibSp             0
Survived        418
Ticket            0
data_source       0
dtype: int64

In [11]:
data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,data_source
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,Train
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,Train
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282,Train
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803,Train
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450,Train


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 13 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
data_source    1309 non-null object
dtypes: float64(3), int64(4), object(6)
memory usage: 143.2+ KB


In [13]:
data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,data_source
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,Train
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,Train
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282,Train
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803,Train
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450,Train


In [14]:
data['Cabin'].fillna('NA', inplace=True)

In [15]:
# Fill missing age with average ages of that pclass
data['Age'] = data.groupby('Pclass')['Age'].transform(lambda x: x.fillna(x.mean()))

In [16]:
data['Fare'].fillna(data['Fare'].mean(), inplace=True)

In [17]:
#fillna with mode
from scipy.stats import mode
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)

In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 13 columns):
Age            1309 non-null float64
Cabin          1309 non-null object
Embarked       1309 non-null object
Fare           1309 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
data_source    1309 non-null object
dtypes: float64(3), int64(4), object(6)
memory usage: 143.2+ KB


In [19]:
data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,data_source
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,Train
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,Train
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282,Train
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803,Train
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450,Train


In [20]:
data.columns

Index(['Age', 'Cabin', 'Embarked', 'Fare', 'Name', 'Parch', 'PassengerId',
       'Pclass', 'Sex', 'SibSp', 'Survived', 'Ticket', 'data_source'],
      dtype='object')

In [21]:
col_dummies = ['Cabin', 'Embarked', 'Pclass', 'Sex']
data = pd.get_dummies(data, columns= col_dummies)

In [22]:
data_train = data[data['data_source']=='Train']

In [23]:
data_train = data_train.drop(['Name', 'PassengerId', 'Ticket', 'data_source'], axis=1)

In [24]:
X = data_train.drop('Survived', axis=1)
y = data_train['Survived']

In [25]:
X.shape

(891, 199)

Model Training

In [26]:
# Importing training models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [27]:
#Splitting data into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=99)

In [28]:
print(len(X_train), len(X_test), len(y_train), len(y_test))

712 179 712 179


In [29]:
pipelines = {'l1': make_pipeline(StandardScaler(), LogisticRegression(penalty='l1', random_state=123)),
             'l2': make_pipeline(StandardScaler(), LogisticRegression(penalty='l2', random_state=123)),
             'rf': make_pipeline(StandardScaler(), RandomForestClassifier(random_state=123)),
             'gb': make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=123))
            }

In [30]:
l1_hyperparameters = {'logisticregression__C': np.linspace(1e-3, 1e3, 10)}
l2_hyperparameters = {'logisticregression__C': np.linspace(1e-3, 1e3, 10)}
rf_hyperparameters = {'randomforestclassifier__n_estimators': [100, 200],
                     'randomforestclassifier__max_features': ['auto', 'sqrt', 0.33],
                     }
gb_hyperparameters = {'gradientboostingclassifier__n_estimators': [100,200],
                     'gradientboostingclassifier__max_depth': [1,3,5],
                     'gradientboostingclassifier__learning_rate':[0.05, 0.1, 0.2]
                     }

In [31]:
hyperparameters = {'l1': l1_hyperparameters,
                  'l2': l2_hyperparameters,
                  'rf': rf_hyperparameters,
                  'gb': gb_hyperparameters
                  }

In [32]:
fitted_models = {}
for name, pipeline in pipelines.items():
    model  = GridSearchCV(pipeline, hyperparameters[name], cv=5, n_jobs=1)
    model.fit(X_train, y_train)
    
    fitted_models[name] = model
    print('{} has been fitted'.format(name))
    

l1 has been fitted
l2 has been fitted
rf has been fitted
gb has been fitted


In [33]:
#Evaluate the metrics

from sklearn.metrics import roc_curve, auc
for name, model in fitted_models.items():
    print(name, model.best_score_)

l1 0.804775280899
l2 0.799157303371
rf 0.823033707865
gb 0.832865168539


In [34]:
for name, model in fitted_models.items():
    pred = model.predict_proba(X_test)
    pred = pred[:,1]
    fpr, tpr, thresholds = roc_curve(y_test, pred)
    print(name, auc(fpr, tpr))
    

l1 0.715473317243
l2 0.805980155538
rf 0.85136765889
gb 0.843255564495


In [35]:
fitted_models['rf'].best_estimator_

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.33, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_i...imators=100, n_jobs=1,
            oob_score=False, random_state=123, verbose=0, warm_start=False))])

In [36]:
fitted_models['rf'].best_params_

{'randomforestclassifier__max_features': 0.33,
 'randomforestclassifier__n_estimators': 100}

In [37]:
data_test = data[data['data_source'] == 'Test']

In [38]:
data_test = data_test.drop(['Name', 'PassengerId', 'Ticket', 'data_source'], axis=1)

In [39]:
X_test_kaggle = data_test.drop('Survived', axis=1)
X_test_kaggle.shape

(418, 199)

In [40]:
X_test_kaggle.head()

Unnamed: 0,Age,Fare,Parch,SibSp,Cabin_A10,Cabin_A11,Cabin_A14,Cabin_A16,Cabin_A18,Cabin_A19,...,Cabin_NA,Cabin_T,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male
0,34.5,7.8292,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,1,0,1
1,47.0,7.0,0,1,0,0,0,0,0,0,...,1,0,0,0,1,0,0,1,1,0
2,62.0,9.6875,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,1,0,0,1
3,27.0,8.6625,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,1,0,1
4,22.0,12.2875,1,1,0,0,0,0,0,0,...,1,0,0,0,1,0,0,1,1,0


In [41]:
pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=123, max_features=0.33, n_estimators=100))

In [42]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.33, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_i...imators=100, n_jobs=1,
            oob_score=False, random_state=123, verbose=0, warm_start=False))])

In [44]:
y_pred_kaggle = pipeline.predict(X_test_kaggle)