In [64]:
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

import seaborn as sns

In [40]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler

In [4]:
# Makes it worse, don't scale
minmax_scaler = MinMaxScaler(feature_range=(0,1))

In [43]:
titanic_train = pd.read_csv('data/train.csv')

## EDA / Cleaning
***

In [44]:
# Turn Sex into Bool values
titanic_train['Sex'] = titanic_train['Sex'].map(lambda x: 0 if x == 'female' else 1)

In [45]:
# Turn Embarked into dummy variables, drop first dummy variable
embarked_dummies = pd.get_dummies(titanic_train['Embarked'], drop_first=True, prefix='Embarked_')

titanic_train = pd.concat([titanic_train, embarked_dummies], axis=1)

In [46]:
# drop unnecessary columns: ['Embarked', 'Name', 'Ticket', 'Cabin', 'Survived']
titanic_train.drop(columns=['Embarked', 'Name', 'Ticket', 'Cabin', 'Survived', 'PassengerId'], inplace=True)

In [47]:
# Scale Fare values
# titanic_train['scaler_fare'] = minmax_scaler.fit_transform(np.array(titanic_train[['Fare']]))

In [48]:
# Separate NaN values
titanic_age_null = pd.DataFrame(titanic_train[titanic_train.isnull().any(axis=1)])
titanic_age_null.drop(columns='Age', inplace=True)
titanic_age_null.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked__Q,Embarked__S
5,3,1,0,0,8.4583,1,0
17,2,1,0,0,13.0,0,1
19,3,0,0,0,7.225,0,0
26,3,1,0,0,7.225,0,0
28,3,0,0,0,7.8792,1,0


In [49]:
# Drop NaN and scale Age values
titanic_train.dropna(inplace=True)
# titanic_train['scaler_age'] = minmax_scaler.fit_transform(np.array(titanic_train[['Age']]))

In [50]:
titanic_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked__Q,Embarked__S
0,3,1,22.0,1,0,7.2500,0,1
1,1,0,38.0,1,0,71.2833,0,0
2,3,0,26.0,0,0,7.9250,0,1
3,1,0,35.0,1,0,53.1000,0,1
4,3,1,35.0,0,0,8.0500,0,1
...,...,...,...,...,...,...,...,...
885,3,0,39.0,0,5,29.1250,1,0
886,2,1,27.0,0,0,13.0000,0,1
887,1,0,19.0,0,0,30.0000,0,1
889,1,1,26.0,0,0,30.0000,0,0


## Creating Model to Predict Age NaN Values
***
### Train / Test Split

In [51]:
# Separate and split X and y values
X = titanic_train.drop(columns='Age')
y = titanic_train['Age']

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [52]:
X

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked__Q,Embarked__S
0,3,1,1,0,7.2500,0,1
1,1,0,1,0,71.2833,0,0
2,3,0,0,0,7.9250,0,1
3,1,0,1,0,53.1000,0,1
4,3,1,0,0,8.0500,0,1
...,...,...,...,...,...,...,...
885,3,0,0,5,29.1250,1,0
886,2,1,0,0,13.0000,0,1
887,1,0,0,0,30.0000,0,1
889,1,1,0,0,30.0000,0,0


### Gridsearch Models / Hyperparameters

In [54]:
models_dict = {'LinearRegression':LinearRegression(), 'Ridge':Ridge(), 
              'Lasso':Lasso(), 'RandomForestRegressor':RandomForestRegressor(),
              'GradientBoostingRegressor':GradientBoostingRegressor(),
              'AdaBoostRegressor':AdaBoostRegressor()}

In [56]:
params_dict = {'LinearRegression':{'normalize':[True, False], 'n_jobs':[-1]}, 
               'Ridge':{'normalize':[True, False], 'alpha':[0.01, 0.5, 3, 7]},
               'Lasso':{'normalize':[True, False], 'alpha':[0.01, 0.5, 3, 7]},
               'RandomForestRegressor':{'n_estimators':[100, 300, 500, 700],
                                        'n_jobs':[-1],
                                        'bootstrap':[True, False], 
                                        'max_features':['auto', 'sqrt']},
               'GradientBoostingRegressor':{'learning_rate':[0.01, 0.05, 0.1], 
                                            'n_estimators':[100, 300, 500, 700], 
                                            'max_features':['auto', 'sqrt']},
               'AdaBoostRegressor':{'learning_rate':[0.01, 0.05, 0.1], 
                                    'n_estimators':[100, 300, 500, 700], 
                                    'loss':['linear', 'square']}}

In [72]:

class EstimatorSelectionHelper:

    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X,y)
            self.grid_searches[key] = gs    

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params,**d})

        rows = []
        for k in self.grid_searches:
#             print(k)
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            for i in range(self.grid_searches[k].cv):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]        
                scores.append(r.reshape(len(params),1))

            all_scores = np.hstack(scores)
            for p, s in zip(params,all_scores):
                rows.append((row(k, s, p)))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

In [73]:
helper1 = EstimatorSelectionHelper(models_dict, params_dict)

In [74]:
helper1.fit(X_train, y_train)

Running GridSearchCV for LinearRegression.
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Running GridSearchCV for Ridge.
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Running GridSearchCV for Lasso.
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Running GridSearchCV for RandomForestRegressor.
Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   6 out of   6 | elapsed:    0.0s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  24 out of  24 | elapsed:    0.1s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  24 out of  24 | elapsed:    0.1s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  48 out of  48 | elapsed:   13.7s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Running GridSearchCV for GradientBoostingRegressor.
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=3)]: Done  72 out of  72 | elapsed:    3.9s finished
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Running GridSearchCV for AdaBoostRegressor.
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=3)]: Done  50 tasks      | elapsed:    8.8s
[Parallel(n_jobs=3)]: Done  72 out of  72 | elapsed:   11.3s finished


In [76]:
gs_models_result = helper1.score_summary(sort_by='max_score')
gs_models_result

Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,n_jobs,normalize,alpha,bootstrap,max_features,n_estimators,learning_rate,loss
58,AdaBoostRegressor,0.229391,0.266283,0.319419,0.038507,,,,,,100,0.01,linear
62,AdaBoostRegressor,0.230354,0.263364,0.306876,0.0321103,,,,,,100,0.01,square
39,GradientBoostingRegressor,0.238923,0.270062,0.30547,0.0273346,,,,,sqrt,300,0.01,
59,AdaBoostRegressor,0.234708,0.265855,0.303964,0.028699,,,,,,300,0.01,linear
40,GradientBoostingRegressor,0.248236,0.277102,0.303959,0.0227934,,,,,sqrt,500,0.01,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,RandomForestRegressor,-0.245274,-0.113323,0.0175026,0.107281,-1,,,False,auto,100,,
17,Lasso,-0.0218402,-0.0148163,-0.00395691,0.0077883,,False,7,,,,,
16,Lasso,-0.0198224,-0.00971113,-0.00450634,0.00715079,,True,7,,,,,
14,Lasso,-0.0198224,-0.00971113,-0.00450634,0.00715079,,True,3,,,,,


In [78]:
gs_models_result[gs_models_result['max_score'] == max(gs_models_result['max_score'])]

Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,n_jobs,normalize,alpha,bootstrap,max_features,n_estimators,learning_rate,loss
58,AdaBoostRegressor,0.229391,0.266283,0.319419,0.038507,,,,,,100,0.01,linear
