In [38]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

from fetch_data import TitanicData, ModelInput

## Fetch Data

In [2]:
# Create the project's Titanic data helper and fetch the dataset.
# Later cells read the working DataFrame from t.data
# (presumably populated by fetch_data — confirm in fetch_data.py).
t = TitanicData()
t.fetch_data()

In [3]:
# Peek at the first rows before any feature engineering is applied.
t.data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Feature Engineering: First Impressions
New columns representing:
- Fill null Age values with average by Sex, Embarked
- Convert Pclass, Sex, SibSp, and Parch to categorical (get_dummies)
- CabinFlag: 1 or 0
- Section: extract the cabin section (the leading letter of Cabin) and make it categorical
- LastName
- CabinOccupants
- Scale (after train-test-split)
- NLP on Name: CountVectorizer and/or TFIDF


In [4]:
# Run the feature-engineering helpers defined on TitanicData.
# Judging by the frames displayed in later cells, these add the LastName and
# CabinFlag columns and a Section column (one-hot encoded in the next cell).
# fill_age presumably imputes missing Age values — confirm in fetch_data.py.
t.fill_age()
t.get_last_name()
t.get_section()
t.add_cabin_flag()


In [5]:
# One-hot encode each categorical column, in the same order as before.
categorical_columns = ('Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'Section')
for column_name in categorical_columns:
    t.make_categorical(column_name)

In [6]:
# Inspect the frame after encoding: the original categorical columns are
# replaced by indicator columns such as Pclass_2, Sex_male and Section_C.
t.data.head()

Unnamed: 0,PassengerId,Survived,Name,Age,Ticket,Fare,Cabin,LastName,CabinFlag,Pclass_2,...,Parch_6,Embarked_Q,Embarked_S,Section_B,Section_C,Section_D,Section_E,Section_F,Section_G,Section_T
0,1,0,"Braund, Mr. Owen Harris",22.0,A/5 21171,7.25,,Braund,0,0,...,0,0,1,0,0,0,0,0,0,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,PC 17599,71.2833,C85,Cumings,1,0,...,0,0,0,0,1,0,0,0,0,0
2,3,1,"Heikkinen, Miss. Laina",26.0,STON/O2. 3101282,7.925,,Heikkinen,0,0,...,0,0,1,0,0,0,0,0,0,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,113803,53.1,C123,Futrelle,1,0,...,0,0,1,0,1,0,0,0,0,0
4,5,0,"Allen, Mr. William Henry",35.0,373450,8.05,,Allen,0,0,...,0,0,1,0,0,0,0,0,0,0


In [7]:
# List every column present after feature engineering and encoding.
t.data.columns

Index(['PassengerId', 'Survived', 'Name', 'Age', 'Ticket', 'Fare', 'Cabin',
       'LastName', 'CabinFlag', 'Pclass_2', 'Pclass_3', 'Sex_male', 'SibSp_1',
       'SibSp_2', 'SibSp_3', 'SibSp_4', 'SibSp_5', 'SibSp_8', 'Parch_1',
       'Parch_2', 'Parch_3', 'Parch_4', 'Parch_5', 'Parch_6', 'Embarked_Q',
       'Embarked_S', 'Section_B', 'Section_C', 'Section_D', 'Section_E',
       'Section_F', 'Section_G', 'Section_T'],
      dtype='object')

## Build Input for Model
Select the features used to train the model, then perform a train-test split and scale the numerical features.

In [31]:
# Wrap the engineered frame in the project's ModelInput helper.
mi = ModelInput(t.data)

# Choose the feature columns, then split with test_size=0.2.
mi.set_features()
mi.train_test_split(test_size=0.2)
# Scaling is called after the split, matching the plan in the
# feature-engineering notes ("Scale (after train-test-split)").
mi.scale()

In [32]:
# Selected feature columns plus the Survived target; Age and Fare still
# appear in their original units here.
mi.model_input.head()

Unnamed: 0,Age,Fare,Pclass_2,Pclass_3,Sex_male,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,...,Embarked_Q,Embarked_S,Section_B,Section_C,Section_D,Section_E,Section_F,Section_G,Section_T,Survived
0,22.0,7.25,0,1,1,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,38.0,71.2833,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,26.0,7.925,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3,35.0,53.1,0,0,0,1,0,0,0,0,...,0,1,0,1,0,0,0,0,0,1
4,35.0,8.05,0,1,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [33]:
# Training features after the split and scaling step: rows keep their
# original index labels, and Survived is no longer among the columns.
mi.X_train.head()

Unnamed: 0,Age,Fare,Pclass_2,Pclass_3,Sex_male,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,...,Parch_6,Embarked_Q,Embarked_S,Section_B,Section_C,Section_D,Section_E,Section_F,Section_G,Section_T
329,0.195778,0.113168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
749,0.384267,0.015127,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
203,0.566474,0.014102,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
421,0.258608,0.015094,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,0.28374,0.123667,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Grid Search & Random Forest Classifier
Use GridSearchCV to iteratively train several Random Forest Classifiers, find the best parameters, and make predictions.

In [51]:
# Hyperparameter grid: 4 forest sizes x 4 depth limits = 16 candidates.
params = {'n_estimators': [100, 300, 1000, 3000], 'max_depth': [5, 7, 9, 10]}

# Base estimator; random_state is pinned so reruns are reproducible.
rf = RandomForestClassifier(criterion='gini', 
                            n_jobs=2, 
                            random_state=123)

# No cv or scoring argument is passed, so GridSearchCV uses its defaults;
# the log below reports 5 folds per candidate (80 fits in total).
gs = GridSearchCV(rf, params, verbose=1)
gs.fit(mi.X_train, mi.y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  3.0min finished


GridSearchCV(estimator=RandomForestClassifier(n_jobs=2, random_state=123),
             param_grid={'max_depth': [5, 7, 9, 10],
                         'n_estimators': [100, 300, 1000, 3000]},
             verbose=1)

In [52]:
# Show the raw cross-validation results: a dict of arrays with timings,
# the parameters tried and their ranked test scores for each candidate.
gs.cv_results_

{'mean_fit_time': array([0.24700112, 0.60831957, 1.73202462, 4.9161572 , 0.24756355,
        0.60513272, 1.76708574, 4.97200866, 0.24777331, 0.61082258,
        1.81785836, 5.02494483, 0.25019569, 0.6234942 , 1.79831243,
        5.07481794]),
 'std_fit_time': array([0.01688689, 0.04351045, 0.06559815, 0.12379242, 0.02181844,
        0.01678572, 0.02599535, 0.08235507, 0.01066843, 0.01947451,
        0.04593343, 0.07295179, 0.01320786, 0.03051452, 0.01640661,
        0.04936036]),
 'mean_score_time': array([0.10499668, 0.10479164, 0.30663881, 0.70827398, 0.1048532 ,
        0.10462198, 0.30605283, 0.70879445, 0.10489755, 0.10441933,
        0.30586066, 0.70811167, 0.10482368, 0.10482831, 0.30612745,
        0.70796914]),
 'std_score_time': array([0.00023869, 0.0004671 , 0.00093944, 0.00085783, 0.00019337,
        0.00048141, 0.00017092, 0.00217867, 0.00028558, 0.00025461,
        0.00034137, 0.00052131, 0.00025868, 0.00031277, 0.00061251,
        0.00039525]),
 'param_max_depth': masked

In [53]:
# The fitted search already records the winning candidate (refit is on by
# default), so read it directly instead of re-deriving it from cv_results_.
# best_index_ is the cv_results_ position of the top-ranked candidate, and
# best_params_ is the matching entry of cv_results_['params'].
best_idx = gs.best_index_
best_params = gs.best_params_
print(f'Best Params: {best_params}')

Best Params: {'max_depth': 9, 'n_estimators': 1000}


## Apply Optimized Parameters to Classifier

In [54]:
# Gather the tuned hyperparameters together with the fixed settings used
# during the search, then build a fresh classifier from them.
rf_settings = {
    'n_estimators': best_params['n_estimators'],
    'max_depth': best_params['max_depth'],
    'criterion': 'gini',
    'n_jobs': 2,
    'random_state': 123,
}
rf = RandomForestClassifier(**rf_settings)


In [55]:
# Train on the training split and predict the held-out split in one chain
# (fit returns the fitted estimator itself).
y_pred = rf.fit(mi.X_train, mi.y_train).predict(mi.X_test)

In [56]:
# Compare held-out labels with predictions. In scikit-learn's layout,
# rows are the true classes and columns are the predicted classes.
confusion_matrix(mi.y_test, y_pred)

array([[104,  10],
       [ 15,  50]])

# Unseen Test Data

In [58]:
# Load the unlabeled test set. pandas is imported in the notebook's top
# import cell, so this cell only reads the file.
test = pd.read_csv('../data/test.csv')

In [59]:
# Preview the unseen test set; unlike the training data, it has no
# Survived column.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [61]:
# Check dtypes and non-null counts; the summary below shows missing values
# in Age, Fare and Cabin.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


# Second Impressions
- AgeBin
- Modify SibSp and Parch to categorical
- FareBin

In [None]:
# Count distinct Cabin values, treating missing (NaN) as one value of its own.
t.data['Cabin'].nunique(dropna=False)