In [51]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [98]:
df = pd.read_csv('data/titanic/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [99]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [100]:
df = pd.get_dummies(df, columns=["Pclass", "Sex", "Embarked"], drop_first=True)

In [101]:
df.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,0,1,1,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,0,0,0,0,0
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0,1,0,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,0,0,0,0,1
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,0,1,1,0,1


In [102]:
df = df.dropna(subset=["Age"])

In [103]:
x = df.drop(["PassengerId", "Survived", "Name", "Ticket", "Cabin", "Fare", "Parch"], axis=1)
y = df["Survived"].values
x.shape, y.shape

((714, 7), (714,))

In [104]:
x.head()

Unnamed: 0,Age,SibSp,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
0,22.0,1,0,1,1,0,1
1,38.0,1,0,0,0,0,0
2,26.0,0,0,1,0,0,1
3,35.0,1,0,0,0,0,1
4,35.0,0,0,1,1,0,1


In [105]:
age_scaler = StandardScaler()
x.Age = age_scaler.fit_transform(x.Age.values.reshape(-1, 1))
x.head()

Unnamed: 0,Age,SibSp,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
0,-0.530377,1,0,1,1,0,1
1,0.571831,1,0,0,0,0,0
2,-0.254825,0,0,1,0,0,1
3,0.365167,1,0,0,0,0,1
4,0.365167,0,0,1,1,0,1


In [106]:
# sib_sp_scaler = StandardScaler()
# x.SibSp = sib_sp_scaler.fit_transform(x.SibSp.values.reshape(-1,1))
# x.head()

In [107]:
x.isna().sum()

Age           0
SibSp         0
Pclass_2      0
Pclass_3      0
Sex_male      0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [108]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [109]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [110]:
from sklearn.model_selection import GridSearchCV
gsc_rf_model = GridSearchCV(RandomForestClassifier(), {'max_depth': range(1,10)})
gsc_rf_model.fit(x_train, y_train)
gsc_rf_model.best_score_, gsc_rf_model.best_params_

(0.8354538520213577, {'max_depth': 5})

In [111]:
rfc_model = gsc_rf_model.best_estimator_

In [112]:
rfc_model.fit(x_train, y_train)
# rfc_model.get_params()

RandomForestClassifier(max_depth=5)

In [113]:
from sklearn.metrics import classification_report
print("Test")
print("----")
print(classification_report(y_test, rfc_model.predict(x_test)))
print()
print("Train")
print("----")
print(classification_report(y_train, rfc_model.predict(x_train)))
print()
print("All")
print("----")
print(classification_report(y, rfc_model.predict(x)))

Test
----
              precision    recall  f1-score   support

           0       0.79      0.91      0.84        87
           1       0.81      0.62      0.71        56

    accuracy                           0.80       143
   macro avg       0.80      0.77      0.78       143
weighted avg       0.80      0.80      0.79       143


Train
----
              precision    recall  f1-score   support

           0       0.82      0.95      0.88       337
           1       0.90      0.71      0.79       234

    accuracy                           0.85       571
   macro avg       0.86      0.83      0.84       571
weighted avg       0.85      0.85      0.84       571


All
----
              precision    recall  f1-score   support

           0       0.82      0.94      0.87       424
           1       0.88      0.69      0.78       290

    accuracy                           0.84       714
   macro avg       0.85      0.81      0.82       714
weighted avg       0.84      0.84      0.8

In [68]:
import pickle
import time

In [69]:
fp = open(f"data/models/rfc_5_{time.time()}", "wb")
pickle.dump(rfc_model, fp)
fp.close()
fp = open(f"data/scaler/age_scaler_{time.time()}", "wb")
pickle.dump(age_scaler, fp)
fp.close()

In [70]:
with open("data/scaler/age_scaler", "rb") as fp:
    a_scaler = pickle.load(fp)