In [1]:
import warnings
warnings.filterwarnings('ignore')
# data cleaning
import numpy as np
import pandas as pd
from collections import Counter
# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

#data preprocessing
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#data modeling
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor

#Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

#model evaluation
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.metrics import confusion_matrix,plot_confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, average_precision_score, f1_score, log_loss
from sklearn.metrics import roc_curve, auc, plot_roc_curve, roc_auc_score, plot_precision_recall_curve

**Read and inspect data**

In [2]:
# Load the Titanic passenger table; the first CSV column becomes the row index.
df = pd.read_csv(
    'titanic_csv.csv',
    index_col=0,
)

In [3]:
# Preview the first rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,home.dest
1,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,"St Louis, MO"
2,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,"Montreal, PQ / Chesterville, ON"
5,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,"Montreal, PQ / Chesterville, ON"


In [4]:
# Column dtypes and non-null counts, to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  home.dest  745 non-null    object 
dtypes: float64(2), int64(4), object(6)
memory usage: 132.9+ KB


In [5]:
# Number of missing values per column before any imputation.
df.isna().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
home.dest     564
dtype: int64

In [6]:
# Fill the missing numeric values (age, fare) by linear interpolation between
# neighbouring rows. Only numeric columns are passed to interpolate(): calling
# it on a frame that still holds object columns (name, cabin, ...) is
# deprecated in recent pandas, and those text columns cannot be interpolated
# anyway. Work on a copy so the previously bound frame is not mutated in place.
# NOTE(review): interpolation follows row order, which is not obviously
# meaningful for age -- consider median or group-wise imputation instead.
numeric_cols = df.select_dtypes(include='number').columns
df = df.copy()
df[numeric_cols] = df[numeric_cols].interpolate()

In [7]:
# Re-count missing values per column to confirm the numeric gaps are filled.
df.isna().sum()

pclass          0
survived        0
name            0
sex             0
age             0
sibsp           0
parch           0
ticket          0
fare            0
cabin        1014
embarked        2
home.dest     564
dtype: int64

In [8]:
# Class balance of the target column (how many rows per survived value).
df.survived.value_counts()

0    809
1    500
Name: survived, dtype: int64

# Random forest w/o feature selection

**1. Choose features**

In [9]:
# Predictor columns for the baseline model.
predictor_cols = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
x = df.loc[:, predictor_cols]
# One-hot encode the two text columns; numeric columns pass through unchanged.
x = pd.get_dummies(x, columns=['sex', 'embarked'])

In [10]:
# Target vector: the survived column.
y = df.loc[:, 'survived']

**2.Split data**

In [11]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=123,
)

**3. Train model**

In [12]:
# Baseline forest. Settings used here:
#   n_estimators=100 -> number of trees in the forest
#   oob_score=True   -> also compute an out-of-bag accuracy estimate during fit
#   n_jobs=-1        -> use all available CPU cores to speed up training
#   random_state=42  -> fixed seed so results are repeatable
# Other knobs that affect predictive power are left at their defaults:
# max_features, min_samples_leaf.
model1 = RandomForestClassifier(
    n_estimators=100,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)

In [13]:
# Fit the baseline forest on the training split.
model1.fit(x_train,y_train)

RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=42)

In [14]:
# Out-of-bag score, available because model1 was built with oob_score=True.
model1.oob_score_

0.7816593886462883

**4. Evaluate**

In [15]:
# Accuracy on the training split (compare with the test score to gauge overfitting).
model1.score(x_train,y_train)

0.9847161572052402

In [16]:
# Accuracy on the held-out test split.
model1.score(x_test,y_test)

0.8142493638676844

# Random forest with feature selection

In [17]:
# Rank the encoded columns by model1's feature importances, largest first.
importance_by_column = pd.Series(data=model1.feature_importances_, index=x.columns)
feature_importance = importance_by_column.sort_values(ascending=False)
feature_importance

age           0.293862
fare          0.258022
sex_male      0.137798
sex_female    0.117228
pclass        0.071713
sibsp         0.043912
parch         0.039377
embarked_C    0.019208
embarked_S    0.013159
embarked_Q    0.005721
dtype: float64

**1. Choose features**

In [18]:
# Keep only the columns whose importance exceeds 0.1.
is_important = feature_importance > 0.1
feature = feature_importance[is_important].index.tolist()
feature

['age', 'fare', 'sex_male', 'sex_female']

In [19]:
# Reduced design matrix containing just the selected columns.
x2 = x.loc[:, feature]
x2

Unnamed: 0,age,fare,sex_male,sex_female
1,29.0000,211.3375,0,1
2,0.9167,151.5500,1,0
3,2.0000,151.5500,0,1
4,30.0000,151.5500,1,0
5,25.0000,151.5500,0,1
...,...,...,...,...
1305,14.5000,14.4542,0,1
1306,20.5000,14.4542,0,1
1307,26.5000,7.2250,1,0
1308,27.0000,7.2250,1,0


**2.Split data**

In [20]:
# Split the reduced feature set with the same test_size and seed as before.
# Note that y_train / y_test are re-created here under the same names.
x2_train, x2_test, y_train, y_test = train_test_split(
    x2,
    y,
    test_size=0.3,
    random_state=123,
)

**3. Train model**

In [21]:
# Same forest configuration as the baseline, to be trained on the reduced features.
model2 = RandomForestClassifier(
    n_estimators=100,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)

In [22]:
# Fit the second forest on the reduced-feature training split.
model2.fit(x2_train,y_train)

RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=42)

In [23]:
# Out-of-bag score, available because model2 was built with oob_score=True.
model2.oob_score_

0.7663755458515283

**4.Evaluate**

In [24]:
# Accuracy on the reduced-feature training split.
model2.score(x2_train,y_train)

0.9836244541484717

In [25]:
# Accuracy on the reduced-feature test split.
model2.score(x2_test,y_test)
# Model performance does not decrease much even with fewer features.

0.7862595419847328

# Hyperparameter tuning for Random Forest using GridSearchCV 

In [26]:
# Untuned forest that serves as the base estimator for the grid search.
model3 = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
)

In [27]:
# Hyperparameter grid to search: 5 x 6 x 3 = 90 combinations.
params = dict(
    max_depth=[2, 3, 5, 10, 20],                  # maximum depth of each tree
    min_samples_leaf=[5, 10, 20, 50, 100, 200],   # minimum samples required in a leaf
    n_estimators=[50, 100, 200],                  # number of trees
)

In [28]:
# Exhaustive search over `params`, scored by accuracy with 4-fold cross-validation.
grid_search = GridSearchCV(
    estimator=model3,
    param_grid=params,
    scoring="accuracy",
    cv=4,
    n_jobs=-1,
    verbose=1,
)

In [29]:
# Run the grid search on the reduced-feature training split only.
grid_search.fit(x2_train, y_train)

Fitting 4 folds for each of 90 candidates, totalling 360 fits


GridSearchCV(cv=4, estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
             n_jobs=-1,
             param_grid={'max_depth': [2, 3, 5, 10, 20],
                         'min_samples_leaf': [5, 10, 20, 50, 100, 200],
                         'n_estimators': [50, 100, 200]},
             scoring='accuracy', verbose=1)

In [30]:
# Cross-validated accuracy of the best parameter combination found.
grid_search.best_score_

0.7751091703056769

In [31]:
# Estimator configured with the best parameter combination found.
grid_search.best_estimator_

RandomForestClassifier(max_depth=5, min_samples_leaf=10, n_estimators=200,
                       n_jobs=-1, random_state=42)

In [32]:
# Rebuild the tuned forest directly from the grid-search winner instead of
# re-typing its hyperparameters by hand, so this cell cannot drift out of sync
# with the search results if the data or the grid changes.
# oob_score=True is added so an out-of-bag score is computed on fit; n_jobs and
# random_state match the estimator that was searched.
model3 = RandomForestClassifier(
    **grid_search.best_params_,
    n_jobs=-1,
    random_state=42,
    oob_score=True,
)

In [33]:
# Fit the tuned forest on the reduced-feature training split.
model3.fit(x2_train,y_train)

RandomForestClassifier(max_depth=5, min_samples_leaf=10, n_estimators=200,
                       n_jobs=-1, oob_score=True, random_state=42)

In [34]:
# Out-of-bag score, available because model3 was built with oob_score=True.
model3.oob_score_

0.7729257641921398

In [35]:
# Accuracy of the tuned forest on the training split.
model3.score(x2_train,y_train)

0.7980349344978166

In [36]:
# Accuracy of the tuned forest on the held-out test split.
model3.score(x2_test,y_test)

0.8015267175572519