In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [6]:
!ls /Users/paxton615/database/resource-datasets/titanic

titanic.csv       titanic_clean.csv titanic_test.csv  titanic_train.csv


In [7]:
titanic = pd.read_csv('/Users/paxton615/database/\
resource-datasets/titanic/titanic.csv')

In [9]:
titanic.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [10]:
x = titanic[['Pclass', 'Sex', 'Age', 'SibSp','Fare', ]]
y = titanic['Survived']

In [13]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  891 non-null    int64  
 1   Sex     891 non-null    object 
 2   Age     891 non-null    float64
 3   SibSp   891 non-null    int64  
 4   Fare    891 non-null    float64
dtypes: float64(2), int64(2), object(1)
memory usage: 34.9+ KB


In [12]:
x.Age = x.Age.fillna(x.Age.mean())

#### get_dummies( )

In [14]:
x = pd.get_dummies(x, prefix_sep='_', drop_first=True)

In [15]:
x.head() # quicker than to_dict() + DictVectorizer

Unnamed: 0,Pclass,Age,SibSp,Fare,Sex_male
0,3,22.0,1,7.25,1
1,1,38.0,1,71.2833,0
2,3,26.0,0,7.925,0
3,1,35.0,1,53.1,0
4,3,35.0,0,8.05,1


#### train_test_split( )

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
x_train, x_test, y_train, y_test = train_test_split(x, y,\
                                        random_state=17)

#### modeling

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
rfc = RandomForestClassifier(n_estimators=10, criterion='gini',\
        max_depth=9, max_features='auto')

In [20]:
rfc.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=9, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [21]:
print('train_score:', rfc.score(x_train, y_train))
print('test_score:', rfc.score(x_test, y_test))

train_score: 0.9191616766467066
test_score: 0.8385650224215246


#### GridSearchCV

In [22]:
from sklearn.model_selection import GridSearchCV

In [26]:
param_dict = {
              'criterion':['gini','entropy'],
             'max_depth':[3,6,7,8,9,12],
             'max_features':['auto','log2'],
             'n_estimators':[50,100,200,300]}

In [27]:
gscv = GridSearchCV(RandomForestClassifier(),
                    param_grid=param_dict, cv = 5)

In [28]:
gscv.fit(x_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [29]:
print('gscv_score:', gscv.score(x_test,y_test))

gscv_score: 0.8251121076233184


In [30]:
print('best_params:', gscv.best_params_)

best_params: {'bootstrap': 'False', 'criterion': 'gini', 'max_depth': 9, 'max_features': 'auto', 'n_estimators': 100}


In [31]:
print('best_score:', gscv.best_score_)

best_score: 0.8428122545168893


In [32]:
print('best_estimator:',gscv.best_estimator_)

best_estimator: RandomForestClassifier(bootstrap='False', ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=9, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


In [33]:
gscv.best_index_

171

#### Finalize modeling

In [34]:
rfc = RandomForestClassifier(bootstrap='False', n_estimators=100,
                            max_depth=9, criterion='gini',
                            max_features='auto')

In [35]:
rfc.fit(x_train, y_train)

RandomForestClassifier(bootstrap='False', ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=9, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [37]:
print('train_score:', rfc.score(x_train, y_train))
print('test_score:', rfc.score(x_test, y_test))

train_score: 0.9326347305389222
test_score: 0.8340807174887892


#### cross validation

In [38]:
from sklearn.model_selection import cross_val_score

In [None]:
rfc = RandomForestClassifier(bootstrap='False', n_estimators=100,
                            max_depth=9, criterion='gini',
                            max_features='auto')

In [39]:
cv_score = cross_val_score(rfc, x_train, y_train, cv=10)
print(); print(cv_score)
print(); print(cv_score.mean())
print(); print(cv_score.std())


[0.74626866 0.86567164 0.85074627 0.86567164 0.88059701 0.89552239
 0.80597015 0.8358209  0.81818182 0.83333333]

0.8397783808231569

0.04079883404306287


In [40]:
cv_score = cross_val_score(rfc, x_test, y_test, cv=10)
print(); print(cv_score)
print(); print(cv_score.mean())
print(); print(cv_score.std())


[0.69565217 0.60869565 0.7826087  0.68181818 0.77272727 0.81818182
 0.81818182 0.72727273 0.77272727 0.81818182]

0.749604743083004

0.06642815499917124


#### cv with bootstrap changed to True

In [41]:
rfc = RandomForestClassifier(bootstrap=True, n_estimators=100,
                            max_depth=9, criterion='gini',
                            max_features='auto')

In [42]:
cv_score = cross_val_score(rfc, x_train, y_train, cv=10)
print(); print(cv_score)
print(); print(cv_score.mean())
print(); print(cv_score.std())


[0.74626866 0.85074627 0.8358209  0.88059701 0.88059701 0.88059701
 0.80597015 0.85074627 0.81818182 0.84848485]

0.8398009950248756

0.039603039648859285


In [43]:
cv_score = cross_val_score(rfc, x_test, y_test, cv=10)
print(); print(cv_score)
print(); print(cv_score.mean())
print(); print(cv_score.std())


[0.69565217 0.60869565 0.7826087  0.68181818 0.77272727 0.81818182
 0.77272727 0.72727273 0.77272727 0.77272727]

0.7405138339920948

0.05913055367661527


### GridSearchCV 2

In [44]:
param_dict = {
              'criterion':['gini','entropy'],
             'max_depth':[3,6,7,8,9,12],
             'max_features':['auto','log2'],
             'n_estimators':[50,100,200,300]}

In [45]:
gscv = GridSearchCV(RandomForestClassifier(),
                    param_grid=param_dict, cv = 10)

In [46]:
gscv.fit(x_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rand

# took 3 mins to calculate

In [29]:
print('gscv_score:', gscv.score(x_test,y_test))

gscv_score: 0.8251121076233184


In [47]:
print('gscv_score2:', gscv.score(x_test,y_test))

gscv_score2: 0.8251121076233184


In [30]:
print('best_params:', gscv.best_params_)

best_params: {'bootstrap': 'False', 'criterion': 'gini', 'max_depth': 9, 'max_features': 'auto', 'n_estimators': 100}


In [48]:
print('best_params2:', gscv.best_params_)

best_params2: {'criterion': 'entropy', 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 300}


In [31]:
print('best_score:', gscv.best_score_)

best_score: 0.8428122545168893


In [49]:
print('best_score2:', gscv.best_score_)

best_score2: 0.8443238353686114
