### Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:

In [1]:
import numpy as np
import pandas as pd

from pydataset import data

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

import matplotlib.pyplot as plt
import seaborn as sns

import graphviz
from graphviz import Graph

import acquire
import prepare

In [2]:
df = acquire.get_titanic_data()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [3]:
df.isna().sum()

passenger_id      0
survived          0
pclass            0
sex               0
age             177
sibsp             0
parch             0
fare              0
embarked          2
class             0
deck            688
embark_town       2
alone             0
dtype: int64

In [None]:
# turn sex strings into booleans
df["is_female"] = df.sex == "Female"

#and encode the embark_town variable as there are 4 options
embarked_dummies = pd.get_dummies(df.embarked, prefix='Embarked', drop_first=True)
embarked_dummies

#dummy_df = pd.get_dummies(df[["embarked"]], drop_first=True)
#dummy_df

In [None]:
df = pd.concat([df, embarked_dummies], axis=1)
df.head()

In [None]:
class_dummies = pd.get_dummies(df.pclass, prefix='class', drop_first=True)
class_dummies

In [None]:
df = pd.concat([df, class_dummies], axis=1)
df.head()

In [None]:
df = df.drop(columns=['deck', 'age', 'embark_town', 'passenger_id', 'embarked', 'sex', 'pclass', 'class'])
df.info()

In [None]:
df.head()

In [None]:
train_validate, test = train_test_split(df, test_size=0.2, random_state=3210, stratify=df.survived)
train, validate = train_test_split(train_validate, train_size=0.7, random_state=3210, stratify=train_validate.survived)

In [None]:
train.shape, validate.shape, test.shape

In [3]:
train, validate, test = prepare.prep_titanic(df)

In [4]:
train.shape, validate.shape, test.shape

((498, 10), (214, 10), (179, 10))

# 1. What is your baseline prediction? What is your baseline accuracy? 
remember: your baseline prediction for a classification problem is predicting the **most prevelant class** in the training dataset (*the mode*). 

When you make those predictions, 
- what is your accuracy? 
    - This is your baseline accuracy.

In [5]:
df.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

In [6]:
train['baseline_prediction'] = 0

In [7]:
train.head()

Unnamed: 0,survived,sibsp,parch,fare,alone,is_female,Embarked_Q,Embarked_S,class_2,class_3,baseline_prediction
429,1,0,0,8.05,1,False,0,1,0,1,0
465,0,0,0,7.05,1,False,0,1,0,1,0
157,0,0,0,8.05,1,False,0,1,0,1,0
212,0,0,0,7.25,1,False,0,1,0,1,0
324,0,8,2,69.55,0,False,0,1,0,1,0


In [8]:
train.survived.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [9]:
baseline_accuracy = (train.survived == train.baseline_prediction).mean()
baseline_accuracy

0.6164658634538153

In [11]:
train = train.drop(columns=['baseline_prediction'])

In [12]:
# Setup our X inputs and y target variable for each split
X_train = train.drop(columns=['survived'])
y_train = train.survived # labeled data == supervise algorithm

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [13]:
train.head()

Unnamed: 0,survived,sibsp,parch,fare,alone,is_female,Embarked_Q,Embarked_S,class_2,class_3
429,1,0,0,8.05,1,False,0,1,0,1
465,0,0,0,7.05,1,False,0,1,0,1
157,0,0,0,8.05,1,False,0,1,0,1
212,0,0,0,7.25,1,False,0,1,0,1
324,0,8,2,69.55,0,False,0,1,0,1


# 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [14]:
# Let's generate a blank, new Decision Tree model
# Be sure to set the max_depth argument
# clf = DecisionTreeClassifier(max_depth=3, random_state=123)

clf = DecisionTreeClassifier(max_depth=3, random_state=3210)

In [15]:
# Now let's train our model on the training data
# fitting == training the model
clf = clf.fit(X_train, y_train)
clf

DecisionTreeClassifier(max_depth=3, random_state=3210)

In [16]:
df.survived.unique()

array([0, 1])

In [15]:
# To programmatically specify the output class labels
# Visualize the model so it can explain itself!
# dataframe.target_variable.unique() then sort the array

dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True, format="pdf")

'titanic_decision_tree.pdf'

# 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [17]:

# Now we'll make a SET OF PREDICTIONS using this trained model
y_pred = clf.predict(X_train)
y_pred[0:3]

array([0, 0, 0])

In [18]:
# ESTIMATE THE PROBABLITIES for each class
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:3]

array([[0.85106383, 0.14893617],
       [0.85106383, 0.14893617],
       [0.85106383, 0.14893617]])

In [19]:
# Model Score = 70%
# Baseline = 62%
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.70


In [20]:
pd.crosstab(y_train, y_pred)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,289,18
1,132,59


In [21]:
(289 + 59) / (289 + 18 + 132 + 59)

0.6987951807228916

In [22]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.94      0.79       307
           1       0.77      0.31      0.44       191

    accuracy                           0.70       498
   macro avg       0.73      0.63      0.62       498
weighted avg       0.72      0.70      0.66       498



# 4. Compute: 
- Accuracy, 
- true positive rate, 
- false positive rate, 
- true negative rate, 
- false negative rate, 
- precision, 
- recall, 
- f1-score, and 
- support.

In [23]:
# Accuracy
accuracy_score(y_train, y_pred)

0.6987951807228916

In [24]:
# true positive rate (TP/TP+FN)
(289) / (289 + 18)

0.9413680781758957

In [25]:
# false positive rate (FP/FP+TN)
(132) / (59 + 132)

0.6910994764397905

In [26]:
# true negative rate (TN/TN+FP)
(59) / (59 + 132)

0.3089005235602094

In [27]:
# false negative rate (FN/FN+TP)
(18) / (18 + 132)

0.12

In [28]:
#precision
precision_score(y_train, y_pred)

0.7662337662337663

In [29]:
# recall
recall_score(y_train, y_pred)

0.3089005235602094

In [30]:
# F1 Score
f1_score(y_train, y_pred, average=None)

array([0.79395604, 0.44029851])

In [31]:
# Support
#307

# 5. Run through steps 2-4 using a different `max_depth` value.

In [32]:
clf1 = DecisionTreeClassifier(max_depth=5, random_state=3210)

In [33]:
clf1 = clf1.fit(X_train, y_train)
clf1

DecisionTreeClassifier(max_depth=5, random_state=3210)

In [34]:
dot_data = export_graphviz(clf1, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree1', view=True, format="pdf")

'titanic_decision_tree1.pdf'

In [35]:
y_pred1 = clf1.predict(X_train)
y_pred1[0:3]

array([0, 0, 0])

In [36]:
y_pred_proba1 = clf1.predict_proba(X_train)
y_pred_proba1[0:3]

array([[0.8245614 , 0.1754386 ],
       [0.87931034, 0.12068966],
       [0.8245614 , 0.1754386 ]])

In [37]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf1.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.75


In [38]:
pd.crosstab(y_train, y_pred1)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,284,23
1,102,89


In [39]:
conf = confusion_matrix(y_train, y_pred)
conf

array([[289,  18],
       [132,  59]])

In [40]:
print(classification_report(y_train, y_pred1))

              precision    recall  f1-score   support

           0       0.74      0.93      0.82       307
           1       0.79      0.47      0.59       191

    accuracy                           0.75       498
   macro avg       0.77      0.70      0.70       498
weighted avg       0.76      0.75      0.73       498



In [41]:
conf_df = pd.DataFrame(conf, columns=['pred_death', 'pred_survive'], index=['actual_death', 'actual_survive'])
conf_df

Unnamed: 0,pred_death,pred_survive
actual_death,289,18
actual_survive,132,59


In [42]:
rubric_df = pd.DataFrame([['TN', 'FP'], ['FN', 'TP']], columns=['pred_death', 'pred_survive'], index=['actual_death', 'actual_survive'])
rubric_df

Unnamed: 0,pred_death,pred_survive
actual_death,TN,FP
actual_survive,FN,TP


In [43]:
#create a function to calculate the metrics
def get_metrics_binary(clf):
    accuracy = clf.score(X_train, y_train)
    class_report = pd.DataFrame(classification_report(y_train, y_pred1, output_dict=True)).T
    tpr = conf[1][1] / conf[1].sum()
    fpr = conf[0][1] / conf[0].sum()
    tnr = conf[0][0] / conf[0].sum()
    fnr = conf[1][0] / conf[1].sum()
    print(f'''
    The accuracy for our model is: {accuracy:.4}
    The True Positive Rate is: {tpr:.3}
    The False Positive Rate is: {fpr:.3}
    The True Negative Rate is: {tnr:.3}
    The False Negative Rate is: {fnr:.3}
    
    
    ''')
    return class_report

In [44]:
get_metrics_binary(clf1)


    The accuracy for our model is: 0.749
    The True Positive Rate is: 0.309
    The False Positive Rate is: 0.0586
    The True Negative Rate is: 0.941
    The False Negative Rate is: 0.691
    
    
    


Unnamed: 0,precision,recall,f1-score,support
0,0.735751,0.925081,0.819625,307.0
1,0.794643,0.465969,0.587459,191.0
accuracy,0.748996,0.748996,0.748996,0.748996
macro avg,0.765197,0.695525,0.703542,498.0
weighted avg,0.758338,0.748996,0.730581,498.0


# 6. Which model performs better on your in-sample data?

In [None]:
#second model is better for all but recall and f1-score

# 7. Which model performs best on your out-of-sample data, the `validate` set?

In [45]:
validate.head()

Unnamed: 0,survived,sibsp,parch,fare,alone,is_female,Embarked_Q,Embarked_S,class_2,class_3
401,0,0,0,8.05,1,False,0,1,0,1
661,0,0,0,7.225,1,False,0,0,0,1
246,0,0,0,7.775,1,False,0,1,0,1
316,1,1,0,26.0,0,False,0,1,1,0
222,0,0,0,8.05,1,False,0,1,0,1


In [46]:
validate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214 entries, 401 to 500
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   survived    214 non-null    int64  
 1   sibsp       214 non-null    int64  
 2   parch       214 non-null    int64  
 3   fare        214 non-null    float64
 4   alone       214 non-null    int64  
 5   is_female   214 non-null    bool   
 6   Embarked_Q  214 non-null    uint8  
 7   Embarked_S  214 non-null    uint8  
 8   class_2     214 non-null    uint8  
 9   class_3     214 non-null    uint8  
dtypes: bool(1), float64(1), int64(4), uint8(4)
memory usage: 11.1 KB


In [47]:
validate.shape

(214, 10)

In [48]:
# true positive rate
recall_score(y_train, y_pred)

0.3089005235602094

In [51]:
# clf was trained on X_train, y_train
# To evaluate the model trained on new data, the arguments coming into .score()
clf.score(X_validate, y_validate)

0.6822429906542056

In [52]:

# Let's evaluate this model on out-of-sample data
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.68


In [53]:
# Use the classification model trained on train data to make predictions on validate data
y_pred = clf.predict(X_validate)
y_pred[0:3]

array([0, 0, 0])

In [54]:
y_validate.head(3)

401    0
661    0
246    0
Name: survived, dtype: int64

In [55]:
# Compare actual y values from validate to predictions based on X_validate
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.92      0.78       132
           1       0.69      0.30      0.42        82

    accuracy                           0.68       214
   macro avg       0.69      0.61      0.60       214
weighted avg       0.69      0.68      0.64       214



In [56]:
# clf was trained on X_train, y_train
# To evaluate the model trained on new data, the arguments coming into .score()
clf1.score(X_validate, y_validate)

0.6962616822429907

In [57]:
# Let's evaluate this model on out-of-sample data
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf1.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.70


In [58]:
# Use the classification model trained on train data to make predictions on validate data
y_pred1 = clf1.predict(X_validate)
y_pred1[0:3]

array([0, 0, 0])

In [59]:
y_validate.head(3)

401    0
661    0
246    0
Name: survived, dtype: int64

In [60]:
# Compare actual y values from validate to predictions based on X_validate
print(classification_report(y_validate, y_pred1))

              precision    recall  f1-score   support

           0       0.70      0.89      0.78       132
           1       0.68      0.39      0.50        82

    accuracy                           0.70       214
   macro avg       0.69      0.64      0.64       214
weighted avg       0.69      0.70      0.67       214



# RANDOM FOREST MODELING
Continue working in your `model` file with `titanic` data to do the following:

In [61]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


from pydataset import data

# read Iris data from pydatset
df = acquire.get_titanic_data()

df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [62]:
train, validate, test = prepare.prep_titanic(df)

In [63]:
train.shape, validate.shape, test.shape

((498, 10), (214, 10), (179, 10))

# 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [None]:
# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

#### Train Validate Test
Now we'll do our train/validate/test split:

- We'll do exploration and train our model on the `train` data

- We tune our model on `validate`, since it will be out-of-sample until we use it.

- And keep the `test` nice and safe and separate, for our final out-of-sample dataset, to see how well our tuned model performs on new data.



In [93]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            n_estimators=100,
                            max_depth=10, 
                            random_state=3210)

In [94]:
# fit the model
rf = rf.fit(X_train, y_train)
rf

RandomForestClassifier(max_depth=10, random_state=3210)

In [95]:
# evaluate importance
print(rf.feature_importances_)

[0.08579331 0.0792792  0.60725724 0.03545905 0.         0.02219781
 0.06296663 0.02769278 0.07935398]


In [96]:
# visualize the importance weight
feature_importances = pd.DataFrame(rf.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
feature_importances

Unnamed: 0,importance
fare,0.607257
sibsp,0.085793
class_3,0.079354
parch,0.079279
Embarked_S,0.062967
alone,0.035459
class_2,0.027693
Embarked_Q,0.022198
is_female,0.0


In [97]:
# make predictions of each feature
y_pred = rf.predict(X_train)

In [98]:
# Estimate probability of each feature
y_pred_proba = rf.predict_proba(X_train)

# 2. Evaluate your results using the model score, confusion matrix, and classification report. 

In [99]:
# commpute acccuracy
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.86


In [100]:
# create confusion matrix
pd.crosstab(y_train, y_pred)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,286,21
1,49,142


In [101]:
confm = confusion_matrix(y_train, y_pred)
confm

array([[286,  21],
       [ 49, 142]])

In [102]:
# classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.93      0.89       307
           1       0.87      0.74      0.80       191

    accuracy                           0.86       498
   macro avg       0.86      0.84      0.85       498
weighted avg       0.86      0.86      0.86       498



# 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [103]:
#create a function to calculate the metrics
def get_metrics(rf):
    accuracy = rf.score(X_train, y_train)
    class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)).T
    tpr = confm[1][1] / confm[1].sum()
    fpr = confm[0][1] / confm[0].sum()
    tnr = confm[0][0] / confm[0].sum()
    fnr = confm[1][0] / confm[1].sum()
    print(f'''
    The accuracy for our model is: {accuracy:.2%}
    The True Positive Rate is: {tpr:.2%}
    The False Positive Rate is: {fpr:.2%}
    The True Negative Rate is: {tnr:.2%}
    The False Negative Rate is: {fnr:.2%}
    
    
    ''')
    return class_report

In [104]:
get_metrics(rf)


    The accuracy for our model is: 85.94%
    The True Positive Rate is: 74.35%
    The False Positive Rate is: 6.84%
    The True Negative Rate is: 93.16%
    The False Negative Rate is: 25.65%
    
    
    


Unnamed: 0,precision,recall,f1-score,support
0,0.853731,0.931596,0.890966,307.0
1,0.871166,0.743455,0.80226,191.0
accuracy,0.859438,0.859438,0.859438,0.859438
macro avg,0.862448,0.837526,0.846613,498.0
weighted avg,0.860418,0.859438,0.856944,498.0


# 4. Run through steps 1-3 increasing your min_samples_leaf and decreasing your max_depth.

In [105]:
#create the Random Forest object , this time adjusting the hyperparameters
# INCREASE min_sample_leaf
# DECREASE max_depth
rf1 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=10,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=3210)

In [107]:
# fit the model
rf1 = rf1.fit(X_train, y_train)
rf1

RandomForestClassifier(max_depth=3, min_samples_leaf=10, random_state=3210)

In [108]:
# visualize the importance weight
feature_importances1 = pd.DataFrame(rf1.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
feature_importances1

Unnamed: 0,importance
fare,0.443418
class_3,0.206298
Embarked_S,0.103158
sibsp,0.077537
alone,0.07689
parch,0.048591
class_2,0.026212
Embarked_Q,0.017896
is_female,0.0


In [109]:
#make predictions
y_pred1 = rf1.predict(X_train)

In [112]:
# estimate probability: 
# since we only have two options (0 or 1)
# we have two columns and 891 rows (passengers)
# each passenger has the probablity of 0 or 1
# under 0.5 = 0 = death
# above 0.1 = 1 = survived

y_pred_proba1 = rf1.predict_proba(X_train)
y_pred_proba1[0:3]

array([[0.78789642, 0.21210358],
       [0.80595569, 0.19404431],
       [0.78789642, 0.21210358]])

In [None]:
# compute accuracy


In [113]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf1.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.71


In [114]:
confm1 = confusion_matrix(y_train, y_pred1)
confm1

array([[267,  40],
       [104,  87]])

In [121]:
#create a function to calculate the metrics
def get_metrics1(rf):
    accuracy = rf.score(X_train, y_train)
    class_report = pd.DataFrame(classification_report(y_train, y_pred1, output_dict=True)).T
    tpr = confm1[1][1] / confm1[1].sum()
    fpr = confm1[0][1] / confm1[0].sum()
    tnr = confm1[0][0] / confm1[0].sum()
    fnr = confm1[1][0] / confm1[1].sum()
    print(f'''
    The accuracy for our model is: {accuracy:.2%}
    The True Positive Rate is: {tpr:.2%}
    The False Positive Rate is: {fpr:.2%}
    The True Negative Rate is: {tnr:.2%}
    The False Negative Rate is: {fnr:.2%}
    
    
    ''')
    return class_report

In [122]:
get_metrics1(rf1)


    The accuracy for our model is: 71.08%
    The True Positive Rate is: 45.55%
    The False Positive Rate is: 13.03%
    The True Negative Rate is: 86.97%
    The False Negative Rate is: 54.45%
    
    
    


Unnamed: 0,precision,recall,f1-score,support
0,0.719677,0.869707,0.787611,307.0
1,0.685039,0.455497,0.54717,191.0
accuracy,0.710843,0.710843,0.710843,0.710843
macro avg,0.702358,0.662602,0.66739,498.0
weighted avg,0.706392,0.710843,0.695393,498.0


# 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [116]:
get_metrics(rf)


    The accuracy for our model is: 85.94%
    The True Positive Rate is: 74.35%
    The False Positive Rate is: 6.84%
    The True Negative Rate is: 93.16%
    The False Negative Rate is: 25.65%
    
    
    


Unnamed: 0,precision,recall,f1-score,support
0,0.853731,0.931596,0.890966,307.0
1,0.871166,0.743455,0.80226,191.0
accuracy,0.859438,0.859438,0.859438,0.859438
macro avg,0.862448,0.837526,0.846613,498.0
weighted avg,0.860418,0.859438,0.856944,498.0


In [123]:
get_metrics1(rf1)


    The accuracy for our model is: 71.08%
    The True Positive Rate is: 45.55%
    The False Positive Rate is: 13.03%
    The True Negative Rate is: 86.97%
    The False Negative Rate is: 54.45%
    
    
    


Unnamed: 0,precision,recall,f1-score,support
0,0.719677,0.869707,0.787611,307.0
1,0.685039,0.455497,0.54717,191.0
accuracy,0.710843,0.710843,0.710843,0.710843
macro avg,0.702358,0.662602,0.66739,498.0
weighted avg,0.706392,0.710843,0.695393,498.0


### The first RF model is better (min_samples_leaf at default 1 & max_depth = 10)
### Why?
- I think this is due to the end node being minimized.

# Create a third model (my choice) with default min_samples_leaf and max_depth = 3

In [125]:
#create the Random Forest object 
rf2 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=3210)

In [126]:
# fit the model
rf2 = rf2.fit(X_train, y_train)
rf2

RandomForestClassifier(max_depth=3, random_state=3210)

In [127]:
# visualize the importance weight
feature_importances2 = pd.DataFrame(rf2.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
feature_importances2

Unnamed: 0,importance
fare,0.417305
class_3,0.20345
Embarked_S,0.104492
sibsp,0.087175
alone,0.078015
parch,0.056534
class_2,0.031452
Embarked_Q,0.021579
is_female,0.0


In [128]:
#make predictions
y_pred2 = rf2.predict(X_train)

In [129]:
# estimate probability: 
# since we only have two options (0 or 1)
# we have two columns and 891 rows (passengers)
# each passenger has the probablity of 0 or 1
# under 0.5 = 0 = death
# above 0.1 = 1 = survived

y_pred_proba2 = rf2.predict_proba(X_train)
y_pred_proba2[0:3]

array([[0.78778877, 0.21221123],
       [0.80206178, 0.19793822],
       [0.78778877, 0.21221123]])

In [130]:
# compute accuracy
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf2.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.72


In [131]:
# create confusion matrix
confm2 = confusion_matrix(y_train, y_pred2)
confm2

array([[268,  39],
       [101,  90]])

In [132]:
#create a function to calculate the metrics
def get_metrics2(rf):
    accuracy = rf.score(X_train, y_train)
    class_report = pd.DataFrame(classification_report(y_train, y_pred2, output_dict=True)).T
    tpr = confm2[1][1] / confm2[1].sum()
    fpr = confm2[0][1] / confm2[0].sum()
    tnr = confm2[0][0] / confm2[0].sum()
    fnr = confm2[1][0] / confm2[1].sum()
    print(f'''
    The accuracy for our model is: {accuracy:.2%}
    The True Positive Rate is: {tpr:.2%}
    The False Positive Rate is: {fpr:.2%}
    The True Negative Rate is: {tnr:.2%}
    The False Negative Rate is: {fnr:.2%}
    
    
    ''')
    return class_report

In [133]:
get_metrics2(rf2)


    The accuracy for our model is: 71.89%
    The True Positive Rate is: 47.12%
    The False Positive Rate is: 12.70%
    The True Negative Rate is: 87.30%
    The False Negative Rate is: 52.88%
    
    
    


Unnamed: 0,precision,recall,f1-score,support
0,0.726287,0.872964,0.792899,307.0
1,0.697674,0.471204,0.5625,191.0
accuracy,0.718876,0.718876,0.718876,0.718876
macro avg,0.711981,0.672084,0.6777,498.0
weighted avg,0.715313,0.718876,0.704533,498.0


In [134]:
get_metrics1(rf1)


    The accuracy for our model is: 71.08%
    The True Positive Rate is: 45.55%
    The False Positive Rate is: 13.03%
    The True Negative Rate is: 86.97%
    The False Negative Rate is: 54.45%
    
    
    


Unnamed: 0,precision,recall,f1-score,support
0,0.719677,0.869707,0.787611,307.0
1,0.685039,0.455497,0.54717,191.0
accuracy,0.710843,0.710843,0.710843,0.710843
macro avg,0.702358,0.662602,0.66739,498.0
weighted avg,0.706392,0.710843,0.695393,498.0


In [135]:
get_metrics(rf)


    The accuracy for our model is: 85.94%
    The True Positive Rate is: 74.35%
    The False Positive Rate is: 6.84%
    The True Negative Rate is: 93.16%
    The False Negative Rate is: 25.65%
    
    
    


Unnamed: 0,precision,recall,f1-score,support
0,0.853731,0.931596,0.890966,307.0
1,0.871166,0.743455,0.80226,191.0
accuracy,0.859438,0.859438,0.859438,0.859438
macro avg,0.862448,0.837526,0.846613,498.0
weighted avg,0.860418,0.859438,0.856944,498.0


In [139]:
# make predictions of each feature
y_predv = rf.predict(X_validate)

In [140]:
y_predv1 = rf1.predict(X_validate)

In [141]:
y_predv2 = rf2.predict(X_validate)

In [98]:
# Estimate probability of each feature
y_pred_probav = rf.predict_proba(X_validate)

## After making a few models, which one has the best performance (or closest metrics) on both train and validate?

### Evaluate on out-of-sample data

In [142]:
#create a function to calculate the metrics
def valid_metrics(rf, y_predv):
    accuracy = rf.score(X_validate, y_validate)
    class_report = pd.DataFrame(classification_report(y_validate, y_predv, output_dict=True)).T
    #tpr = confm2[1][1] / confm2[1].sum()
    #fpr = confm2[0][1] / confm2[0].sum()
    #tnr = confm2[0][0] / confm2[0].sum()
    #fnr = confm2[1][0] / confm2[1].sum()
    print(f'''
    The accuracy for our model is: {accuracy:.2%}
    ''')
    return class_report

In [143]:
valid_metrics(rf, y_predv)


    The accuracy for our model is: 66.36%
    


Unnamed: 0,precision,recall,f1-score,support
0,0.702703,0.787879,0.742857,132.0
1,0.575758,0.463415,0.513514,82.0
accuracy,0.663551,0.663551,0.663551,0.663551
macro avg,0.63923,0.625647,0.628185,214.0
weighted avg,0.65406,0.663551,0.654978,214.0


In [144]:
valid_metrics(rf1, y_predv1)


    The accuracy for our model is: 69.63%
    


Unnamed: 0,precision,recall,f1-score,support
0,0.721854,0.825758,0.770318,132.0
1,0.634921,0.487805,0.551724,82.0
accuracy,0.696262,0.696262,0.696262,0.696262
macro avg,0.678387,0.656781,0.661021,214.0
weighted avg,0.688543,0.696262,0.686558,214.0


In [145]:
valid_metrics(rf2, y_predv2)


    The accuracy for our model is: 70.09%
    


Unnamed: 0,precision,recall,f1-score,support
0,0.723684,0.833333,0.774648,132.0
1,0.645161,0.487805,0.555556,82.0
accuracy,0.700935,0.700935,0.700935,0.700935
macro avg,0.684423,0.660569,0.665102,214.0
weighted avg,0.693596,0.700935,0.690697,214.0
