In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

import acquire
import preprocess
import prepare

Create a new notebook, random_forests, and work with titanic data to do the following:

Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

Evaluate your results using the model score, confusion matrix, and classification report.

Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

Run through steps increasing your min_samples_leaf and decreasing your max_depth.

What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [2]:
train, validate, test = prepare.splitting_data(prepare.prep_titanic(acquire.get_titanic_db()), 'survived')

this file exists, reading from csv file


In [3]:
train.head(3)

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone
776,776,0,3,male,0,0,7.75,Queenstown,1
829,829,1,1,female,0,0,80.0,Southampton,1
215,215,1,1,female,1,0,113.275,Cherbourg,0


In [4]:
train_encoded, val_encoded, test_encoded = preprocess.preprocess_titanic(train, validate, test)

In [5]:
train_encoded.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,embark_town_Queenstown,embark_town_Southampton,sex_male
776,0,3,0,0,7.75,1,1,0,1
829,1,1,0,0,80.0,1,0,1,0
215,1,1,1,0,113.275,0,0,0,0
258,1,1,0,0,512.3292,1,0,0,0
129,0,3,0,0,6.975,1,0,1,1


In [6]:
y_train = train_encoded.survived
y_validate = val_encoded.survived
y_test = test_encoded.survived

In [7]:
X_train = train_encoded.drop(columns='survived')
X_validate = val_encoded.drop(columns='survived')
X_test = test_encoded.drop(columns='survived')

Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

Evaluate your results using the model score, confusion matrix, and classification report.

Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [9]:
rf = RandomForestClassifier(max_depth=10, min_samples_leaf=1)

In [10]:
rf.fit(X_train, y_train)

In [12]:
y_pred = rf.predict(X_train)
y_pred[:10]

array([0, 1, 1, 1, 0, 0, 1, 0, 1, 0])

> Evaluate your results using the model score, confusion matrix, and classification report.

In [13]:
rf.score(X_train, y_train)

0.9456928838951311

In [15]:
confusion_matrix(y_train, y_pred)

array([[322,   7],
       [ 22, 183]])

In [19]:
count = pd.crosstab(y_train, y_pred)
count

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,322,7
1,22,183


> positive calculation: 'survived' : (1)

In [20]:
tp = count.iloc[1,1]
tn = count.iloc[0,0]
fp = count.iloc[0,1]
fn = count.iloc[1,0]

In [21]:
tp, tn, fp, fn

(183, 322, 7, 22)

Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [23]:
def compute_class_metrics(y_train, y_pred):
    '''
    counts = pd.crosstab(y_train, y_pred)
    TP = counts.iloc[1,1]
    TN = counts.iloc[0,0]
    FP = counts.iloc[0,1]
    FN = counts.iloc[1,0]
    
    all_ = (TP + TN + FP + FN)

    accuracy = (TP + TN) / all_

    TPR = recall = TP / (TP + FN)
    FPR = FP / (FP + TN)

    TNR = TN / (FP + TN)
    FNR = FN / (FN + TP)

    precision =  TP / (TP + FP)
    f1 =  2 * ((precision * recall) / ( precision + recall))

    support_pos = TP + FN
    support_neg = FP + TN
    
    print(f"Accuracy: {accuracy}\n")
    print(f"True Positive Rate/Sensitivity/Recall/Power: {TPR}")
    print(f"False Positive Rate/False Alarm Ratio/Fall-out: {FPR}")
    print(f"True Negative Rate/Specificity/Selectivity: {TNR}")
    print(f"False Negative Rate/Miss Rate: {FNR}\n")
    print(f"Precision/PPV: {precision}")
    print(f"F1 Score: {f1}\n")
    print(f"Support (0): {support_pos}")
    print(f"Support (1): {support_neg}")

In [26]:
compute_class_metrics(y_train, y_pred)

Accuracy: 0.9456928838951311

True Positive Rate/Sensitivity/Recall/Power: 0.8926829268292683
False Positive Rate/False Alarm Ratio/Fall-out: 0.02127659574468085
True Negative Rate/Specificity/Selectivity: 0.9787234042553191
False Negative Rate/Miss Rate: 0.1073170731707317

Precision/PPV: 0.9631578947368421
F1 Score: 0.9265822784810127

Support (0): 205
Support (1): 329


In [28]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.98      0.96       329
           1       0.96      0.89      0.93       205

    accuracy                           0.95       534
   macro avg       0.95      0.94      0.94       534
weighted avg       0.95      0.95      0.95       534



Run through steps increasing your min_samples_leaf and decreasing your max_depth.

What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [34]:
for x in range(1,11):
    rf = RandomForestClassifier(min_samples_leaf=x, max_depth=11-x, random_state=123)

    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_train)

    accuracy = rf.score(X_train, y_train)
    
    print(f'for min leaf samples = {x} and max depth = {11-x}, the accuracy is {round(accuracy,2)}')

for min leaf samples = 1 and max depth = 10, the accuracy is 0.95
for min leaf samples = 2 and max depth = 9, the accuracy is 0.9
for min leaf samples = 3 and max depth = 8, the accuracy is 0.87
for min leaf samples = 4 and max depth = 7, the accuracy is 0.86
for min leaf samples = 5 and max depth = 6, the accuracy is 0.86
for min leaf samples = 6 and max depth = 5, the accuracy is 0.84
for min leaf samples = 7 and max depth = 4, the accuracy is 0.83
for min leaf samples = 8 and max depth = 3, the accuracy is 0.81
for min leaf samples = 9 and max depth = 2, the accuracy is 0.79
for min leaf samples = 10 and max depth = 1, the accuracy is 0.78


> Model max depth 10 with minimum leaf sample 1 is best with the accuracy of 95%

In [38]:
stats = []

for x in range(1,11):
    rf = RandomForestClassifier(min_samples_leaf=x, max_depth=11-x, random_state=123)

    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_train)

    accuracy = rf.score(X_train, y_train)
    accuracy_validate = rf.score(X_validate, y_validate)
    
    stats.append([x, 11-x, round(accuracy,2),round(acc_val,2) ]) 
    
    print(f'min_leaf_samples = {x}, max_depth = {11-x}, accuracy_train = {round(accuracy, 2)}, validate_accuracy = {round(accuracy_validate,2)}')

min_leaf_samples = 1, max_depth = 10, accuracy_train = 0.95, validate_accuracy = 0.76
min_leaf_samples = 2, max_depth = 9, accuracy_train = 0.9, validate_accuracy = 0.75
min_leaf_samples = 3, max_depth = 8, accuracy_train = 0.87, validate_accuracy = 0.75
min_leaf_samples = 4, max_depth = 7, accuracy_train = 0.86, validate_accuracy = 0.74
min_leaf_samples = 5, max_depth = 6, accuracy_train = 0.86, validate_accuracy = 0.74
min_leaf_samples = 6, max_depth = 5, accuracy_train = 0.84, validate_accuracy = 0.74
min_leaf_samples = 7, max_depth = 4, accuracy_train = 0.83, validate_accuracy = 0.75
min_leaf_samples = 8, max_depth = 3, accuracy_train = 0.81, validate_accuracy = 0.76
min_leaf_samples = 9, max_depth = 2, accuracy_train = 0.79, validate_accuracy = 0.72
min_leaf_samples = 10, max_depth = 1, accuracy_train = 0.78, validate_accuracy = 0.73


In [25]:
rf.fit(X_train[['sex_male', 'pclass', 'fare']], y_train)

In [27]:
rf_features = [['sex_male', 'pclass', 'fare']]
