# Random Forest

In [33]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import acquire
import prepare
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


from pydataset import data

In [7]:
# read Titanic data from acquire
df = acquire.get_titanic_data()
df = prepare.prep_titanic(df)
df = df.drop(columns=['pclass', 'passenger_id', 'sex', 'embark_town'])
df.head()

Unnamed: 0,survived,sibsp,parch,fare,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,1,0,7.25,0,0,1,0,0,1
1,1,1,0,71.2833,0,1,0,1,0,0
2,1,0,0,7.925,1,1,0,0,0,1
3,1,1,0,53.1,0,1,0,0,0,1
4,0,0,0,8.05,1,0,1,0,0,1


In [8]:
from sklearn.model_selection import train_test_split

def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [10]:
# split into train, validate, test
train, validate, test = train_validate_test_split(df, target='survived', seed=123)

# Explore your data here. 

# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [18]:
# creating randomforest object

rf = RandomForestClassifier(max_depth=10, min_samples_leaf = 1,
                            random_state=123)

In [19]:
rf

RandomForestClassifier(max_depth=10, random_state=123)

## 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [20]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=123)

In [21]:
y_pred = rf.predict(X_train)
y_pred

array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,

## 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [22]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.94


In [23]:
print(confusion_matrix(y_train, y_pred))

[[302   5]
 [ 24 167]]


In [24]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.98      0.95       307
           1       0.97      0.87      0.92       191

    accuracy                           0.94       498
   macro avg       0.95      0.93      0.94       498
weighted avg       0.94      0.94      0.94       498



## 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [27]:
TN, FP, FN, TP = confusion_matrix(y_train, y_pred).ravel()

In [28]:
negative_cases = TN + FP
positive_cases = FN + TP
print(f"Negative Cases: {negative_cases}")
print(f"Positive Cases: {positive_cases}")
print(y_train.value_counts())

Negative Cases: 307
Positive Cases: 191
0    307
1    191
Name: survived, dtype: int64


In [29]:
ALL = TP + FP + FN + TN

accuracy = (TP + TN)/ALL
true_positive_rate = sensitivity = recall = power = TP/(TP+FN)
false_positive_rate = false_alarm_ratio = fallout = FP/(FP+TN)
true_negative_rate = specificity = selectivity = TN/(TN+FP)
false_negative_rate = miss_rate = FN/(FN+TP)
precision = PPV = TP/(TP+FP)
f1_score = 2*(precision*recall)/(precision+recall)
support_pos = TP + FN
support_neg = FP + TN

print(f"Accuracy: {accuracy}", '\n')
print(f"True Positive Rate/Sensitivity/Recall/Power: {true_positive_rate}", '\n')
print(f"False Positive Rate/False Alarm Ratio/Fall-out: {false_positive_rate}", '\n')
print(f"True Negative Rate/Specificity/Selectivity: {true_negative_rate}", '\n')
print(f"False Negative Rate/Miss Rate: {false_negative_rate}", '\n')
print(f"Precision/PPV: {precision}", '\n')
print(f"F1 Score: {f1_score}", '\n')
print(f"Support (0): {support_neg}", '\n')
print(f"Support (1): {support_pos}")

Accuracy: 0.9417670682730924 

True Positive Rate/Sensitivity/Recall/Power: 0.8743455497382199 

False Positive Rate/False Alarm Ratio/Fall-out: 0.016286644951140065 

True Negative Rate/Specificity/Selectivity: 0.9837133550488599 

False Negative Rate/Miss Rate: 0.1256544502617801 

Precision/PPV: 0.9709302325581395 

F1 Score: 0.9201101928374656 

Support (0): 307 

Support (1): 191


## 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [36]:
# Let's get loopy
for i in range(1, 10):
    for j in range(1, 10):
    # Make the model
        rf = RandomForestClassifier(max_depth=i,min_samples_leaf = j, random_state=123)

    # Fit the model (on train and only train)
        rf = rf.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
        y_predictions = rf.predict(X_train)

    # Produce the classification report on the actual y values and this model's predicted y values
        report = classification_report(y_train, y_predictions, output_dict=True)
        print(f"Rf with max depth of {i}", '\n')
        print(f"Rf with min samples leaf of {j}", '\n')
        print(pd.DataFrame(report))
        print()

Rf with max depth of 1 

Rf with min samples leaf of 1 

                    0           1  accuracy   macro avg  weighted avg
precision    0.820433    0.760000  0.799197    0.790217      0.797255
recall       0.863192    0.696335  0.799197    0.779764      0.799197
f1-score     0.841270    0.726776  0.799197    0.784023      0.797358
support    307.000000  191.000000  0.799197  498.000000    498.000000

Rf with max depth of 1 

Rf with min samples leaf of 2 

                    0           1  accuracy   macro avg  weighted avg
precision    0.820433    0.760000  0.799197    0.790217      0.797255
recall       0.863192    0.696335  0.799197    0.779764      0.799197
f1-score     0.841270    0.726776  0.799197    0.784023      0.797358
support    307.000000  191.000000  0.799197  498.000000    498.000000

Rf with max depth of 1 

Rf with min samples leaf of 3 

                    0           1  accuracy   macro avg  weighted avg
precision    0.820433    0.760000  0.799197    0.790217  

Rf with max depth of 3 

Rf with min samples leaf of 5 

                    0           1  accuracy   macro avg  weighted avg
precision    0.822823    0.800000  0.815261    0.811411      0.814069
recall       0.892508    0.691099  0.815261    0.791804      0.815261
f1-score     0.856250    0.741573  0.815261    0.798912      0.812267
support    307.000000  191.000000  0.815261  498.000000    498.000000

Rf with max depth of 3 

Rf with min samples leaf of 6 

                    0           1  accuracy   macro avg  weighted avg
precision    0.822823    0.800000  0.815261    0.811411      0.814069
recall       0.892508    0.691099  0.815261    0.791804      0.815261
f1-score     0.856250    0.741573  0.815261    0.798912      0.812267
support    307.000000  191.000000  0.815261  498.000000    498.000000

Rf with max depth of 3 

Rf with min samples leaf of 7 

                    0           1  accuracy   macro avg  weighted avg
precision    0.823708    0.786982  0.811245    0.805345  

Rf with max depth of 5 

Rf with min samples leaf of 9 

                    0           1  accuracy   macro avg  weighted avg
precision    0.823353    0.804878  0.817269    0.814116      0.816267
recall       0.895765    0.691099  0.817269    0.793432      0.817269
f1-score     0.858034    0.743662  0.817269    0.800848      0.814169
support    307.000000  191.000000  0.817269  498.000000    498.000000

Rf with max depth of 6 

Rf with min samples leaf of 1 

                    0           1  accuracy   macro avg  weighted avg
precision    0.863501    0.900621  0.875502    0.882061      0.877738
recall       0.947883    0.759162  0.875502    0.853523      0.875502
f1-score     0.903727    0.823864  0.875502    0.863795      0.873096
support    307.000000  191.000000  0.875502  498.000000    498.000000

Rf with max depth of 6 

Rf with min samples leaf of 2 

                    0           1  accuracy   macro avg  weighted avg
precision    0.849558    0.880503  0.859438    0.865030  

Rf with max depth of 8 

Rf with min samples leaf of 4 

                    0           1  accuracy   macro avg  weighted avg
precision    0.847561    0.829412  0.841365    0.838486      0.840600
recall       0.905537    0.738220  0.841365    0.821879      0.841365
f1-score     0.875591    0.781163  0.841365    0.828377      0.839375
support    307.000000  191.000000  0.841365  498.000000    498.000000

Rf with max depth of 8 

Rf with min samples leaf of 5 

                    0           1  accuracy   macro avg  weighted avg
precision    0.837838    0.830303  0.835341    0.834070      0.834948
recall       0.908795    0.717277  0.835341    0.813036      0.835341
f1-score     0.871875    0.769663  0.835341    0.820769      0.832673
support    307.000000  191.000000  0.835341  498.000000    498.000000

Rf with max depth of 8 

Rf with min samples leaf of 6 

                    0           1  accuracy   macro avg  weighted avg
precision    0.830357    0.827160  0.829317    0.828759  

In [37]:
# max depth of 9, min samples leaf of 1 has the highest accuracy

In [38]:
rf = RandomForestClassifier(max_depth=9, min_samples_leaf = 1,
                            random_state=123)

In [39]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=9, random_state=123)

In [40]:
y_pred = rf.predict(X_train)

In [41]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.94


In [42]:
print(confusion_matrix(y_train, y_pred))

[[301   6]
 [ 24 167]]


print(classification_report(y_train, y_pred))