# Random Forest Exercises

## Create a new notebook, random_forests, and work with titanic data to do the following:

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import acquire as ac
import prepare as prep

seed = 100

### 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [2]:
titanic = prep.titanic()
train, val, test = prep.train_val_test(titanic, strat='survived')
x_train, y_train = prep.split_x_y(train, 'survived')
x_val, y_val = prep.split_x_y(val, 'survived')

In [3]:
prep.baseline(train, 'survived', 1)  # Baseline scores

Baseline accuracy is: 61.64%.
Baseline recall is: 0.0%.
Baseline precision is: 0.0%.



In [4]:
rf1 = RandomForestClassifier(max_depth=10, min_samples_leaf=1, random_state=seed)
rf1.fit(x_train, y_train)

In [5]:
rf1_pred = rf1.predict(x_train)

### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [6]:
prep.evaluate(train,'survived', rf1_pred,1)

Model accuracy is: 94.86%.
Model recall is: 88.7%.
Model precision is: 97.7%.


In [7]:
rf1.score(x_train,y_train)

0.9486356340288925

In [8]:
pd.crosstab(y_train, rf1_pred)

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,379,5
1,27,212


In [9]:
# TP:212  TN:379  FP:5  FN:27

In [10]:
print(classification_report(y_train, rf1_pred))

              precision    recall  f1-score   support

           0       0.93      0.99      0.96       384
           1       0.98      0.89      0.93       239

    accuracy                           0.95       623
   macro avg       0.96      0.94      0.94       623
weighted avg       0.95      0.95      0.95       623



### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [11]:
# Done above

In [12]:
prep.importance(x_train, rf1)

Unnamed: 0,cols,importance
5,sex_male,0.295157
0,age,0.228031
3,fare,0.206306
1,sibsp,0.051978
6,class_First,0.050077
8,class_Third,0.046987
2,parch,0.038519
11,embark_town_Southampton,0.020958
4,alone,0.020774
7,class_Second,0.017502


### 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [13]:
x = range(1,11)
l = 10
model_list = []
scores = []
depth_list = []
samples = []

for n in x:
    model = RandomForestClassifier(max_depth=l, min_samples_leaf=n, random_state=seed)
    model.fit(x_train, y_train)
    model_list.append(f' Model {n}')
    scores.append(model.score(x_train,y_train))
    depth_list.append(l)
    samples.append(n)
    l -= 1
    
models = pd.DataFrame({'model':model_list,
              'score':scores,
              'depth':depth_list,
              'min samples leaf':samples})

models

Unnamed: 0,model,score,depth,min samples leaf
0,Model 1,0.948636,10,1
1,Model 2,0.894061,9,2
2,Model 3,0.874799,8,3
3,Model 4,0.863563,7,4
4,Model 5,0.855538,6,5
5,Model 6,0.836276,5,6
6,Model 7,0.826645,4,7
7,Model 8,0.82504,3,8
8,Model 9,0.791332,2,9
9,Model 10,0.75923,1,10


### 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [17]:
# The model with the most depth seems to perform better on TRAINING data. 

## After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [15]:
x = range(1,11)
l = 10
model_list = []
scores_train = []
scores_val = []
depth_list = []
samples = []

for n in x:
    model = RandomForestClassifier(max_depth=l, min_samples_leaf=n, random_state=seed)
    model.fit(x_train, y_train)
    model_list.append(f' Model {n}')
    scores_train.append(model.score(x_train,y_train))
    scores_val.append(model.score(x_val,y_val))
    depth_list.append(l)
    samples.append(n)
    predictions = model.predict(x_train)
    #prep.evaluate(train,'survived', predictions, target=1)
    #print()
    l -= 1
    
models = pd.DataFrame({'model':model_list,
                       'score train':scores_train,
                       'score val':scores_val,
                       'depth':depth_list,
                       'min samples leaf':samples})

models

Unnamed: 0,model,score train,score val,depth,min samples leaf
0,Model 1,0.948636,0.798507,10,1
1,Model 2,0.894061,0.80597,9,2
2,Model 3,0.874799,0.813433,8,3
3,Model 4,0.863563,0.80597,7,4
4,Model 5,0.855538,0.791045,6,5
5,Model 6,0.836276,0.813433,5,6
6,Model 7,0.826645,0.828358,4,7
7,Model 8,0.82504,0.820896,3,8
8,Model 9,0.791332,0.783582,2,9
9,Model 10,0.75923,0.738806,1,10


In [16]:
x = range(1,11)
l = 10
model_list = []
scores_train = []
scores_val = []
depth_list = []
samples = []

for n in x:
    model = RandomForestClassifier(max_depth=l, min_samples_leaf=n, random_state=seed)
    model.fit(x_train, y_train)
    model_list.append(f' Model {n}')
    scores_train.append(model.score(x_train,y_train))
    scores_val.append(model.score(x_val,y_val))
    depth_list.append(n)
    samples.append(n)
    l -= 1
    
models = pd.DataFrame({'model':model_list,
                       'score train':scores_train,
                       'score val':scores_val,
                       'depth':depth_list,
                       'min samples leaf':samples})

models

Unnamed: 0,model,score train,score val,depth,min samples leaf
0,Model 1,0.948636,0.798507,1,1
1,Model 2,0.894061,0.80597,2,2
2,Model 3,0.874799,0.813433,3,3
3,Model 4,0.863563,0.80597,4,4
4,Model 5,0.855538,0.791045,5,5
5,Model 6,0.836276,0.813433,6,6
6,Model 7,0.826645,0.828358,7,7
7,Model 8,0.82504,0.820896,8,8
8,Model 9,0.791332,0.783582,9,9
9,Model 10,0.75923,0.738806,10,10
