In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import acquire
import prepare

# Exercises

#### Create a new notebook, `random_forests`, and work with the Titanic data to do the following:



In [2]:
# Load the raw Titanic passenger data through the project's acquire module.
# The recorded output ("Found CSV") suggests a local CSV was read rather than a
# remote source — presumably a cache; confirm in acquire.py.
df = acquire.get_titanic_data()

Found CSV


In [3]:
# Preview the first five raw rows to check the columns before preparation.
df.head(n=5)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [4]:
# Replace the raw frame with the prepared one. Compared with the raw preview, the
# prepared frame shown in the next cell has no age/class/deck/embark_town columns
# and gains the encoded sex_male / embarked_Q / embarked_S columns.
# NOTE(review): this rebinds `df`, so re-running the cell feeds already-prepared
# data back into prep_titanic — presumably not idempotent; confirm, or bind the
# result to a separate name.
df = prepare.prep_titanic(df)

In [5]:
# Preview the first five prepared rows to verify the encoded columns are present.
df.head(n=5)

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S
0,0,0,3,male,1,0,7.25,S,0,1,0,1
1,1,1,1,female,1,0,71.2833,C,0,0,0,0
2,2,1,3,female,0,0,7.925,S,1,0,0,1
3,3,1,1,female,1,0,53.1,S,0,0,0,1
4,4,0,3,male,0,0,8.05,S,1,1,0,1


In [6]:
# Split the prepared data into the three modelling subsets; 'survived' is passed
# as the second argument of the project's split helper.
splits = prepare.split_function(df, 'survived')
train, validate, test = splits

In [7]:
# Report (rows, columns) for each split, one line per split.
for split_name, split_df in (('Train', train), ('Validate', validate), ('Test', test)):
    print(f'{split_name}: {split_df.shape}')

Train: (534, 12)
Validate: (178, 12)
Test: (179, 12)


In [8]:
# Preview the first five rows of the training split.
train.head(n=5)

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S
455,455,1,3,male,0,0,7.8958,C,1,1,0,0
380,380,1,1,female,0,0,227.525,C,1,0,0,0
492,492,0,1,male,0,0,30.5,S,1,1,0,1
55,55,1,1,male,0,0,35.5,S,1,1,0,1
243,243,0,3,male,0,0,7.125,S,1,1,0,1


In [9]:
# Class balance of the target in the training split (basis for the baseline below).
train['survived'].value_counts()

0    329
1    205
Name: survived, dtype: int64

In [10]:
# Baseline model: always predict the most frequent class in train. Its accuracy is
# that class's share of the training rows, computed from the data instead of
# hand-copied counts so it stays correct if the split or the data changes.
baseline_accuracy = train.survived.value_counts(normalize=True).max()
print(f'Baseline Accuracy: {baseline_accuracy:.02%}')

Baseline Accuracy: 61.61%


In [11]:
# Same five-row preview of the training split as shown earlier in this notebook,
# repeated here ahead of the feature/target separation.
train.head(n=5)

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S
455,455,1,3,male,0,0,7.8958,C,1,1,0,0
380,380,1,1,female,0,0,227.525,C,1,0,0,0
492,492,0,1,male,0,0,30.5,S,1,1,0,1
55,55,1,1,male,0,0,35.5,S,1,1,0,1
243,243,0,3,male,0,0,7.125,S,1,1,0,1


In [12]:
# Name of the label column; used below to pull the y_* series out of each split.
target = 'survived'

In [13]:
# Target series for the training split, with a quick five-row look.
y_train = train.loc[:, target]
y_train.head(n=5)

455    1
380    1
492    0
55     1
243    0
Name: survived, dtype: int64

In [14]:
# Target series for the two out-of-sample splits.
y_validate, y_test = validate[target], test[target]

-

In [15]:
# Build the feature matrix by dropping non-feature columns BY NAME instead of by
# position (`columns[2:]` silently breaks if the column order ever changes):
#   - passenger_id: row identifier
#   - target: the label being predicted
#   - sex, embarked: raw string columns; their encoded counterparts
#     (sex_male, embarked_Q, embarked_S) stay in as features.
X_train = train.drop(columns=['passenger_id', target, 'sex', 'embarked'])
X_train.head()

Unnamed: 0,pclass,sibsp,parch,fare,alone,sex_male,embarked_Q,embarked_S
455,3,0,0,7.8958,1,1,0,0
380,1,0,0,227.525,1,0,0,0
492,1,0,0,30.5,1,1,0,1
55,1,0,0,35.5,1,1,0,1
243,3,0,0,7.125,1,1,0,1


In [16]:
# Same feature selection for the out-of-sample splits, by column name rather than
# by position so a change in column order cannot leak the id or the target into X.
non_feature_cols = ['passenger_id', target, 'sex', 'embarked']
X_validate = validate.drop(columns=non_feature_cols)
X_test = test.drop(columns=non_feature_cols)

#### 1. Fit the Random Forest classifier to your training sample and transform (i.e., make predictions on the training sample), setting `random_state` accordingly, `min_samples_leaf = 1`, and `max_depth = 10`.

In [24]:
# Exercise 1 hyperparameters; the fixed random_state keeps the fitted forest reproducible.
rf = RandomForestClassifier(
    max_depth=10,
    min_samples_leaf=1,
    random_state=123,
)
rf

In [25]:
# Train the forest on the training features and labels only.
rf.fit(X=X_train, y=y_train)

#### 2. Evaluate your results using the model score, confusion matrix, and classification report.



In [26]:
# In-sample accuracy (mean accuracy on the training split).
rf.score(X=X_train, y=y_train)

0.9438202247191011

In [27]:
# Out-of-sample accuracy on the validate split, for comparison with the train score.
rf.score(X=X_validate, y=y_validate)

0.7584269662921348

In [28]:
# Predicted classes for the training split (in-sample predictions).
y_pred = rf.predict(X=X_train)

In [29]:
# Raw confusion-matrix counts for the in-sample predictions.
confusion_matrix(y_true=y_train, y_pred=y_pred)

array([[324,   5],
       [ 25, 180]])

In [30]:
# Distinct target classes in ascending order, used to label the confusion matrix.
unique_classes = y_train.unique()
labels = sorted(unique_classes)
labels

[0, 1]

In [31]:
# Wrap the confusion-matrix counts in a DataFrame with '<class>_actual' row labels
# and '<class>_predict' column labels.
actual_names = [f'{label}_actual' for label in labels]
predicted_names = [f'{label}_predict' for label in labels]
pd.DataFrame(confusion_matrix(y_train, y_pred), index=actual_names, columns=predicted_names)

Unnamed: 0,0_predict,1_predict
0_actual,324,5
1_actual,25,180


In [32]:
# Per-class precision / recall / f1-score / support for the in-sample predictions.
train_report = classification_report(y_train, y_pred)
print(train_report)

              precision    recall  f1-score   support

           0       0.93      0.98      0.96       329
           1       0.97      0.88      0.92       205

    accuracy                           0.94       534
   macro avg       0.95      0.93      0.94       534
weighted avg       0.95      0.94      0.94       534



#### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.



#### 4. Run through steps 1–3 again, increasing your `min_samples_leaf` and decreasing your `max_depth`.



#### 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

