## 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [1]:
import pydataset
from env import get_db_url

import pandas as pd
import numpy as np

from prepare import prep_titanic
from prepare import titanic_split

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = prep_titanic()
df['baseline_prediction'] = 0
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embarked,alone,sex_male,embarked_Q,embarked_S,baseline_prediction
0,0,0,3,male,1,0,7.25,S,0,1,0,1,0
1,1,1,1,female,1,0,71.2833,C,0,0,0,0,0
2,2,1,3,female,0,0,7.925,S,1,0,0,1,0
3,3,1,1,female,1,0,53.1,S,0,0,0,1,0
4,4,0,3,male,0,0,8.05,S,1,1,0,1,0


In [3]:
df.survived.value_counts()

0    549
1    342
Name: survived, dtype: int64

## Baseline Accuracy

In [4]:
baseline_accuracy = (df.survived == df.baseline_prediction).mean()
baseline_accuracy

0.6161616161616161

## Cleaning the Data a little further

In [5]:
df['fare'] = df['fare'].astype('float64')

In [6]:
bins = [0, 25, 105, 205, 405, 600]
labels = [1, 2, 3, 4, 5]
df['fare_bin'] = pd.cut(df['fare'], bins=bins, labels=labels)

In [7]:
# df.pclass.get_dummies()
# Assuming df is your DataFrame
dummy_df = pd.get_dummies(df['pclass'], prefix='pclass')

# Concatenate the dummy variables with the original DataFrame
df = pd.concat([df, dummy_df], axis=1)

In [8]:
df.columns

Index(['passenger_id', 'survived', 'pclass', 'sex', 'sibsp', 'parch', 'fare',
       'embarked', 'alone', 'sex_male', 'embarked_Q', 'embarked_S',
       'baseline_prediction', 'fare_bin', 'pclass_1', 'pclass_2', 'pclass_3'],
      dtype='object')

In [9]:
df.dtypes

passenger_id              int64
survived                  int64
pclass                    int64
sex                      object
sibsp                     int64
parch                     int64
fare                    float64
embarked                 object
alone                     int64
sex_male                  uint8
embarked_Q                uint8
embarked_S                uint8
baseline_prediction       int64
fare_bin               category
pclass_1                  uint8
pclass_2                  uint8
pclass_3                  uint8
dtype: object

## Train, Validate, Test

In [10]:
df, train, validate, test = titanic_split(df)

In [11]:
# inspect
train.info(), validate.info(), test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 534 entries, 455 to 496
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   passenger_id         534 non-null    int64   
 1   survived             534 non-null    int64   
 2   pclass               534 non-null    int64   
 3   sex                  534 non-null    object  
 4   sibsp                534 non-null    int64   
 5   parch                534 non-null    int64   
 6   fare                 534 non-null    float64 
 7   embarked             534 non-null    object  
 8   alone                534 non-null    int64   
 9   sex_male             534 non-null    uint8   
 10  embarked_Q           534 non-null    uint8   
 11  embarked_S           534 non-null    uint8   
 12  baseline_prediction  534 non-null    int64   
 13  fare_bin             525 non-null    category
 14  pclass_1             534 non-null    uint8   
 15  pclass_2             

(None, None, None)

## Removing the lesser columns

In [12]:
# create X & y version of train/validate/test
# where X contains the features we want to use and y is a series with just the target variable

X_train = train.drop(columns=['passenger_id', 'survived', 'sex', 'fare', 'embarked', 'embarked_Q', 
        'embarked_S', 'baseline_prediction'])
y_train = train.survived
X_validate = validate.drop(columns=['passenger_id', 'survived', 'sex',  'fare', 'embarked', 'embarked_Q', 
        'embarked_S', 'baseline_prediction'])
y_validate = validate.survived
X_test = test.drop(columns=['passenger_id', 'survived', 'sex', 'fare', 'embarked', 'embarked_Q', 
        'embarked_S', 'baseline_prediction'])
y_test = test.survived

## Columns to train and validate

In [13]:
#Compare df columns

print(f"""train:
{train.columns.to_list()} 
_____________________________________________
X_train:
{X_train.columns.to_list()}
_____________________________________________
X_validate:
{X_validate.columns.to_list()}
""")

train:
['passenger_id', 'survived', 'pclass', 'sex', 'sibsp', 'parch', 'fare', 'embarked', 'alone', 'sex_male', 'embarked_Q', 'embarked_S', 'baseline_prediction', 'fare_bin', 'pclass_1', 'pclass_2', 'pclass_3'] 
_____________________________________________
X_train:
['pclass', 'sibsp', 'parch', 'alone', 'sex_male', 'fare_bin', 'pclass_1', 'pclass_2', 'pclass_3']
_____________________________________________
X_validate:
['pclass', 'sibsp', 'parch', 'alone', 'sex_male', 'fare_bin', 'pclass_1', 'pclass_2', 'pclass_3']



## Creating A function to Fit and Transom my data

In [14]:
def random_forest(k, l, X, y):
    
    rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=l,
                            n_estimators=100,
                            max_depth=k, 
                            random_state=123)
    rf_fit = rf.fit(X_train, y_train)
    
    y_train_pred = rf.predict(X_train)
    y_val_pred = rf.predict(X)
    
    y_proba = rf.predict_proba(X)
    
    
    return y_val_pred, y_train_pred, y_proba, k, rf, rf_fit

In [15]:
y_val_pred, y_train_pred, y_proba, k, rf, rf_fit = random_forest(10, 1, X_train, y_train)

## 2. Evaluate your results using the model score, confusion matrix, and classification report.


## Creating a Functionn to produce metrics

In [16]:
def decision_metrics(X, y, y_pred):
    score = rf.score(X, y)
    cm = confusion_matrix(y, y_pred)
    
    cmdf = pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], 
                   columns=['Pred 0', 'Pred 1'])
    
    report = classification_report(y, y_pred)
    
    return score, cm, cmdf, report

### Extracting the variables from the function and creating a list 

In [17]:
score, cm, cmdf, report = decision_metrics(X_train, y_train, y_train_pred)
metric_list = [score, cm, cmdf, report]

### Using the list to print out clearly

In [18]:
list = ['The train score for the model is:', 'Confusion Matrix:', 'Confusion Matrix DataFrame:', 'Classification Report:']
print(f'Model with Max Depth {k}:')
print()
for i, metric in enumerate(metric_list): # to add number to a tuple ex.: (o, seq[0]), (1, seq[1])
    if i == 2:
        print(list[2])
        display(metric)
        print('--------')
    else:
        print(list[i])
        print(metric)
        print('--------')

Model with Max Depth 10:

The train score for the model is:
0.8520599250936329
--------
Confusion Matrix:
[[302  27]
 [ 52 153]]
--------
Confusion Matrix DataFrame:


Unnamed: 0,Pred 0,Pred 1
Actual 0,302,27
Actual 1,52,153


--------
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.92      0.88       329
           1       0.85      0.75      0.79       205

    accuracy                           0.85       534
   macro avg       0.85      0.83      0.84       534
weighted avg       0.85      0.85      0.85       534

--------


## 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.


In [19]:
def print_cm_metrics(cm):    
    tn, fp, fn, tp = cm.ravel()

    accuracy = (tp + tn)/(tn + fp + fn + tp)

    true_positive_rate = tp/(tp + fn)
    false_positive_rate = fp/(fp + tn)
    true_negative_rate = tn/(tn + fp)
    false_negative_rate = fn/(fn + tp)

    precision = tp/(tp + fp)
    recall = tp/(tp + fn)
    f1_score = 2*(precision*recall)/(precision+recall)

    support_pos = tp + fn
    support_neg = fp + tn

    dict = {
        'metric' : ['accuracy'
                    ,'true_positive_rate'
                    ,'false_positive_rate'
                    ,'true_negative_rate'
                    ,'false_negative_rate'
                    ,'precision'
                    ,'recall'
                    ,'f1_score'
                    ,'support_pos'
                    ,'support_neg']
        ,'score' : [accuracy
                    ,true_positive_rate
                    ,false_positive_rate
                    ,true_negative_rate
                    ,false_negative_rate
                    ,precision
                    ,recall
                    ,f1_score
                    ,support_pos
                    ,support_neg]
    }

    return pd.DataFrame(dict)

In [20]:
print_cm_metrics(cm)

Unnamed: 0,metric,score
0,accuracy,0.85206
1,true_positive_rate,0.746341
2,false_positive_rate,0.082067
3,true_negative_rate,0.917933
4,false_negative_rate,0.253659
5,precision,0.85
6,recall,0.746341
7,f1_score,0.794805
8,support_pos,205.0
9,support_neg,329.0


## 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [21]:
i = 10
j = 1
train_depth = []
train_score_list = []

while i < 11:
        
        y_val_pred, y_train_pred, y_proba, k, rf_fit = random_forest(i, j, X_train, y_train)
        
        print(f'Training Model with Max Depth of: {i}')
        # print(k)
        
        # print(f'Mean:{y_val_pred.mean()}')
        # Compute score
        
        # print(y_val_pred.sum())
        score = clf.score(X_train, y_train)
        train_score_list.append(score)
        train_depth.append(i)
        
        basel
        # Print the model's accuracy and other information
        print(f"Model's Accuracy: {score}")
        print(f"Difference between Model and Basleine Accuracy: {score - baseline_accuracy}")
        print('-----------------')

        # Increment 'i' for the next iteration
        i += 1

TypeError: random_forest() missing 1 required positional argument: 'y'

## 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?