This activity focuses on building models using the `RandomForestClassifier` from scikit-learn.
To evaluate the model you will use the out-of-bag (OOB) samples rather than a separate test set.

In [31]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import plot_tree, DecisionTreeClassifier
# Silence every warning for the rest of the session to keep cell output compact.
# NOTE(review): this is a blanket filter, so deprecation and diagnostic warnings
# raised by later cells are not shown — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [17]:
# Dataset location. Set the FETAL_HEALTH_CSV environment variable to point at a
# local copy of the file; when it is unset or empty, the original author's path
# is used so that the existing setup keeps working unchanged.
DATA_PATH = (
    os.environ.get("FETAL_HEALTH_CSV")
    or "C:/Users/micls/OneDrive/Desktop/ML Projects/fetal_health.csv"
)
df = pd.read_csv(DATA_PATH)

In [20]:
# Preview the first five rows to check that the columns loaded as expected.
df.head(n=5)

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.0,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.0,0.008,0.0,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0


In [21]:
# Class distribution of the target. The classes are imbalanced (1.0 dominates),
# so overall accuracy can look good even if the rarer classes are predicted poorly.
# NOTE(review): imbalance is a property of the data rather than "model bias";
# boosting is motivated by the bias of weak learners, not by these counts.
df.fetal_health.value_counts()

1.0    1655
2.0     295
3.0     176
Name: fetal_health, dtype: int64

In [22]:
# Sample variance of the numerically coded label (1.0 / 2.0 / 3.0).
# NOTE(review): this measures the spread of the label codes, not the model variance
# that bagging reduces. RandomForestClassifier bootstraps its trees by default.
df.fetal_health.var()

0.3774589120690531

In [18]:
# Feature matrix: every column except the label. Target vector: the label column.
X = df.drop(columns='fetal_health')
y = df['fetal_health']

# The standard random forest classifier (Bagging)

In [19]:
# Baseline random forest with default hyper-parameters.
# With oob_score enabled, each training row is scored only by the trees whose
# bootstrap sample did not contain it, so the resulting accuracy is an estimate
# of generalisation that does not require a separate test set.
random_state = 42  # seed that makes the bootstrap and feature sampling repeatable
oob_score = True   # ask the forest to compute the out-of-bag accuracy while fitting

# Use the settings declared above instead of repeating the literals.
forest_1 = RandomForestClassifier(oob_score=oob_score, random_state=random_state).fit(X, y)
score = forest_1.oob_score_
print(score)

0.9444967074317968


Increasing the number of trees yields diminishing returns: the OOB score rises quickly at first and then levels off.

In [13]:

# Fit one forest per ensemble size and record its out-of-bag accuracy.
# NOTE(review): with very few trees some rows may never be out-of-bag, so the
# first entries are only rough estimates — treat them as illustrative.
n_trees = [1, 10, 100, 500, 1000, 2000]
oob_scores = []
for n_estimators in n_trees:
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        oob_score=True,
        random_state=42,
    )
    forest.fit(X, y)
    oob_scores.append(forest.oob_score_)
print(oob_scores)

[0.8287864534336783, 0.9158043273753528, 0.9444967074317968, 0.944967074317968, 0.9468485418626529, 0.9468485418626529]


Changing the maximum tree depth can also improve the OOB score, but no single setting (or model) performs best in every situation.

In [15]:
# Fit a 200-tree forest for each depth limit and record its out-of-bag accuracy.
depths = [1, 2, 3, 4, 5, None]  # None places no limit on tree depth

depth_oobs = []
for max_depth in depths:
    forest = RandomForestClassifier(
        n_estimators=200,
        max_depth=max_depth,
        oob_score=True,
        random_state=42,
    )
    forest.fit(X, y)
    depth_oobs.append(forest.oob_score_)
print(depth_oobs)

[0.7784571966133584, 0.8598306679209784, 0.8955785512699906, 0.9059266227657573, 0.9158043273753528, 0.9468485418626529]


# The AdaBoost Classifier (Boosting)

In [24]:
# Hold out a test set using the default split proportions, seeded for repeatability.
# No feature scaling is applied: decision trees are not affected by scaling.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [29]:
# Grid-search AdaBoost over the number of boosting rounds and the depth of the
# decision-tree weak learner, then score the best model on the held-out test split.
#
# scikit-learn renamed AdaBoost's `base_estimator` argument to `estimator` in 1.2
# and removed the old name in 1.4. Pick whichever keyword the installed version
# accepts so this cell runs on both old and new releases.
estimator_kw = ('estimator'
                if 'estimator' in AdaBoostClassifier().get_params(deep=False)
                else 'base_estimator')

params = {'n_estimators': [100, 200],
          estimator_kw + '__max_depth': [1, 2, 3]}  # nested parameter of the weak learner
tree_grid = GridSearchCV(
    AdaBoostClassifier(random_state=42, **{estimator_kw: DecisionTreeClassifier()}),
    param_grid=params).fit(X_train, y_train)
grid_acc = tree_grid.score(X_test, y_test)

In [30]:
# Show the score that the fitted grid search obtained on the held-out test split.
print(grid_acc)

0.9417293233082706
