Random Forest

1. Create a few different tree ensemble models.
2. Visualize feature importances, and compare individual trees from a Random Forest to see the differences in the features they were trained on.

Note that the model is already good enough, so there is no need to go through GridSearch or further optimization of the random forest.

In [None]:
import pandas as pd
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
%matplotlib inline
# import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
import pickle
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Run the data-preparation notebook before anything else in this notebook.
# NOTE(review): the cells below use X_train_all, X_test_all, y_train, y_test and y
# without defining them, so they are presumably created by data_prep.ipynb — confirm.
%run ./data_prep.ipynb

In [None]:
# Bar chart of how often each distinct value occurs in y
# (the trailing semicolon suppresses the Axes repr in the notebook output).
y.value_counts().plot.bar();

In [None]:
# Baseline: a single decision tree, depth capped at 10, fitted on the full
# training feature set and then used to predict the test set.
tree = DecisionTreeClassifier(max_depth=10)
pred = tree.fit(X_train_all, y_train).predict(X_test_all)

In [None]:
def plot_feature_importances(model, feature_names=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Bars are sorted by importance, smallest at the bottom and largest at the
    top; ties keep the original column order.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``feature_importances_``.
    feature_names : sequence of str, optional
        Labels for the importances, in the column order the model was
        trained on. When omitted, the names the model recorded at fit time
        (``feature_names_in_``) are used, so a model trained on a reduced
        feature set is labelled correctly. If the model recorded none, the
        columns of the notebook-level ``X_train_all`` are used.

    Raises
    ------
    ValueError
        If the number of names differs from the number of importances,
        which would otherwise pair importances with the wrong labels.
    """
    importances = np.asarray(model.feature_importances_)

    if feature_names is None:
        feature_names = getattr(model, "feature_names_in_", None)
    if feature_names is None:
        # Legacy behaviour: fall back to the notebook's full training frame.
        feature_names = X_train_all.columns
    feature_names = list(feature_names)

    # Validate before touching matplotlib so a mismatch fails loudly instead
    # of silently zipping names against the wrong importances.
    if len(feature_names) != len(importances):
        raise ValueError(
            f"Got {len(feature_names)} feature names for "
            f"{len(importances)} feature importances"
        )

    # Stable ascending sort: least important first, ties in column order.
    order = np.argsort(importances, kind="stable")
    positions = np.arange(len(importances))

    plt.figure(figsize=(10, 10))
    plt.barh(positions, importances[order], align='center')
    plt.yticks(positions, [feature_names[i] for i in order])
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")

# Show which features the baseline decision tree relies on.
plot_feature_importances(model=tree)

In [None]:
# Compare the tree's test-set predictions with the true labels:
# confusion matrix first, then the per-class precision/recall/F1 report.
print(
    confusion_matrix(y_test, pred),
    classification_report(y_test, pred),
    sep="\n",
)
# The author observed a perfect score here ("already a perfect model!").
# NOTE(review): perfect accuracy on held-out data can indicate that a feature
# leaks the target — worth confirming before trusting this result.
print(f"Accuracy: {accuracy_score(y_test, pred)}")

In [None]:
# Copies of the train/test feature frames without the 'Popularity' column;
# the original frames are left unmodified.
X_train_np = X_train_all.drop(columns='Popularity')
X_test_np = X_test_all.drop(columns='Popularity')

In [None]:
# Same depth-10 decision tree as before, now trained and evaluated on the
# feature set without 'Popularity'. Rebinds `tree` and `pred`.
tree = DecisionTreeClassifier(max_depth=10)
pred = tree.fit(X_train_np, y_train).predict(X_test_np)
print(f"Accuracy: {accuracy_score(y_test, pred)}")

In [None]:
# Random forest with each tree capped at depth 10, fitted on the reduced
# feature set.
rf = RandomForestClassifier(max_depth=10).fit(X_train_np, y_train)

# Training score first, test score second; a large gap would suggest overfitting.
print(
    rf.score(X_train_np, y_train),
    rf.score(X_test_np, y_test),
    sep="\n",
)

In [None]:
# Average score of the random forest across 3 cross-validation folds of the
# training data.
mean_rf_cv_score = cross_val_score(rf, X_train_np, y_train, cv=3).mean()
mean_rf_cv_score

In [None]:
# Candidate settings to search: 4 forest sizes x 2 split criteria = 8 combinations.
rf_param_grid = dict(
    n_estimators=[5, 10, 30, 100],
    criterion=['gini', 'entropy'],
)

# Exhaustive search with 3-fold cross-validation on the training data only.
rf_grid_search = GridSearchCV(estimator=rf, param_grid=rf_param_grid, cv=3)
rf_grid_search.fit(X_train_np, y_train)

# best_score_ is the mean cross-validated score of the winning combination,
# computed on training folds; it is not the accuracy on the test set.
print(rf_grid_search.best_score_)
# The parameter combination that achieved that score.
print(rf_grid_search.best_params_)

In [None]:
# Score of the grid search's best estimator on the held-out test set.
print(rf_grid_search.score(X=X_test_np, y=y_test))

In [None]:
# Nested cross-validation: each of the 5 outer folds re-runs the whole grid
# search on its training portion, so this cell is much slower than a single fit.
# Overwrites the `mean_rf_cv_score` computed earlier for the plain forest.
mean_rf_cv_score = cross_val_score(rf_grid_search, X_train_np, y_train, cv=5).mean()

# Reported on a 0-100 scale.
print(f"Mean Cross Validation Score: {mean_rf_cv_score * 100}")