In [31]:
# install package needed for SMOTE
!pip install imblearn
# install package for tree visualization
!pip install graphviz

[0m

In [42]:
# load dependencies
import pandas as pd # for importing and handling data
import numpy as np # for working with arrays
from sklearn.model_selection import train_test_split # for splitting data
from imblearn.over_sampling import SMOTE # for smote oversampling

# imports from other code file (figure out what does what later)
import math
import matplotlib.pyplot as plt
import sklearn

from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, f1_score
from scipy.stats import pearsonr

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [26]:
# load data (you might have to change the filepath -- my project files are in a sub-folder called final_project)
seed = 2001

df = pd.read_csv("/home/jupyter/final_project/pisa_median.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,ST352Q06JA,AGE,ST004D01T,DURECEC,REPEAT,MISSSC,SKIPPING,TARDYSD,EXPECEDU,...,ST353Q07JA,ST353Q08JA,ST348Q01JA,ST348Q02JA,ST348Q03JA,ST348Q04JA,ST348Q05JA,ST348Q06JA,ST348Q07JA,ST348Q08JA
0,1.0,2.0,15.58,1.0,2.0,0.0,0.0,1.0,2.0,7.0,...,2.0,2.0,4.0,4.0,4.0,3.0,4.0,3.0,2.0,2.0
1,2.0,2.0,16.17,2.0,2.0,0.0,1.0,0.0,1.0,7.0,...,4.0,4.0,2.0,3.0,2.0,3.0,4.0,4.0,4.0,4.0
2,3.0,4.0,15.58,2.0,0.0,0.0,0.0,0.0,0.0,9.0,...,2.0,3.0,3.0,3.0,2.0,3.0,3.0,2.0,2.0,2.0
3,4.0,4.0,15.42,2.0,2.0,0.0,0.0,0.0,0.0,4.0,...,1.0,2.0,1.0,3.0,4.0,1.0,4.0,1.0,2.0,1.0
4,5.0,2.0,15.75,2.0,1.0,0.0,0.0,0.0,0.0,8.0,...,2.0,2.0,1.0,3.0,4.0,3.0,4.0,3.0,2.0,2.0


In [19]:
# split into target and features
y = df["ST352Q06JA"].values
X = df.drop(["ST352Q06JA"], axis=1).values

In [32]:
# split data into training and test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = 0.2, # common to use 20-30% of data as test set
                                                    random_state = seed, # set seed equivalent
                                                    stratify = y) # outcome amounts equal in train and test

In [33]:
# frequency of observations in training data pre-oversampling -- THIS BLOCK CAN BE DELETED
temp = y_train.astype(int)
np.bincount(temp)

array([   0, 1349, 2325, 2088, 3182])

In [34]:
# conduct oversampling on training data using SMOTE
# SMOTE overview here: https://www.youtube.com/watch?v=1Ic7GRtDrPM
smote = SMOTE(random_state = seed)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

In [37]:
# frequency of observations in training data post-oversampling -- THIS BLOCK CAN BE DELETED
temp2 = y_smote.astype(int)
np.bincount(temp2)

array([   0, 3182, 3182, 3182, 3182])

In [40]:
# define the parameter grid for hyperparameter tuning
param_grid = {'n_estimators': [50, 150, 250, 350, 450],
              'max_depth': [1,2,3,4,5,6,7,8,9,10],
              'max_features': [20, 50, 80, 110, 140]}

# define the evaluation metrics
metrics = {'MSE': mean_squared_error,
           'RMSE': lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
           'R-squared': r2_score,
           'Accuracy': accuracy_score,
           'F1 Score': f1_score}

# create a random forest regressor
rf = RandomForestClassifier(random_state=seed)

In [51]:
# Create a RandomizedSearchCV object
rand_rf = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    scoring = 'neg_mean_squared_error',
    n_iter = 1,
    n_jobs=4,
    cv=5, 
    refit = True,
    return_train_score=True,
    verbose=1)

# Fit the random search object to the data
rand_rf.fit(X_smote, y_smote)

rand_rf.cv_results_

Fitting 5 folds for each of 1 candidates, totalling 5 fits


{'mean_fit_time': array([135.38765655]),
 'std_fit_time': array([35.06115804]),
 'mean_score_time': array([0.22703066]),
 'std_score_time': array([0.08383141]),
 'param_n_estimators': masked_array(data=[250],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_max_features': masked_array(data=[140],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[7],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'n_estimators': 250, 'max_features': 140, 'max_depth': 7}],
 'split0_test_score': array([-1.07109191]),
 'split1_test_score': array([-1.03849175]),
 'split2_test_score': array([-1.04320503]),
 'split3_test_score': array([-0.90491159]),
 'split4_test_score': array([-0.97524558]),
 'mean_test_score': array([-1.00658917]),
 'std_test_score': array([0.05973249]),
 'rank_test_score': array([1], dtype=int32),
 'split0_train_score': array([-0.8880377

In [50]:
# Create a variable for the best model
best_rf = rand_rf.best_estimator_

# Print the best hyperparameters
print('Best hyperparameters:',  rand_rf.best_params_)

# Print out the score from the best-performing square
best_score = rand_rf.best_score_
print(best_score)

# Create a variable from the row related to the best-performing square
cv_results_df = pd.DataFrame(rand_rf.cv_results_)
best_row = cv_results_df.loc[[rand_rf.best_index_]]
print(best_row)

0.5329829444917549
   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0      80.166609      18.77178         0.201535        0.085879   

  param_n_estimators param_max_features param_max_depth  \
0                250                140               7   

                                              params  split0_test_score  \
0  {'n_estimators': 250, 'max_features': 140, 'ma...           0.544997   

   split1_test_score  ...  mean_test_score  std_test_score  rank_test_score  \
0           0.525992  ...         0.532983        0.014079                1   

   split0_train_score  split1_train_score  split2_train_score  \
0            0.573305             0.56478            0.575402   

   split3_train_score  split4_train_score  mean_train_score  std_train_score  
0            0.572327            0.567356          0.570634         0.003943  

[1 rows x 23 columns]
