# **05 Random Forest**

### Importing Packages

In [8]:
# Importing required packages
import os
import glob
import pandas as pd
import numpy as np
import pickle
from itertools import islice
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

# Show the current working directory (displayed as the cell's last expression);
# the relative "tables/..." paths used below are resolved against it
os.getcwd()

'c:\\Users\\toofa\\OneDrive - Baylor College of Medicine\\APSA\\2024-07-29_Texas-STAR'

### Reading in and cleaning the Texas STAR dataset

In [12]:
# Load the cleaned Texas STAR table (path is relative to the working directory)
# and report its (rows, columns) dimensions
df = pd.read_excel(io="tables/df_clean.xlsx")
print(df.shape)

(40039, 62)


In [13]:
# Degree paths included in the random forest analysis
md_degree_paths = ["MD", "MD-PhD", "MD-MPH", "MD-MBA", "MD-MSc"]

# Modelling columns. Order matters: the outcome ("Matched") comes first and
# "degrees" last, because later cells select the predictors by position.
cols_to_keep = [
    "Matched",
    "Public_School",
    "IvyP",
    "step1_centered",
    "step2_centered",
    "X..Honored.Clerkships",
    "Honors.A.This.Specialty",
    "AOA.Sigma",
    "GHHS",
    "Research.Year",
    "Absence.Year",
    "Couples.Match",
    "X..Research.Experiences",
    "X..Abstracts..Pres..Posters",
    "X..Peer.Rev.Publications",
    "X..Volunteer.Experiences",
    "X..Leadership.Positions",
    "Required.to.Remediate",
    "degrees",
]

# Keep only the selected degree paths and columns, then drop rows with any missing value
df_clean = df.loc[df["degrees"].isin(md_degree_paths), cols_to_keep].dropna()

# Integer-encode the binary yes/no columns. The single encoder is re-fitted on each
# column in turn, so afterwards it only holds the classes of the last column.
label_encoder = LabelEncoder()
binary_features = [
    "Matched",
    "Public_School",
    "IvyP",
    "Honors.A.This.Specialty",
    "AOA.Sigma",
    "GHHS",
    "Research.Year",
    "Absence.Year",
    "Couples.Match",
    "Required.to.Remediate",
]
for binary_col in binary_features:
    df_clean[binary_col] = label_encoder.fit_transform(df_clean[binary_col])

# Min-max scale the numeric columns into the 0-1 range
# (AOA.Sigma and GHHS are in this list too, although they were encoded above)
numerical_features = [
    "step1_centered",
    "step2_centered",
    "X..Honored.Clerkships",
    "X..Research.Experiences",
    "X..Abstracts..Pres..Posters",
    "X..Peer.Rev.Publications",
    "X..Volunteer.Experiences",
    "X..Leadership.Positions",
    "AOA.Sigma",
    "GHHS",
]
scaler = MinMaxScaler()
df_clean[numerical_features] = scaler.fit_transform(df_clean[numerical_features])

### Random Forest to predict match outcomes for ALL students

In [14]:
# Separate outcome and predictors by position: column 0 is the outcome and
# columns 1-17 are the predictors (the trailing "degrees" column is left out)
y = df_clean.iloc[:, 0]
X = df_clean.iloc[:, 1:18]

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=117
)

In [15]:
# Base estimator for the search (fixed seed)
rf = RandomForestClassifier(random_state=0)

# Hyperparameter grid: 7 * 7 * 6 * 6 * 9 = 15,876 candidate combinations
cv_params = dict(
    max_depth=[2, 4, 6, 8, 10, 12, None],
    min_samples_leaf=[1, 2, 3, 4, 6, 8, 10],
    min_samples_split=[2, 3, 4, 6, 8, 10],
    max_features=[2, 3, 4, 6, 8, 10],
    n_estimators=[2, 5, 10, 15, 20, 30, 40, 50, 100],
)

# Scoring metrics recorded for every candidate
scoring = ['accuracy', 'precision', 'recall', 'f1']

# Exhaustive 10-fold grid search, refit on the best-F1 candidate.
# Takes a LONG time - only run once.
rf_cv = GridSearchCV(
    estimator=rf,
    param_grid=cv_params,
    scoring=scoring,
    refit="f1",
    cv=10,
    n_jobs=20,
    verbose=2,
)
rf_cv.fit(X_train, y_train)

Fitting 10 folds for each of 15876 candidates, totalling 158760 fits


In [17]:
# Display the best parameter combination, then its mean cross-validated score
print(rf_cv.best_params_, rf_cv.best_score_, sep="\n")

{'max_depth': 10, 'max_features': 6, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 40}
0.9369729682540114


In [18]:
def make_results(model_name, model_object):
    '''
    Summarise the best cross-validated candidate of a fitted grid search.

    Parameters
    ----------
    model_name : str
        Label written to the ``model`` column of the returned table.
    model_object : object
        Fitted search object exposing ``cv_results_`` with the entries
        ``mean_test_f1``, ``mean_test_recall``, ``mean_test_precision`` and
        ``mean_test_accuracy`` (e.g. a GridSearchCV fitted with those four scorers).

    Returns
    -------
    pandas.DataFrame
        One-row table with the columns ``model``, ``precision``, ``recall``,
        ``F1`` and ``accuracy`` for the candidate with the highest mean F1
        across the validation folds (the first one if several tie).
    '''

    # Get all the results from the CV and put them in a df
    cv_results = pd.DataFrame(model_object.cv_results_)

    # idxmax() returns an index *label*, so the row has to be looked up with
    # .loc; .iloc would only be correct when labels happen to equal positions
    best_label = cv_results['mean_test_f1'].idxmax()
    best_estimator_results = cv_results.loc[best_label]

    # Create the one-row table of results from that candidate's mean fold scores
    table = pd.DataFrame(
        {
            'model': [model_name],
            'precision': [best_estimator_results['mean_test_precision']],
            'recall': [best_estimator_results['mean_test_recall']],
            'F1': [best_estimator_results['mean_test_f1']],
            'accuracy': [best_estimator_results['mean_test_accuracy']],
        },
    )

    return table

In [19]:
# Cross-validated metrics of the best candidate from the all-students search
rf_cv_results = make_results("rf_all", rf_cv)
rf_cv_results

Unnamed: 0,model,precision,recall,F1,accuracy
0,rf_all,0.881743,0.999584,0.936973,0.881519


In [21]:
# Saving (pickling) the fitted search - uncomment to persist it after running the grid search
#with open('tables/rf_cv_model.pickle', 'wb') as to_write:
#    pickle.dump(rf_cv, to_write)

# Reading the pickled search back. It is bound to `rf_cv` (the name every later
# cell uses) so that prediction and scoring run on the restored model.
# NOTE(security): pickle.load can execute arbitrary code - only load a file you created yourself.
with open('tables/rf_cv_model.pickle', 'rb') as to_read:
    rf_cv = pickle.load(to_read)

In [20]:
# Predict on the held-out test set, then show the search's score on it
# (with refit="f1" in the grid search above this should be the F1 score - confirm)
y_pred = rf_cv.predict(X=X_test)
rf_cv.score(X=X_test, y=y_test)

0.9367905630981527

In [26]:
# Feature importances of the refit best estimator, labelled by predictor name
# and ordered from most to least important
best_forest = rf_cv.best_estimator_
feat_importances = pd.DataFrame(best_forest.feature_importances_, index=X.columns)
feat_importances = feat_importances.sort_values(by=0, ascending=False)
feat_importances


Unnamed: 0,0
step2_centered,0.124787
step1_centered,0.116756
X..Volunteer.Experiences,0.108681
X..Leadership.Positions,0.104958
X..Research.Experiences,0.104125
X..Abstracts..Pres..Posters,0.089429
X..Peer.Rev.Publications,0.085796
X..Honored.Clerkships,0.081325
GHHS,0.027059
AOA.Sigma,0.026751


### Running random forest with tuned hyperparameters on each individual degree path

In [27]:
def rf_spec(df, degree_path):
    """
    Fit and evaluate the tuned random forest on a single degree path.

    Parameters
    ----------
    df : pandas.DataFrame
        Modelling frame containing a "degrees" column. Columns are selected by
        position: column 0 is the outcome and columns 1-17 are the predictors.
    degree_path : str
        Value of the "degrees" column to keep (e.g. "MD-PhD").

    Returns
    -------
    features : pandas.DataFrame
        One row per predictor with the columns "feature", "importance",
        "degree" and "size" (number of rows for this degree path before the
        train/test split), ordered by descending importance.
    rf_cv_results : pandas.DataFrame
        One-row table of cross-validated metrics produced by make_results.

    Raises
    ------
    ValueError
        If no row of ``df`` has the requested degree path.
    """

    # Filtering the dataframe by degree path
    print("Running random forest classifier on: {}".format(degree_path))
    df_filt = df[df["degrees"]==degree_path]

    # Fail early with a clear message instead of an opaque error from the split
    if df_filt.empty:
        raise ValueError("No rows found for degree path: {}".format(degree_path))

    # Train test split (stratified 80/20, fixed seed)
    X = df_filt.iloc[:, 1:18]
    y = df_filt.iloc[:, 0]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=117, test_size=0.2, stratify=y)

    # RandomForest Classifier
    rf = RandomForestClassifier(random_state=0)

    # Hyperparameters fixed to the tuned values: a single-candidate grid
    cv_params = {'max_depth': [10],
                'min_samples_leaf': [1],
                'min_samples_split': [4],
                'max_features': [6],
                'n_estimators': [40]}

    # scoring metrics
    scoring = ['accuracy', 'precision', 'recall', 'f1']

    # GridSearchCV is used here only to get 10-fold cross-validated metrics
    # for the one candidate above (10 fits in total)
    rf_cv = GridSearchCV(rf, param_grid=cv_params, scoring=scoring, cv=10, refit="f1", n_jobs=10, verbose=1)
    rf_cv.fit(X_train, y_train)

    # Score of the refit model on the held-out test set
    print(rf_cv.score(X_test, y_test))

    # Performance metrics (cross-validated)
    rf_cv_results = make_results(model_name=degree_path, model_object=rf_cv)
    display(rf_cv_results)

    # Feature importances, most important first
    features = pd.DataFrame(rf_cv.best_estimator_.feature_importances_, index=X.columns).sort_values(0, ascending=False)

    # Cleaning and exporting
    features["degree"] = degree_path
    features = features.reset_index(drop=False)
    features.columns = ["feature", "importance", "degree"]
    features['size'] = X.shape[0]
    return features, rf_cv_results

In [28]:
# Fit and evaluate the tuned random forest separately for every degree path
feat_md, res_md = rf_spec(df_clean, "MD")
feat_mdphd, res_mdphd = rf_spec(df_clean, "MD-PhD")
feat_mdmph, res_mdmph = rf_spec(df_clean, "MD-MPH")
feat_mdmba, res_mdmba = rf_spec(df_clean, "MD-MBA")
feat_mdmsc, res_mdmsc = rf_spec(df_clean, "MD-MSc")

Running random forest classifier on: MD
Fitting 10 folds for each of 1 candidates, totalling 10 fits
0.938337801608579


Unnamed: 0,model,precision,recall,F1,accuracy
0,MD,0.884438,0.999429,0.938424,0.884057


Running random forest classifier on: MD-PhD
Fitting 10 folds for each of 1 candidates, totalling 10 fits
0.9215686274509803


Unnamed: 0,model,precision,recall,F1,accuracy
0,MD-PhD,0.877795,0.97925,0.925661,0.862528


Running random forest classifier on: MD-MPH
Fitting 10 folds for each of 1 candidates, totalling 10 fits
0.9309576837416481


Unnamed: 0,model,precision,recall,F1,accuracy
0,MD-MPH,0.887,0.984706,0.933224,0.875762


Running random forest classifier on: MD-MBA
Fitting 10 folds for each of 1 candidates, totalling 10 fits
0.9253731343283582


Unnamed: 0,model,precision,recall,F1,accuracy
0,MD-MBA,0.885712,0.973504,0.92718,0.865057


Running random forest classifier on: MD-MSc
Fitting 10 folds for each of 1 candidates, totalling 10 fits
0.912621359223301


Unnamed: 0,model,precision,recall,F1,accuracy
0,MD-MSc,0.848936,0.983364,0.911188,0.837546


In [30]:
# Stack the per-degree feature importances and CV scores into two long tables
features = pd.concat([feat_md, feat_mdphd, feat_mdmph, feat_mdmba, feat_mdmsc])
scores = pd.concat([res_md, res_mdphd, res_mdmph, res_mdmba, res_mdmsc])
display(features, scores)

# exporting to excel
#features.to_excel("tables/random_forest_feature_importances.xlsx", index=False)
#scores.to_excel("tables/random_forest_model_scores.xlsx", index=False)

Unnamed: 0,feature,importance,degree,size
0,step2_centered,0.119766,MD,19806
1,step1_centered,0.118407,MD,19806
2,X..Leadership.Positions,0.107200,MD,19806
3,X..Volunteer.Experiences,0.102633,MD,19806
4,X..Research.Experiences,0.096434,MD,19806
...,...,...,...,...
12,Absence.Year,0.020664,MD-MSc,1685
13,Couples.Match,0.016605,MD-MSc,1685
14,IvyP,0.014778,MD-MSc,1685
15,Research.Year,0.012866,MD-MSc,1685


Unnamed: 0,model,precision,recall,F1,accuracy
0,MD,0.884438,0.999429,0.938424,0.884057
0,MD-PhD,0.877795,0.97925,0.925661,0.862528
0,MD-MPH,0.887,0.984706,0.933224,0.875762
0,MD-MBA,0.885712,0.973504,0.92718,0.865057
0,MD-MSc,0.848936,0.983364,0.911188,0.837546


In [29]:
# Dimensions (rows, columns) of the cleaned modelling frame
df_clean.shape

(23897, 19)

In [25]:
# Number of rows per degree path in the cleaned modelling frame
df_clean['degrees'].value_counts()

degrees
MD        19806
MD-MSc     1685
MD-MPH     1208
MD-PhD      828
MD-MBA      370
Name: count, dtype: int64