# Train 5 random forests in parallel

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score,confusion_matrix
from sklearn.model_selection import train_test_split
import copy

#### Used the best hyperparameters as determined by Geeling

In [3]:
def standard_scalar_normalize(data_train, data_test): 
    scaler = StandardScaler()
    scaler.fit(data_train)
    normal_data_train = scaler.transform(data_train)
    normal_data_test = scaler.transform(data_test)
    return normal_data_train,normal_data_test

In [4]:
def data_processed():
    '''
    Used to read & normalize processed test and train data
    '''
    data_train=pd.read_csv("data/2022-02-07_LOANS_TRAIN.csv")
    data_test=pd.read_csv("data/2022-02-07_LOANS_TEST.csv")
    y_train=data_train['loan_status']
    data_train.drop(columns=['loan_status', 'id', 'issue_d_in_months', 'issue_d_year', 'zip_state_match'], inplace=True)
    
    data_test.drop(columns=['id', 'issue_d_in_months', 'issue_d_year', 'zip_state_match'], inplace=True)
    
    normal_data_train,normal_data_test=standard_scalar_normalize(data_train,data_test)
    return normal_data_train,normal_data_test,y_train

def AUC_score(y_ground_truth,y_predicted_probability):
    return roc_auc_score(y_ground_truth, y_predicted_probability)

def to_submission(ids, y_test_predicted_probability):
    y_test=pd.DataFrame(y_test_predicted_probability,columns=['loan_status'], index=ids)
    y_test.index.name = 'id'
    y_test.to_csv('data/submission.csv')
    return

In [5]:
X_train, X_test, Y_train = data_processed()
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.33, random_state=42)

print(f"""
X_train shape: {X_train.shape}
y_train shape: {Y_train.shape}
X_val shape: {X_val.shape}
Y_val shape: {Y_val.shape}
""")


X_train shape: (132157, 89)
y_train shape: (132157,)
X_val shape: (65093, 89)
Y_val shape: (65093,)



### I'll shuffle the training data, then split it into 5 data sets (since there is ~5x as many data points for the majority class (Paid Off) than the minority classs (Charged Off)) 

In [8]:
count_0 = np.count_nonzero(Y_train == 0)
count_1 = np.count_nonzero(Y_train == 1)

factor = count_0/count_1

print(f"There are {count_0} training examples for class 0 and {count_1} training examples for class 1.")
print(f"There is approx {factor:.2f} times as many examples for class 0 than class 1.")

There are 111917 training examples for class 0 and 20240 training examples for class 1.
There is approx 5.53 times as many examples for class 0 than class 1.


For now, I'll just do 5 models and ignore the last 0.53 part of the training set.

In [9]:
# separate all the class 0 from the class 1 examples

indices_0 = np.where(Y_train == 0)
indices_1 = np.where(Y_train == 1)

training_0 = X_train[indices_0, :][0]
training_1 = X_train[indices_1, :][0]

print(f"""
class 0 : {training_0.shape}
class 1 : {training_1.shape}
""")


class 0 : (111917, 89)
class 1 : (20240, 89)



Now that I've split the class 0 and class 1, I'm going to create 5 balanced datasets for each of the random forests

In [10]:
def balance_dataset(start_index, class_0_examples, class_1_examples):
    end = start_index + 20240
    
    training_0_1 = class_0_examples[start_index:end, :]

    # combine and shuffle, with the training labels!
    new_x_train = np.concatenate((training_0_1, class_1_examples))
    new_y_train = np.concatenate((np.zeros(20240), np.ones(20240)))
    
    print(f"x_train shape: {new_x_train.shape} and y_train shape: {new_y_train.shape}")

    new_x_train, new_y_train = shuffle(new_x_train, new_y_train, random_state=0)
    
    return new_x_train, new_y_train

In [11]:
np.arange(0, 111917, 20240) #these are the starting indicies 

array([     0,  20240,  40480,  60720,  80960, 101200])

In [12]:
X_train_1, Y_train_1 = balance_dataset(0, training_0, training_1)
X_train_2, Y_train_2 = balance_dataset(20240, training_0, training_1)
X_train_3, Y_train_3 = balance_dataset(40480, training_0, training_1)
X_train_4, Y_train_4 = balance_dataset(60720, training_0, training_1)
X_train_5, Y_train_5 = balance_dataset(80960, training_0, training_1)

x_train shape: (40480, 89) and y_train shape: (40480,)
x_train shape: (40480, 89) and y_train shape: (40480,)
x_train shape: (40480, 89) and y_train shape: (40480,)
x_train shape: (40480, 89) and y_train shape: (40480,)
x_train shape: (40480, 89) and y_train shape: (40480,)


I also want to try adding a 6th model that will be inbalanced in favor of the minority class

In [13]:
# # split all into train and validation
# X_train_1, X_val_1, Y_train_1, Y_val_1 = train_test_split(X_train_1, Y_train_1, train_size=0.33, random_state=42)
# X_train_2, X_val_2, Y_train_2, Y_val_2 = train_test_split(X_train_2, Y_train_2, train_size=0.33, random_state=42)
# X_train_3, X_val_3, Y_train_3, Y_val_3 = train_test_split(X_train_3, Y_train_3, train_size=0.33, random_state=42)
# X_train_4, X_val_4, Y_train_4, Y_val_4 = train_test_split(X_train_4, Y_train_4, train_size=0.33, random_state=42)
# X_train_5, X_val_5, Y_train_5, Y_val_5 = train_test_split(X_train_5, Y_train_5, train_size=0.33, random_state=42)

#### Random Forest #1

In [14]:
rfc = RandomForestClassifier(class_weight='balanced', max_depth=20, min_samples_leaf=25, n_estimators=400)
rfc.fit(X_train_1, Y_train_1)
Y_train_pred_prob_1 = rfc.predict_proba(X_val)[:,1]
AUC_score(Y_val, Y_train_pred_prob_1) 

0.685744850037292

#### Random Forest #2

In [15]:
rfc = RandomForestClassifier(class_weight='balanced', max_depth=20, min_samples_leaf=25, n_estimators=400)
rfc.fit(X_train_2, Y_train_2)
Y_train_pred_prob_2 = rfc.predict_proba(X_val)[:,1]
AUC_score(Y_val, Y_train_pred_prob_2) 

0.6859965747224266

#### Random Forest #3

In [16]:
rfc = RandomForestClassifier(max_depth=20, min_samples_leaf=25, n_estimators=400)
rfc.fit(X_train_3, Y_train_3)
Y_train_pred_prob_3 = rfc.predict_proba(X_val)[:,1]
AUC_score(Y_val, Y_train_pred_prob_3) 

0.6857901419785312

#### Random Forest #4

In [17]:
rfc = RandomForestClassifier(class_weight={0:5, 1:1}, max_depth=20, min_samples_leaf=25, n_estimators=400)
rfc.fit(X_train_4, Y_train_4)
Y_train_pred_prob_4 = rfc.predict_proba(X_val)[:,1]
AUC_score(Y_val, Y_train_pred_prob_4) 

0.6862713997515879

#### Random Forest #5

In [18]:
rfc = RandomForestClassifier(class_weight={0:1, 1:5}, max_depth=20, min_samples_leaf=25, n_estimators=400)
rfc.fit(X_train_5, Y_train_5)
Y_train_pred_prob_5 = rfc.predict_proba(X_val)[:,1]
AUC_score(Y_val, Y_train_pred_prob_5) 

0.6866429154418214

### Now, combine the outputs of all the models

In [19]:
# try averaging probability
all_pred_prob = np.vstack((Y_train_pred_prob_1, 
                           Y_train_pred_prob_2,
                           Y_train_pred_prob_3, 
                           Y_train_pred_prob_4,
                           Y_train_pred_prob_5))

avg_pred = np.mean(all_pred_prob, axis=0)

# avg_pred[avg_pred > 0.5] = 1
# avg_pred[avg_pred <= 0.5] = 0

AUC_score(Y_val, avg_pred)

0.6871708010139346

In [21]:
# try feeding in the probabilities as features to a logistic regression model
X_blend = np.hstack((Y_train_pred_prob_1.reshape(-1,1), 
                     Y_train_pred_prob_2.reshape(-1,1),
                     Y_train_pred_prob_3.reshape(-1,1), 
                     Y_train_pred_prob_4.reshape(-1,1),
                     Y_train_pred_prob_5.reshape(-1,1)))

clf = LogisticRegression(random_state=0, max_iter=1000)
clf.fit(X_blend, Y_val)
Y_val_pred_prob = clf.predict_proba(X_blend)[:,1]
roc_auc_score(Y_val, Y_val_pred_prob)

0.6874262533785025

## Now, try a sequential ensemble model

In [9]:
rfc0 = RandomForestClassifier(class_weight='balanced', max_depth=20, min_samples_leaf=25, n_estimators=400)
rfc0.fit(X_train, Y_train)
Y_train_pred_prob = rfc0.predict_proba(X_train)[:,1]

In [10]:
rfc = RandomForestClassifier(class_weight='balanced', max_depth=20, min_samples_leaf=25, n_estimators=400)
rfc.fit(Y_train_pred_prob.reshape(-1, 1), Y_train)

ValueError: X has 89 features, but RandomForestClassifier is expecting 1 features as input.

In [12]:
Y_train_pred_prob = rfc.predict_proba(rfc0.predict_proba(X_train)[:,1].reshape(-1, 1))[:,1]

In [13]:
rfc = RandomForestClassifier(class_weight='balanced', max_depth=20, min_samples_leaf=25, n_estimators=400)
rfc.fit(Y_train_pred_prob.reshape(-1, 1), Y_train)
Y_train_pred_prob = rfc.predict_proba(rfc0.predict_proba(X_train)[:,1].reshape(-1, 1))[:,1]

In [14]:
rfc = RandomForestClassifier(class_weight='balanced', max_depth=20, min_samples_leaf=25, n_estimators=400)
rfc.fit(Y_train_pred_prob.reshape(-1, 1), Y_train)
Y_train_pred_prob = rfc.predict_proba(rfc0.predict_proba(X_train)[:,1].reshape(-1, 1))[:,1]

In [15]:
rfc = RandomForestClassifier(class_weight='balanced', max_depth=20, min_samples_leaf=25, n_estimators=400)
rfc.fit(Y_train_pred_prob.reshape(-1, 1), Y_train)

RandomForestClassifier(class_weight='balanced', max_depth=20,
                       min_samples_leaf=25, n_estimators=400)

In [16]:
Y_val_pred_prob = rfc0.predict_proba(X_val)[:,1]
Y_val_pred_prob_ = rfc.predict_proba(X_val)[:,1]

ValueError: X has 89 features, but RandomForestClassifier is expecting 1 features as input.

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

conf = confusion_matrix(y_val, prediction, normalize='true')
disp = ConfusionMatrixDisplay(conf)
disp.plot()