# Ensemble Voting Model

Combine probabilities from the following models with the same undersampled dataset: 
1. 1 neural network 
2. 1 logistic regression 
3. 1 non-linear SVM

Take the average probability of each class, then predict the class to be the one with the highest probability. 

In [22]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from torchvision import datasets, transforms

from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from sklearn.metrics import roc_auc_score,confusion_matrix
import copy

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import regularizers

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import make_pipeline

import matplotlib.pyplot as plt
import bokeh
import bokeh.plotting
from bokeh.layouts import column, row
from bokeh.models import HoverTool,WheelZoomTool, PanTool, ResetTool
bokeh.io.output_notebook()

## Load, normalize, and split all the data into a training and validation set 

In [26]:
def data_processed():
    '''
    Used to read & normalize processed test and train data
    '''
    data_train = pd.read_csv("data/2022-02-07_LOANS_TRAIN.csv")
    data_test = pd.read_csv("data/2022-02-07_LOANS_TEST.csv")
    y_train = data_train['loan_status']
    data_train.drop('loan_status', 1, inplace=True)
    data_train.drop('id', 1, inplace=True)
    data_test.drop('id', 1, inplace=True)
    
    scaler = StandardScaler()
    scaler.fit(data_train)
    normal_data_train = scaler.transform(data_train)
    normal_data_test = scaler.transform(data_test)
    
    return normal_data_train, normal_data_test, y_train

def to_submission(y_test_predicted_probability):
    y_test = pd.DataFrame(y_test_predicted_probability, columns=['loan_status'])
    y_test.to_csv('data/submission.csv', index_label='id')
    return

In [27]:
X_train, X_test, y_train = data_processed()
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.33, random_state=42, shuffle=True)

print(f"""Before random undersampling - 
X_train shape: {X_train.shape}
y_train shape: {y_train.shape}
X_val shape: {X_val.shape}
y_val shape: {y_val.shape}
""")

fraction = 0.5
rus = RandomUnderSampler(random_state=0, sampling_strategy=fraction)
X_train, y_train = rus.fit_resample(X_train, y_train)

print(f"""After random undersampling - 
X_train shape: {X_train.shape}
y_train shape: {y_train.shape}
X_val shape: {X_val.shape}
y_val shape: {y_val.shape}
""")

# change to one hot encoding
y_train_oh, y_val_oh = to_categorical(y_train), to_categorical(y_val)

  data_train.drop('loan_status', 1, inplace=True)
  data_train.drop('id', 1, inplace=True)
  data_test.drop('id', 1, inplace=True)
Feature names must be in the same order as they were in fit.



Before random undersampling - 
X_train shape: (132157, 92)
y_train shape: (132157,)
X_val shape: (65093, 92)
y_val shape: (65093,)

After random undersampling - 
X_train shape: (60720, 92)
y_train shape: (60720,)
X_val shape: (65093, 92)
y_val shape: (65093,)



In [28]:
def over_under_sampling(X_train,Y_train,method,fraction):
    if method=="randomU":
        rus = RandomUnderSampler(random_state=0,sampling_strategy=fraction)
    elif method=="randomO":
        rus = RandomOverSampler(random_state=0,sampling_strategy=fraction)
    elif method=="centroid":
        rus = ClusterCentroids(random_state=42,sampling_strategy=fraction)
    elif method=="SMOTENN":
        rus = SMOTEENN(random_state=42,sampling_strategy=fraction)
    X_resampled, y_resampled = rus.fit_resample(X_train, Y_train)
    return X_resampled, y_resampled

## Model 1: Neural network 
I'll use experiment 1a from notebook 2

In [29]:
network = models.Sequential()
network.add(layers.Dense(200, activation="leaky_relu", input_shape=(X_train.shape[1],)))
network.add(layers.Dropout(0.3))
network.add(layers.Dense(50, activation="leaky_relu"))
network.add(layers.Dense(2, activation="softmax"))
network.compile(loss="categorical_crossentropy",
                optimizer="adam",
                metrics=["accuracy"])

history = network.fit(X_train, y_train_oh, batch_size=16, epochs=10, validation_split=0.1, shuffle=True, verbose=False)

test_loss, test_acc = network.evaluate(X_val, y_val_oh)
print('the final accuracy on the holdout set was ', np.round(test_acc * 100, 2), '%')

y_pred_nn = network.predict(X_val)
print("ROC AUC score: ", roc_auc_score(y_val_oh, y_pred_nn))

the final accuracy on the holdout set was  83.34 %
ROC AUC score:  0.6835855108470055


## Model 2: Logistic Regression (undersampled)

In [30]:
logistic = LogisticRegression(max_iter=400, class_weight={0:1, 1:5}) 

# Fit the logistic regression model to the data
logistic.fit(X_train, y_train)   

logistic.score(X_val, y_val)

# convert to one-hot encoding
y_val_oh = to_categorical(y_val)
y_pred_lr = logistic.predict_proba(X_val)
print("ROC AUC score: ", roc_auc_score(y_val_oh, y_pred_lr))

ROC AUC score:  0.6833906214595225


## Model 3: Non-Linear SVM

In [31]:
scaler = StandardScaler()
transform = Nystroem(gamma=0.0001, kernel="rbf")
clf_sgd = SGDClassifier(alpha=1, shuffle=True)
pipeline = make_pipeline(scaler, transform, clf_sgd)
pipeline.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('nystroem', Nystroem(gamma=0.0001)),
                ('sgdclassifier', SGDClassifier(alpha=1))])

In [32]:
y_val_pred = pipeline.predict(X_val)

In [33]:
y_val_pred_prob = pipeline.decision_function(X_val)
y_val_pred_prob -= pipeline.decision_function(X_val).max()
y_val_pred_prob = np.exp(y_val_pred_prob)/sum(np.exp(y_val_pred_prob))


In [34]:
y_val_pred_prob -= pipeline.decision_function(X_val).max()
y_val_pred_prob = np.exp(y_val_pred_prob)/sum(np.exp(y_val_pred_prob))
roc_auc_score(y_val, y_val_pred_prob)

0.6787159969478902

## Model 4: AdaBoost

In [35]:
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
Y_train_pred_ada = clf.predict(X_train)
Y_val_pred_ada = clf.predict(X_val)
Y_val_pred_prob_ada = clf.predict_proba(X_val)
roc_auc_score(y_val, Y_val_pred_prob_ada[:,1])

0.6849315189119954

## Model 5: Random Forest

In [36]:
clf = RandomForestClassifier(n_estimators=100, random_state=0, criterion = 'gini', min_samples_leaf=30, max_depth=20)
clf.fit(X_train, y_train, sample_weight=copy.deepcopy(y_train)*4 + 1)
Y_train_pred_rf=clf.predict(X_train)
Y_val_pred_rf=clf.predict(X_val)
Y_val_pred_prob_rf=clf.predict_proba(X_val)
roc_auc_score(y_val,Y_val_pred_prob_rf[:, 1])

0.6866096498559267

## Model 6: Logistic Regression (oversampled)

In [37]:
X_train_over,Y_train_over=over_under_sampling(X_train,y_train,method="randomO",fraction=1)
clf = LogisticRegression(random_state=0,max_iter=400).fit(X_train_over, Y_train_over)
Y_train_pred_log=clf.predict(X_train)
Y_val_pred_log=clf.predict(X_val)
Y_val_pred_prob_log=clf.predict_proba(X_val)
roc_auc_score(y_val,Y_val_pred_prob_log[:, 1])

0.6839934113965749

## Combine all 6 models

I'm going to ignore the svm for now

In [38]:
y_pred_nn = y_pred_nn[:, 1]
y_pred_lr = y_pred_lr[:, 1]

In [39]:
Y_average = np.mean(np.array([Y_val_pred_prob_log[:, 1],Y_val_pred_prob_ada[:, 1], Y_val_pred_prob_rf[:, 1], y_pred_lr, y_pred_nn]), axis=0)
Y_average[Y_average>=0.5] = 1
Y_average[Y_average<0.5] = 0
print(roc_auc_score(y_val,Y_average))

0.6332760072215273


In [40]:
confusion_matrix(y_val,Y_average)

array([[28980, 26129],
       [ 2589,  7395]])

Try combining w a logistic regression

## Output submission file

In [37]:
def to_submission(y_test_predicted_probability):
    y_test=pd.DataFrame(y_test_predicted_probability,columns=['loan_status'])
    y_test.index = np.arange(200000, 200000+len(y_test))
    y_test.to_csv('data/submission.csv', index_label='id')
    return