# Import Dependencies

In [1]:
import pandas as pd
import numpy as np
import psycopg2 as pg
import pickle

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white', palette = 'Paired')
#plt.style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_formats = ['svg']
np.set_printoptions(suppress=True) # Suppress scientific notation where possible
from ipywidgets import interactive, FloatSlider

from sklearn.inspection import permutation_importance
from sklearn import linear_model, svm, naive_bayes, neighbors, ensemble
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, plot_confusion_matrix, confusion_matrix,\
roc_auc_score, roc_curve, precision_recall_curve, f1_score, fbeta_score, recall_score,\
precision_recall_fscore_support

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, 
                              AdaBoostClassifier, BaggingRegressor)
from sklearn.metrics import accuracy_score, make_scorer, log_loss

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from mlxtend.plotting import plot_decision_regions

import xgboost as xgb

# Import Data

In [3]:
df = pd.read_csv('/Users/dominguez/Documents/project3/Data/df.csv')

## Important Features

The following features will be used for the deployed model

1. September History
2. August History
3. July History
4. June History

## Revised Model

In [5]:
# Establish features and target variables

X_rev = df[['Sep_Hist','Aug_Hist','Jul_Hist', 'Jun_Hist']]

y_rev = df['Default']

# hold out 20% of the data for final testing
X_rev, X_rev_test, y_rev, y_rev_test = train_test_split(X_rev, y_rev, test_size=.2, random_state=42)

# Split train and validate sets
X_rev_train, X_rev_val, y_rev_train, y_rev_val = train_test_split(X_rev, y_rev, test_size=.25, random_state=42)

#this helps with the way kf will generate indices below
X_rev, y_rev = np.array(X_rev), np.array(y_rev)

# Scale features

scaler = StandardScaler()
X_rev_train_scaled = scaler.fit_transform(X_rev_train)
X_rev_val_scaled = scaler.transform(X_rev_val)

# Pickle the scaler to be used in the web app

with open('/Users/dominguez/Documents/project3/web_app/scaler.pickle', 'wb') as to_write:
    pickle.dump(scaler, to_write)
    
# Oversampling Cross Validation

kf = KFold(n_splits=10, shuffle=True, random_state = 42)
rf_rev_scores = []
rus = RandomUnderSampler(random_state=42)

for train_ind, val_ind in kf.split(X_rev,y_rev):

    X_train, y_train = X_rev[train_ind], y_rev[train_ind]
    X_val, y_val = X_rev[val_ind], y_rev[val_ind]
    
    # Undersample the data
    X_train_under, y_train_under = rus.fit_sample(X_train, y_train)
    
    # Scale the features
    scaler = StandardScaler()
    X_train_under_scaled = scaler.fit_transform(X_train_under)
    X_val_scaled = scaler.transform(X_val)
    
    # Random Forest
    rf_rev = RandomForestClassifier(n_estimators=150, max_depth=10, min_samples_split=5,
                                      min_samples_leaf=2, bootstrap=True, max_features='sqrt')
    rf_rev.fit(X_train_under_scaled, y_train_under)
    rf_rev_scores.append(rf_rev.score(X_val_scaled, y_val))
        
rf_val_rev_score = round(np.mean(rf_rev_scores),4)

print(f'Random forest oversampling val score: {rf_val_rev_score}')

# Pickle trained model to use in web app

with open('/Users/dominguez/Documents/project3/web_app/rf_rev.pickle', 'wb') as to_write:
    pickle.dump(rf_rev, to_write)
    
# Test

X_rev_test_scaled = scaler.transform(X_rev_test)

print(f'Random forest oversampling test score: {rf_rev.score(X_rev_test_scaled,y_rev_test)}')

Random forest oversampling val score: 0.7701
Random forest oversampling test score: 0.7725
