# Neural Networks Assignment: Norberto Rancharan


### For this assignment use data at: “https://www.kaggle.com/wendykan/lending-club-loan-data/download”


# Importing Necessary Libraries

In [1]:
# Import packages
import pandas as pd
import pickle
import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import ensemble
from sklearn.externals import joblib

#Plotly visualizations
from plotly import tools
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [2]:
pd.set_option('display.max_columns', 600)
pd.set_option('display.max_rows', 600)

# Pickle for Faster Reading In

In [3]:
model_data = pickle.load(open("model_Data.pickle", "rb" ))

In [4]:
model_data['Delinquent'] = model_data['loan_status']

In [5]:
model_data = model_data.drop(columns=['loan_status'],axis=1)


In [6]:
from sklearn.model_selection import StratifiedShuffleSplit

stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_set, test_set in stratified.split(model_data, model_data["Delinquent"]):
    stratified_train = model_data.loc[train_set]
    stratified_test = model_data.loc[test_set]
    
print('Train set ratio \n', stratified_train["Delinquent"].value_counts()/len(model_data))
print('Test set ratio \n', stratified_test["Delinquent"].value_counts()/len(model_data))

Train set ratio 
 0    0.694183
1    0.105817
Name: Delinquent, dtype: float64
Test set ratio 
 0    0.173546
1    0.026454
Name: Delinquent, dtype: float64


In [7]:
train_df = stratified_train
test_df = stratified_test


# Let's Shuffle the data
train_df = train_df.sample(frac=1).reset_index(drop=True)
test_df = test_df.sample(frac=1).reset_index(drop=True)


# Train set (Normal training dataset)
X_train = train_df.drop("Delinquent", axis=1)
y_train = train_df["Delinquent"]


# Test Dataset
X_test = test_df.drop("Delinquent", axis=1)
y_test = test_df["Delinquent"]

In [8]:
important_features = ['grade', 
                      'hardship_last_payment_amount',
                      'inq_last_12m',
                      'acc_open_past_24mths',
                      'open_il_24m',
                      'inq_last_6mths',
                      'hardship_payoff_balance_amount',
                      'loan_amount',
                      'all_util',
                      'avg_cur_bal',
                      'annual_income',
                      'dti']

### Creating Copies of Test and Train of X 

In [9]:
X_test_copy = X_test.copy()
X_train_copy = X_train.copy()
y_test_copy = y_test.copy()
y_train_copy = y_train.copy()

### Using the Important Features Identified from Grid Search in the Ensemble project

In [10]:
X_trainNN = X_train_copy[important_features]
X_testNN = X_test_copy[important_features]

### Converting the Train and Test to Array to use in NN Model

In [11]:
X_testNN = X_testNN.values
X_trainNN = X_trainNN.values

In [12]:
y_trainNN = y_train_copy.values
y_testNN = y_test_copy.values

In [13]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_trainNN = sc.fit_transform(X_trainNN)
X_testNN = sc.transform(X_testNN)

In [14]:
X_testNN

array([[ 0.52726338, -0.06854699,  1.26920797, ...,  0.1812274 ,
        -0.07300007, -0.02748638],
       [ 1.32236469, -0.06854699, -0.78789298, ...,  0.17945548,
         0.04019562, -0.02748582],
       [-1.06293924, -0.06854699, -0.78789294, ...,  0.17862231,
        -0.2160115 , -0.02748171],
       ...,
       [-0.26783793, -0.06854699, -0.78789294, ...,  0.17938528,
        -0.15789683, -0.0274842 ],
       [ 1.32236469, -0.06854699, -0.78789296, ...,  0.17876065,
        -0.00508266, -0.02748569],
       [-0.26783793, -0.06854699, -0.78789298, ...,  0.17976103,
        -0.01640223, -0.02748193]])

In [15]:
from tensorflow import keras
from tensorflow.python.keras import backend as k

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [17]:
# first step: create a Sequential object, as a sequence of layers. B/C NN is a sequence of layers.
classifier = Sequential()

In [18]:
# add the first hidden layer
classifier.add(Dense(units=5,kernel_initializer='glorot_uniform',
                    activation = 'relu'))

In [19]:
# add the second hidden layer
classifier.add(Dense(units=5,kernel_initializer='glorot_uniform',
                    activation = 'relu'))

In [20]:
# add the output layer
classifier.add(Dense(units=1,kernel_initializer='glorot_uniform',
                    activation = 'sigmoid'))

In [21]:
# compiling the NN
classifier.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [22]:
# train the model
classifier.fit(X_trainNN,y_trainNN,batch_size=10,epochs=20)

Train on 1808534 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x2198b1bc748>

In [23]:
# Predicting the Test set results
y_pred = classifier.predict(X_testNN)

In [24]:
y_pred

array([[0.11002514],
       [0.0269669 ],
       [0.15469787],
       ...,
       [0.12921911],
       [0.02737189],
       [0.12034363]], dtype=float32)

In [25]:
y_pred = (y_pred>0.5)

In [26]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_testNN, y_pred)

In [27]:
cm

array([[391420,    910],
       [ 58552,   1252]], dtype=int64)

In [28]:
# k-fold clustering
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score

Using TensorFlow backend.


In [34]:
def build_classifier():
    # first step: create a Sequential object, as a sequence of layers. B/C NN is a sequence of layers.
    classifier = Sequential()
    # add the first hidden layer
    classifier.add(Dense(units=5,kernel_initializer='glorot_uniform',
                    activation = 'relu'))
    # add the second hidden layer
    classifier.add(Dense(units=5,kernel_initializer='glorot_uniform',
                    activation = 'relu'))
    # add the output layer
    classifier.add(Dense(units=1,kernel_initializer='glorot_uniform',
                    activation = 'sigmoid'))
    # compiling the NN
    classifier.compile(optimizer='adam',loss='binary_crossentropy',metrics=['acc'])
    return classifier

In [35]:
classifier = KerasClassifier(build_fn=build_classifier, batch_size = 6, nb_epoch = 3)
accuracies = cross_val_score(estimator=classifier, X=X_trainNN, y = y_trainNN, cv=10, n_jobs=1)

Train on 1627680 samples


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Train on 1627680 samples


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Train on 1627680 samples


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Train on 1627680 samples


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Train on 1627681 samples


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Train on 1627681 samples


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Train on 1627681 samples


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Train on 1627681 samples


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Train on 1627681 samples


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Train on 1627681 samples


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [36]:
accuracies.mean()

0.8681617379188538

In [37]:
accuracies.std()

0.0010314051042729282

In [50]:
# Drop Out Regularization
# You can detect overfitting with the difference between test and train error or the high variance in cross-validation.

from tensorflow.keras.layers import Dropout

# first step: create a Sequential object, as a sequence of layers. B/C NN is a sequence of layers.
classifier = Sequential()

# add the first hidden layer
classifier.add(Dense(units=5,kernel_initializer='glorot_uniform',
                    activation = 'relu'))
classifier.add(Dropout(0.2))  #often start with 0.1, not solved go up

# add the second hidden layer
classifier.add(Dense(units=5,kernel_initializer='glorot_uniform',
                    activation = 'relu'))
classifier.add(Dropout(0.2))  #often start with 0.1, not solved go up

# add the output layer
classifier.add(Dense(units=1,kernel_initializer='glorot_uniform',
                    activation = 'sigmoid'))
# compiling the NN
classifier.compile(optimizer='adam',loss='binary_crossentropy',metrics=['acc'])

In [51]:
# fine tuning with Grid Search
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

def build_classifier(optimizer):
    # first step: create a Sequential object, as a sequence of layers. B/C NN is a sequence of layers.
    classifier = Sequential()
    # add the first hidden layer
    classifier.add(Dense(units=5,kernel_initializer='glorot_uniform',
                    activation = 'relu'))
    # add the second hidden layer
    classifier.add(Dense(units=5,kernel_initializer='glorot_uniform',
                    activation = 'relu'))
    # add the output layer
    classifier.add(Dense(units=1,kernel_initializer='glorot_uniform',
                    activation = 'sigmoid'))
    # compiling the NN
    classifier.compile(optimizer=optimizer,loss='binary_crossentropy',metrics=['acc'])
    return classifier

In [53]:
classifier = KerasClassifier(build_fn=build_classifier)

# create a dictionary of hyper-parameters to optimize
parameters = {'batch_size':[25,32], 'nb_epoch':[1,2],'optimizer':['adam','rmsprop']}
grid_search = GridSearchCV(estimator = classifier, param_grid = parameters, scoring = 'accuracy', cv=10)
grid_search = grid_search.fit(X_trainNN,y_trainNN)

best_parameters = grid_search.best_params_ 
best_accuracy = grid_search.best_score_

Train on 1627680 samples
Train on 1627680 samples
Train on 1627680 samples
Train on 1627680 samples
Train on 1627681 samples
Train on 1627681 samples
Train on 1627681 samples
Train on 1627681 samples
Train on 1627681 samples
Train on 1627681 samples
Train on 1627680 samples
Train on 1627680 samples
Train on 1627680 samples
Train on 1627680 samples
Train on 1627681 samples
Train on 1627681 samples
Train on 1627681 samples
Train on 1627681 samples
Train on 1627681 samples
Train on 1627681 samples
Train on 1627680 samples
Train on 1627680 samples
Train on 1627680 samples
Train on 1627680 samples
Train on 1627681 samples
Train on 1627681 samples
Train on 1627681 samples
Train on 1627681 samples
Train on 1627681 samples
Train on 1627681 samples
Train on 1627680 samples
Train on 1627680 samples
Train on 1627680 samples
Train on 1627680 samples
Train on 1627681 samples
Train on 1627681 samples
Train on 1627681 samples
Train on 1627681 samples
Train on 1627681 samples
Train on 1627681 samples
