In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.metrics import auc, roc_curve
import time
from sklearn.ensemble import RandomForestClassifier

  from pandas.core import datetools


### The task: To Model a Dataset Using RFC and MLP Classifiers

We will model the credit card dataset, which has been used before, to compare the Random Forect Classifier and Multi-Level Perceptron (MLP) models. 

In [2]:
card = pd.read_csv('C://Users//fergu//creditcard.csv')
card.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
card.shape

(284807, 31)

In [4]:
# Highlight class imbalance
card['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [5]:
# Upsample Class 1 records:

card1 = card[card['Class'] == 1]

In [6]:
#Put in some Class 0 records:
    
card0 = card[card['Class'] == 0].sample(n=500)
card0.shape

(500, 31)

#### Final Dataset
This upsampled, balanced dataset will be used in the analysis from here on.

In [7]:
card_final = pd.concat([card0, card1], axis=0) 

In [8]:
card_final.shape

(992, 31)

In [9]:
X = card_final.drop('Class', axis=1)
Y = card_final['Class']

### Model Using Random Forest Classifier
The results are good, with a slight tendency to overfit.

In [10]:
rfc = RandomForestClassifier()
rfc.fit(X, Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [14]:
from sklearn.model_selection import cross_val_score
cross_val_score(rfc, X, Y, cv=5)

array([ 0.96482412,  0.91457286,  0.91414141,  0.93939394,  0.89393939])

Next run the model using 500 estimators instead of the default 10. This does not result in much further improvement:

In [25]:
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=500)
rfc.fit(X, Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
cross_val_score(rfc, X, Y, cv=5)

array([ 0.95979899,  0.90954774,  0.92424242,  0.95959596,  0.90909091])

### Model Using Multi-Level Perceptron Classifier
Knowing that more levels tends to reduce overfitting, we try 4 layers first. The results are significantly poorer than the Random Forest Classifier. The best that seems to be possible is an accuracy of around 50%.

In [15]:
from sklearn.neural_network import MLPClassifier

In [20]:
start_time = time.time()
#fraction = 0.1
#Xsamp = X.sample(frac=fraction)
#Ysamp = Y.sample(frac=fraction)

mlp = MLPClassifier(hidden_layer_sizes=(250,250,250,250))
mlp.fit(X, Y)
print(cross_val_score(mlp, X, Y, cv=5))

time_interval = '%.1f' % ((time.time() - start_time) / 60)
print('Time taken: {} minutes'.format(time_interval))

[ 0.48743719  0.50251256  0.50505051  0.50505051  0.49494949]
Time taken: 0.1 minutes


Increasing the layer size, we see that there is no improvement in accuracy and a significant increase in computing time:

In [18]:
start_time = time.time()

mlp = MLPClassifier(hidden_layer_sizes=(5000,5000,5000,5000))
mlp.fit(X, Y)
print(cross_val_score(mlp, X, Y, cv=5))

time_interval = '%.1f' % ((time.time() - start_time) / 60)
print('Time taken: {} minutes'.format(time_interval))

[ 0.49748744  0.49748744  0.49494949  0.50505051  0.50505051]
Time taken: 34.6 minutes


Lastly we try using a different activation function, the logistic function instead of relu. This showed good results in a previous exercise, with respect to overfitting. 

In [21]:
start_time = time.time()

mlp = MLPClassifier(hidden_layer_sizes=(250,250,250,250), activation='logistic')
mlp.fit(X, Y)
print(cross_val_score(mlp, X, Y, cv=5))

time_interval = '%.1f' % ((time.time() - start_time) / 60)
print('Time taken: {} minutes'.format(time_interval))

[ 0.50251256  0.50251256  0.50505051  0.49494949  0.49494949]
Time taken: 0.1 minutes


So we now try increasing the layer size. This increases consistency but does not help overall accuracy:

In [22]:
start_time = time.time()

mlp = MLPClassifier(hidden_layer_sizes=(500,500,500,500), activation='logistic')
mlp.fit(X, Y)
print(cross_val_score(mlp, X, Y, cv=5))

time_interval = '%.1f' % ((time.time() - start_time) / 60)
print('Time taken: {} minutes'.format(time_interval))

[ 0.50251256  0.50251256  0.50505051  0.50505051  0.50505051]
Time taken: 0.5 minutes
