In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import numpy.random as nr
import math
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import feature_selection as fs
from sklearn import linear_model
import sklearn.metrics as sklm
from datetime import date

%matplotlib inline

def calc_age(birthday):
    """Return the age in completed years as of 1 January 1998.

    Parameters
    ----------
    birthday : str
        Birth date written as hyphen-separated integers in
        year-month-day order, e.g. '1970-05-12'.

    Returns
    -------
    int
        Number of full years between the birth date and the fixed
        reference date.
    """
    # The reference date is fixed, so the result does not depend on when
    # the notebook is run.
    reference = date(1998, 1, 1)
    parts = [int(piece) for piece in birthday.split('-')]
    born = date(parts[0], parts[1], parts[2])

    age = reference.year - born.year
    # Take one year off when the birthday falls later in the calendar year
    # than the reference date (it has not been reached yet).
    if (reference.month, reference.day) < (born.month, born.day):
        age -= 1
    return age

# Read the customer attributes and the purchase labels, keeping one row per
# CustomerID in each table.
import_features = pd.read_csv('AdvWorksCusts.csv').drop_duplicates(subset='CustomerID')
import_labels = pd.read_csv('AW_BikeBuyer.csv').drop_duplicates(subset='CustomerID')

# Left join: every customer row is kept and the label columns are attached
# by CustomerID.
temp = pd.merge(import_features, import_labels, how='left', on='CustomerID')
# Derive Age from the BirthDate strings.
temp['Age'] = temp['BirthDate'].map(calc_age)

# Column groups used to assemble the model matrix.
cat_cols = ['Education', 'Occupation', 'Gender', 'MaritalStatus']  # one-hot encoded later
num_cols = ['Age', 'YearlyIncome', 'TotalChildren', 'NumberChildrenAtHome', 'NumberCarsOwned']
cat_already_encoded = ['HomeOwnerFlag']  # used as-is, without one-hot encoding

def encode_string(cat_features):
    """One-hot encode the given categorical columns.

    A fresh OneHotEncoder (categories inferred from the data) is fitted on
    ``cat_features`` and applied to the same data.

    Parameters
    ----------
    cat_features : array-like
        The categorical columns to encode; passed straight to the encoder.

    Returns
    -------
    numpy.ndarray
        The encoder output converted with ``.toarray()``.
    """
    encoder = preprocessing.OneHotEncoder(categories = 'auto')
    indicator_matrix = encoder.fit_transform(cat_features)
    return indicator_matrix.toarray()

# Target vector taken from the merged frame.
Labels = np.array(temp['BikeBuyer'])

# One-hot encode the string-valued categorical columns, then append the
# column(s) that are used as-is without further encoding.
Features = encode_string(temp[cat_cols])
Features = np.concatenate([Features, np.array(temp[cat_already_encoded])], axis=1)
# Number of leading indicator columns; every column after this offset is
# numeric. Computed here so the scaling step below does not rely on a
# hard-coded column count.
n_indicator_cols = Features.shape[1]
# concatenate numeric features
Features = np.concatenate([Features, np.array(temp[num_cols])], axis=1)

# create training and testing samples
# (row positions are split, then used to slice Features and Labels alike)
nr.seed(9988)
indx = range(Features.shape[0])
indx = ms.train_test_split(indx, test_size = .3)
X_train = Features[indx[0],:]
y_train = np.ravel(Labels[indx[0]])
X_test = Features[indx[1],:]
y_test = np.ravel(Labels[indx[1]])

# scale numerics
# The scaler is fitted on the training rows only and then applied to both
# subsets. X_train / X_test were produced by integer-array indexing, so these
# in-place assignments leave Features itself unscaled.
scaler = preprocessing.StandardScaler().fit(X_train[:,n_indicator_cols:])
X_train[:,n_indicator_cols:] = scaler.transform(X_train[:,n_indicator_cols:])
X_test[:,n_indicator_cols:] = scaler.transform(X_test[:,n_indicator_cols:])
print(Features.shape)

(16404, 20)


In [7]:
# Seed NumPy's global generator before building the network; no random_state
# is passed to the classifier.
nr.seed(1115)
mlp_settings = {'hidden_layer_sizes': (50,), 'beta_1': .9, 'beta_2': .9}
nn_mod = MLPClassifier(**mlp_settings)
# fit() is kept as the final expression so the cell echoes the fitted estimator.
nn_mod.fit(X_train, y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.9, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(50,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [8]:
# Class predictions of the fitted network for the held-out test rows.
scores = nn_mod.predict(X_test)
def print_metrics(labels, scores):
    """Print a confusion matrix, accuracy and per-class precision/recall/F1.

    Parameters
    ----------
    labels : array-like
        True class labels.
    scores : array-like
        Predicted class labels (not probabilities).

    Notes
    -----
    The rows/columns printed as "positive" come from index 0 of the
    scikit-learn results and those printed as "negative" from index 1.
    NOTE(review): scikit-learn orders classes by sorted label value, so with
    0/1 labels the "positive" figures describe class 0 - confirm that this
    is the intended reading.
    """
    precision, recall, f1, support = sklm.precision_recall_fscore_support(labels, scores)
    conf = sklm.confusion_matrix(labels, scores)

    print('                 Confusion matrix')
    print('                 Score positive    Score negative')
    confusion_rows = (
        ('Actual positive    %6d' '             %5d', conf[0]),
        ('Actual negative    %6d' '             %5d', conf[1]),
    )
    for template, counts in confusion_rows:
        print(template % (counts[0], counts[1]))
    print('')
    print('Accuracy  %0.2f' % sklm.accuracy_score(labels, scores))
    print(' ')
    print('           Positive      Negative')
    # One printed row per metric; only the first two classes are reported.
    metric_rows = (
        ('Num case   %6d' '        %6d', support),
        ('Precision  %6.2f' '        %6.2f', precision),
        ('Recall     %6.2f' '        %6.2f', recall),
        ('F1         %6.2f' '        %6.2f', f1),
    )
    for template, per_class in metric_rows:
        print(template % (per_class[0], per_class[1]))


    
# Report the baseline network's performance on the held-out test split.
print_metrics(y_test, scores) 

                 Confusion matrix
                 Score positive    Score negative
Actual positive      2910               359
Actual negative       645              1008

Accuracy  0.80
 
           Positive      Negative
Num case     3269          1653
Precision    0.82          0.74
Recall       0.89          0.61
F1           0.85          0.67


# Let's try oversampling the minority class and using cross-validation

In [4]:
# Oversample by appending one extra copy of every row labelled 1 to the
# complete data set.
# NOTE(review): Features is the unscaled matrix (only the X_train / X_test
# copies were standardised), and duplicating rows before cross-validation
# means copies of the same row can land in both the training and validation
# folds - confirm both are acceptable for the CV estimates below.
minority_mask = Labels == 1
temp_Features = np.concatenate((Features, Features[minority_mask, :]), axis = 0)
temp_Labels = np.concatenate((Labels, Labels[minority_mask]), axis = 0)

for oversampled in (temp_Features, temp_Labels):
    print(oversampled.shape)

(21855, 20)
(21855,)


In [5]:
# Fold generators for nested cross-validation: `inside` is used for the
# hyperparameter search, `outside` for the performance estimate.
# The seeds are passed as random_state instead of calling nr.seed() first.
# A shuffling KFold without random_state draws from NumPy's global generator
# only when split() is called, so seeding at construction time does not pin
# the folds (later reseeding overrides it).
inside = ms.KFold(n_splits=3, shuffle = True, random_state = 123)
outside = ms.KFold(n_splits=3, shuffle = True, random_state = 321)

In [6]:
## Search grid: only the two beta parameters are varied. The alpha and
## early_stopping candidates are kept below but disabled.
param_grid = {
    # "alpha": [0.0000001, 0.000001, 0.00001],
    # "early_stopping": [True, False],
    "beta_1": [0.95, 0.90, 0.80],
    "beta_2": [0.999, 0.9, 0.8],
}

## Base network whose parameters are searched
mlp_base = MLPClassifier(hidden_layer_sizes = (50,), max_iter=300)

## Wrap the network in a grid search scored on recall, using the inside folds
nr.seed(3456)
nn_clf = ms.GridSearchCV(
    estimator = mlp_base,
    param_grid = param_grid,
    cv = inside,
    scoring = 'recall',
    return_train_score = True,
)

## Run the search on the oversampled data and show the winning settings
nr.seed(6677)
nn_clf.fit(temp_Features, temp_Labels)
best_mlp = nn_clf.best_estimator_
for setting in ('alpha', 'early_stopping', 'beta_1', 'beta_2'):
    print(getattr(best_mlp, setting))



0.0001
False
0.9
0.9




In [None]:
# Outer loop of the nested cross-validation: the whole grid search is
# re-evaluated on each outside fold. No `scoring` argument is passed, so each
# fold is scored with the search object's own score method.
nr.seed(498)
cv_estimate = ms.cross_val_score(nn_clf, temp_Features, temp_Labels,
                                 cv = outside) # Use the outside folds

print('Mean performance metric = %4.3f' % np.mean(cv_estimate))
# Label corrected from 'SDT' to 'STD' (standard deviation across folds).
print('STD of the metric       = %4.3f' % np.std(cv_estimate))
print('Outcomes by cv fold')
for fold_number, fold_score in enumerate(cv_estimate, start=1):
    print('Fold %2d    %4.3f' % (fold_number, fold_score))