In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn import preprocessing
%matplotlib inline

In [2]:
warnings.filterwarnings("ignore")

# Preprocessing data

In [8]:
training_data = pd.read_csv('training_data')

In [9]:
data = training_data.copy()

In [10]:
y = data.y_labels

In [11]:
X = data.drop(['y_labels'], axis = 1)

In [12]:
scaler = preprocessing.StandardScaler()
scaler.fit(X) 
X = pd.DataFrame(scaler.transform(X),columns =X.columns)
X.describe()

Unnamed: 0,Annual_revenue,D&b_annual_revenue,Fortune_100,Fortune_500,Ultimate_parent_zoominfo_revenue,Zoominfo_employee_count,Zoominfo_finance_department_budget,Zoominfo_it_department_budget,Zoominfo_past_1_yr_employee_growth_rate,Zoominfo_past_2yr_employee_growth_rate,...,Sub_industry_Investment,Sub_industry_Materials,Sub_industry_Medical Devices,Sub_industry_Non-Profit,Sub_industry_Other Business Services,Sub_industry_Pharmaceuticals,Sub_industry_Providers,Sub_industry_Real Estate,Sub_industry_Software,Sub_industry_Utilities
count,35332.0,35332.0,35332.0,35332.0,35332.0,35332.0,35332.0,35332.0,35332.0,35332.0,...,35332.0,35332.0,35332.0,35332.0,35332.0,35332.0,35332.0,35332.0,35332.0,35332.0
mean,3.710311e-17,6.650356000000001e-17,-1.568718e-15,1.319218e-15,-1.59323e-16,-1.8387720000000002e-17,-3.9191040000000006e-17,8.392503000000001e-17,-1.524951e-15,-1.253449e-15,...,-4.492075e-15,-2.560365e-15,5.854912e-15,-6.262711e-16,-1.0196e-16,-8.194949e-15,1.06396e-14,6.026376e-15,-1.79316e-15,-2.199476e-15
std,1.000014,1.000014,1.000014,1.000014,1.000014,1.000014,1.000014,1.000014,1.000014,1.000014,...,1.000014,1.000014,1.000014,1.000014,1.000014,1.000014,1.000014,1.000014,1.000014,1.000014
min,-0.005724427,-0.2204103,-0.03531123,-0.08921533,-0.2039659,-0.1553461,-0.1796478,-0.1858074,-3.767791,-1.540401,...,-0.2152116,-0.1287349,-0.09300525,-0.1200536,-0.1912721,-0.1383948,-0.240969,-0.1479677,-0.2253127,-0.123642
25%,-0.005711367,-0.1008149,-0.03531123,-0.08921533,-0.2022912,-0.1520364,-0.1771095,-0.1831259,-0.3619453,-0.4817265,...,-0.2152116,-0.1287349,-0.09300525,-0.1200536,-0.1912721,-0.1383948,-0.240969,-0.1479677,-0.2253127,-0.123642
50%,-0.005688599,-0.09481923,-0.03531123,-0.08921533,-0.1968818,-0.1434906,-0.1695083,-0.1753884,-0.2740752,-0.3120809,...,-0.2152116,-0.1287349,-0.09300525,-0.1200536,-0.1912721,-0.1383948,-0.240969,-0.1479677,-0.2253127,-0.123642
75%,-0.005620237,-0.07638596,-0.03531123,-0.08921533,-0.1657198,-0.1086508,-0.1355355,-0.139437,0.1020089,0.0986117,...,-0.2152116,-0.1287349,-0.09300525,-0.1200536,-0.1912721,-0.1383948,-0.240969,-0.1479677,-0.2253127,-0.123642
max,187.9636,142.3647,28.3196,11.20884,20.69891,40.12187,31.12794,30.90423,34.63848,13.732,...,4.646589,7.7679,10.75208,8.329614,5.228155,7.225707,4.149911,6.758233,4.438277,8.087863


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y,  test_size=0.3,random_state=1)
print(X_train.shape,X_test.shape,y_train.shape, y_test.shape)

(24732, 59) (10600, 59) (24732,) (10600,)


# Training MLP Classifier 


In [14]:
mlp = MLPClassifier(random_state=1)
mlp.fit(X_train, y_train)
y_predict_mlp = mlp.predict(X_test)

In [18]:
def binClassScore(predictions, actuals):
    results = pd.crosstab(predictions, actuals)
    tp = results[1][1]
    tn = results[0][0]
    fp = results[0][1]
    fn = results[1][0]
    n = results.sum().sum()

    acuracy = (tp + tn)/n
    recall = tp/(tp+fn)
    precision = tp/(tp+fp)
    f1 = 2*(precision*recall)/(precision-recall)
    return acuracy, recall, precision, f1

In [19]:
score = binClassScore(y_predict_mlp,y_test)
score

(0.9619811320754716,
 0.13793103448275862,
 0.5137614678899083,
 0.37710437710437705)

# Parameter Tuning
## 1.) Tuning hidden layer size, activation function, solver type, and learning rate

In [37]:
cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=1)

param_grid = {
    'hidden_layer_sizes':[100 , 50, (100,100,100), (50,50,50), (25,25,25)],
    'activation': ['logistic','tanh','relu'],
    'solver': ['lbfgs','adam'],
    'learning_rate':['invscaling','adaptive']}

gridSearch = GridSearchCV(MLPClassifier(), param_grid, cv=cv,
                          scoring=['recall','f1'],refit='recall',verbose=2)
gridSearch.fit(X_train, y_train)
print('Score: ', gridSearch.best_score_)
print('Parameters: ', gridSearch.best_params_)

Fitting 1 folds for each of 60 candidates, totalling 60 fits
[CV] END activation=logistic, hidden_layer_sizes=100, learning_rate=invscaling, solver=lbfgs; total time=   6.4s
[CV] END activation=logistic, hidden_layer_sizes=100, learning_rate=invscaling, solver=adam; total time=  11.6s
[CV] END activation=logistic, hidden_layer_sizes=100, learning_rate=adaptive, solver=lbfgs; total time=   6.6s
[CV] END activation=logistic, hidden_layer_sizes=100, learning_rate=adaptive, solver=adam; total time=  11.5s
[CV] END activation=logistic, hidden_layer_sizes=50, learning_rate=invscaling, solver=lbfgs; total time=   3.3s
[CV] END activation=logistic, hidden_layer_sizes=50, learning_rate=invscaling, solver=adam; total time=   8.5s
[CV] END activation=logistic, hidden_layer_sizes=50, learning_rate=adaptive, solver=lbfgs; total time=   3.3s
[CV] END activation=logistic, hidden_layer_sizes=50, learning_rate=adaptive, solver=adam; total time=   8.7s
[CV] END activation=logistic, hidden_layer_sizes=(1

## 2.) Tuning alpha, learning rate init's, max iterations, and shuffle

In [44]:
cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
param_grid = {
    'hidden_layer_sizes':[50],
    'activation': ['tanh'],
    'solver': ['lbfgs'],
    'alpha': [0,0.0005,0.0001,0.0005,0.001],
    'learning_rate':['adaptive'],
    'learning_rate_init': [0,0.0001,0.0005,0.001, 0.005,0.01],
    'max_iter': [200,500],
    'shuffle': [True,False] 
}

gridSearch = GridSearchCV(MLPClassifier(), param_grid, cv=cv,
                          scoring='recall',verbose=2)
gridSearch.fit(X_train, y_train)
print('Score: ', gridSearch.best_score_)
print('Parameters: ', gridSearch.best_params_)

Fitting 1 folds for each of 120 candidates, totalling 120 fits
[CV] END activation=tanh, alpha=0, hidden_layer_sizes=50, learning_rate=adaptive, learning_rate_init=0, max_iter=200, shuffle=True, solver=lbfgs; total time=   0.0s
[CV] END activation=tanh, alpha=0, hidden_layer_sizes=50, learning_rate=adaptive, learning_rate_init=0, max_iter=200, shuffle=False, solver=lbfgs; total time=   0.0s
[CV] END activation=tanh, alpha=0, hidden_layer_sizes=50, learning_rate=adaptive, learning_rate_init=0, max_iter=500, shuffle=True, solver=lbfgs; total time=   0.0s
[CV] END activation=tanh, alpha=0, hidden_layer_sizes=50, learning_rate=adaptive, learning_rate_init=0, max_iter=500, shuffle=False, solver=lbfgs; total time=   0.0s
[CV] END activation=tanh, alpha=0, hidden_layer_sizes=50, learning_rate=adaptive, learning_rate_init=0.0001, max_iter=200, shuffle=True, solver=lbfgs; total time=   6.5s
[CV] END activation=tanh, alpha=0, hidden_layer_sizes=50, learning_rate=adaptive, learning_rate_init=0.00

[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=50, learning_rate=adaptive, learning_rate_init=0.0001, max_iter=200, shuffle=True, solver=lbfgs; total time=   6.3s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=50, learning_rate=adaptive, learning_rate_init=0.0001, max_iter=200, shuffle=False, solver=lbfgs; total time=   6.3s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=50, learning_rate=adaptive, learning_rate_init=0.0001, max_iter=500, shuffle=True, solver=lbfgs; total time=  15.9s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=50, learning_rate=adaptive, learning_rate_init=0.0001, max_iter=500, shuffle=False, solver=lbfgs; total time=  16.0s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=50, learning_rate=adaptive, learning_rate_init=0.0005, max_iter=200, shuffle=True, solver=lbfgs; total time=   6.1s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=50, learning_rate=adaptive, learning_rate_init=0.0005, max_iter=

[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=50, learning_rate=adaptive, learning_rate_init=0.0001, max_iter=200, shuffle=True, solver=lbfgs; total time=   6.3s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=50, learning_rate=adaptive, learning_rate_init=0.0001, max_iter=200, shuffle=False, solver=lbfgs; total time=   6.4s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=50, learning_rate=adaptive, learning_rate_init=0.0001, max_iter=500, shuffle=True, solver=lbfgs; total time=  15.8s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=50, learning_rate=adaptive, learning_rate_init=0.0001, max_iter=500, shuffle=False, solver=lbfgs; total time=  16.4s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=50, learning_rate=adaptive, learning_rate_init=0.0005, max_iter=200, shuffle=True, solver=lbfgs; total time=   7.0s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=50, learning_rate=adaptive, learning_rate_init=0.0005, max_iter=200, s

# Model validation
## Checking if random state has disproportionate influence

In [45]:
X_train_s, X_tune, y_train_s, y_tune = train_test_split(X_train, y_train,
                                                        test_size=0.2)
for i in range(15):
    mlp = MLPClassifier(activation='tanh', alpha = 0.005, hidden_layer_sizes= [50],
                        learning_rate='adaptive',learning_rate_init = 0.005, max_iter = 500, shuffle = False,
                        solver='lbfgs',  random_state=i)
    
    mlp.fit(X_train_s, y_train_s)
    y_predict_mlp = mlp.predict(X_tune)

    result = binClassScore(y_predict_mlp,y_tune)
    print(f'{i}\t Accuracy: {result[0]}, Recall: {result[1]}, Precision: {result[2]}, F-Score: {result[3]}')  

0	 Accuracy: 0.9338993329290479, Recall: 0.16113744075829384, Precision: 0.18478260869565216, F-Score: 2.5185185185185195
1	 Accuracy: 0.9355164746310896, Recall: 0.14691943127962084, Precision: 0.18235294117647058, F-Score: 1.5121951219512193
2	 Accuracy: 0.9334950475035375, Recall: 0.12796208530805686, Precision: 0.1569767441860465, F-Score: 1.384615384615384
3	 Accuracy: 0.9401657570244593, Recall: 0.16587677725118483, Precision: 0.22580645161290322, F-Score: 1.2499999999999998
4	 Accuracy: 0.9387507580351728, Recall: 0.16113744075829384, Precision: 0.2125, F-Score: 1.3333333333333335
5	 Accuracy: 0.9345057610673135, Recall: 0.17535545023696683, Precision: 0.19786096256684493, F-Score: 3.0833333333333326
6	 Accuracy: 0.9359207600566, Recall: 0.16587677725118483, Precision: 0.19886363636363635, F-Score: 2.0
7	 Accuracy: 0.9361229027693552, Recall: 0.13744075829383887, Precision: 0.17791411042944785, F-Score: 1.2083333333333335
8	 Accuracy: 0.9375379017586416, Recall: 0.18957345971563

## Final model

In [46]:
mlp = MLPClassifier(hidden_layer_sizes= [50] ,activation='tanh',
                        learning_rate='adaptive', solver='lbfgs',  random_state=1)

mlp.fit(X_train_s, y_train_s)
y_predict_mlp = mlp.predict(X_test)
result = binClassScore(y_predict_mlp,y_test)
print(f'Accuracy: {result[0]}, Recall: {result[1]}, Precision: {result[2]}, F-Score: {result[3]}')

Accuracy: 0.9492452830188679, Recall: 0.1748768472906404, Precision: 0.2591240875912409, F-Score: 1.075757575757576
