In [17]:
# load base packages
import pandas as pd
import pyodbc
import re
import numpy as np
# Set random seed
np.random.seed(0)

# load utils
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import class_weight

# Load Metrics
from sklearn.metrics import balanced_accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, accuracy_score

# Load model libraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
# import BatchNormalization
from tensorflow.keras.layers import BatchNormalization

#import optimizer
from tensorflow.keras import optimizers

### Import Pickled Data

In [4]:
dfimpalaProd = pd.read_pickle(r'data/deposit_data_consensed.pkl')
dfimpalaProd.dropna(axis = 0, how ='any', inplace=True)
dfimpalaProd.head()

Unnamed: 0,party_key,person_age,male,female,account_age_days,closed_within_next_month,account_key,trans_count_total_1,trans_count_total_dif,trans_count_deposits_1,...,transaction_amt_transfers_2,transaction_amt_fees_2,transaction_amt_misc_2,trans_post_ledger_bal_amt_2,trans_post_ledger_bal_amt_deposits_2,trans_post_ledger_bal_amt_purchases_payments_2,trans_post_ledger_bal_amt_withdrawals_2,trans_post_ledger_bal_amt_transfers_2,trans_post_ledger_bal_amt_fees_2,trans_post_ledger_bal_amt_misc_2
0,46531765,49.106849,1,0,90,1,45269591,128,-8,5,...,0.0,0.056818,0.83072,198.093521,14.256288,157.188217,14.905378,0.0,5.370076,3.113864
1,47165428,60.09589,1,0,120,1,45047095,1,-4,0,...,0.0,0.0,0.0,1480.333374,0.0,0.0,1174.333374,0.0,0.0,0.0
2,37451305,32.739726,0,0,1084,1,37156147,5,0,3,...,0.0,0.0,0.0,52.5,31.0,0.0,21.5,0.0,0.0,0.0
3,46542092,69.550685,1,0,195,1,44386125,1,-1,0,...,0.0,0.0,0.043333,5201.09668,0.0,2600.526611,0.0,0.0,0.0,2600.570068
4,39019005,89.726027,1,0,4221,1,35940050,11,-2,1,...,0.0,0.4125,0.014167,24166.349609,1879.345459,9264.235352,7461.556152,0.0,3707.39624,1853.81543


### Check Percent of Closed Accounts

In [5]:
dfimpalaProd['closed_within_next_month'].mean()

0.005355429158039286

In [6]:
y = dfimpalaProd['closed_within_next_month']
X = dfimpalaProd.drop(['closed_within_next_month','party_key','account_key'], axis=1)
ids = dfimpalaProd[['closed_within_next_month','party_key','account_key']]

scaler = preprocessing.StandardScaler()

scaler = scaler.fit(X)

X = scaler.transform(X)
#consider removing difference features
#consider log transform of all numerics
#Visualize features and consider removing variables with equal distribution by label

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.3, random_state=42)

  return self.partial_fit(X, y)
  if __name__ == '__main__':


In [7]:
from collections import Counter
# set class weights
def get_class_weights(y):
    counter = Counter(y)
    majority = max(counter.values())
    return  {cls: float(majority/count) for cls, count in counter.items()}

class_weight_dict = get_class_weights(y_train)
class_weight_dict

{0: 1.0, 1: 185.73242483235995}

In [19]:
# Set the number of features we want
number_of_features = X_train.shape[1]


model = Sequential()
# act = keras.layers.PReLU(alpha_initializer='zeros')
model.add(Dense(64, input_dim=number_of_features, activation='relu'))
#model.add(Dropout(0.025))
model.add(BatchNormalization())
model.add(Dense(64, activation='relu'))
#model.add(Dropout(0.025))
model.add(BatchNormalization())
model.add(Dense(64, activation='relu'))
#model.add(Dropout(0.025))
model.add(BatchNormalization())
model.add(Dense(64, activation='relu'))
#model.add(Dropout(0.025))
model.add(BatchNormalization())
model.add(Dense(64, activation='relu'))
#model.add(Dropout(0.025))
model.add(BatchNormalization())
model.add(Dense(1, activation='sigmoid'))
#trim network depth & batch normalization (layer normalization) standardizing inputs and outputs between learnable layers & size of hidden layers & remove dropout

# setting up the optimization of our weights 
Nadam = optimizers.Nadam(lr=0.002, beta_1=0.9, beta_2=0.999, epsilon=None, schedule_decay=0.004)
model.compile(loss='binary_crossentropy',
              optimizer=Nadam, #create an optimizer tf.keras.Optimizers
              metrics=['accuracy'])

# history = model.fit(X_train, y_train,
#           epochs=1,
#           batch_size=128, 
#            class_weight=class_weight_dict
        
#           # learning rate decay
#          )



# running the fitting
model.fit(X_train, y_train, epochs=6, validation_split=0.2, verbose = 1,
         batch_size=128, 
           class_weight=class_weight_dict)

#create validation set to tune/ don't use test until end
# score = model.evaluate(X_test, y_test, batch_size=128)

Train on 1381222 samples, validate on 345306 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x172d150ef28>

In [20]:
y_pred = model.predict_classes(X_test)

In [21]:
print('Precision Score')
print(precision_score(y_test, y_pred))

print('_'*100)
print('Recall Score')
print(recall_score(y_test, y_pred))

print('_'*100)
print('Balanced Accuracy Score')
print(balanced_accuracy_score(y_test, y_pred))

print('_'*100)
print('Confusion Matrix')
print(pd.DataFrame(confusion_matrix(y_test, y_pred), index=['true:no', 'true:yes'], columns=['pred:no', 'pred:yes']))

Precision Score
0.04347307531755024
____________________________________________________________________________________________________
Recall Score
0.5518546555639667
____________________________________________________________________________________________________
Balanced Accuracy Score
0.7432361332082325
____________________________________________________________________________________________________
Confusion Matrix
          pred:no  pred:yes
true:no    687858     48120
true:yes     1776      2187
