In [153]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
import time
%matplotlib inline

In [17]:
df = pd.read_csv('data_norm_final.csv')

In [18]:
df = df.drop('Unnamed: 0',axis=1)

In [19]:
cols = [i for i in df.columns if i != 'fraud_label']
cols.append('fraud_label')
df = df[cols]
df.head()

Unnamed: 0,record,date,address-zip5-homephone//days_since,address-zip5-homephone//prev_d14_count,address-zip5//days_since,address-zip5//prev_d14_count,address-zip5//prev_d1_count,address-zip5//prev_d1_d30_avg,address-zip5//prev_d30_count,address-zip5//prev_d3_count,...,name-dob//prev_d14_count,ssn-dob//days_since,ssn-firstname//days_since,ssn-lastname//days_since,ssn-lastname//prev_d30_count,ssn-name-dob//prev_d30_count,ssn-name//days_since,ssn-name//prev_d30_count,ssn//days_since,fraud_label
0,1,2016-01-01,-1.566892,-0.070089,-1.54593,-0.079045,-0.049665,0.174181,-0.101958,-0.057932,...,-0.0686,-1.573035,-1.558702,-1.558716,-0.097983,-0.092594,-1.559231,-0.097773,-1.556639,0
1,2,2016-01-01,-1.566892,-0.070089,-1.54593,-0.079045,-0.049665,0.174181,-0.101958,-0.057932,...,-0.0686,-1.573035,-1.558702,-1.558716,-0.097983,-0.092594,-1.559231,-0.097773,-1.556639,1
2,3,2016-01-01,-1.566892,-0.070089,-1.54593,-0.079045,-0.049665,0.174181,-0.101958,-0.057932,...,-0.0686,-1.573035,-1.558702,-1.558716,-0.097983,-0.092594,-1.559231,-0.097773,-1.556639,0
3,4,2016-01-01,-1.566892,-0.070089,-1.54593,-0.079045,-0.049665,0.174181,-0.101958,-0.057932,...,-0.0686,-1.573035,-1.558702,-1.558716,-0.097983,-0.092594,-1.559231,-0.097773,-1.556639,0
4,5,2016-01-01,-1.566892,-0.070089,-1.54593,-0.079045,-0.049665,0.174181,-0.101958,-0.057932,...,-0.0686,-1.573035,-1.558702,-1.558716,-0.097983,-0.092594,-1.559231,-0.097773,-1.556639,0


Before running code below, make sure the columns in your dataframe is similar to the one above, where:
 - Column 0 = record
 - Column 1 = date
 - Last Column = fraud_label

# Train/Test/OOT Split

In [142]:
tt = df[(df.date > '2016-01-14') & (df.date < '2016-11-01')].copy()
oot = df[(df.date >= '2016-11-01')].copy()

In [143]:
X, y = tt.iloc[:,2:-1].values, tt.iloc[:,-1].values

In [144]:
# OOT Validation Data
X_oot = oot.iloc[:,2:-1]
y_oot = oot.iloc[:,-1]

In [145]:
# Create a bunch of different train/test splits
def gen_data(n = 5):
    # Create n sets of train_test splits
    dataTest = dict()
    for i in range(n):
        # i is the random seed used in each train/test split below
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify = y, random_state = i)
        dataTest[i] = dict()
        dataTest[i]['X_train'] = X_train
        dataTest[i]['X_test'] = X_test
        dataTest[i]['y_train'] = y_train
        dataTest[i]['y_test'] = y_test
        # Upsample minority class
        df_resamp = pd.DataFrame(X_train, columns = [i for i in tt.columns if i not in ['fraud_label', 'date', 'record']])
        df_resamp['fraud_label'] = y_train
        # Separate majority/minority
        df_majority = df_resamp[df_resamp.fraud_label == 0]
        df_minority = df_resamp[df_resamp.fraud_label == 1]
        # Upsample minority
        df_minority_upsampled = resample(df_minority, replace = True, 
                                         n_samples = int(df_majority.fraud_label.count()), # Upsample so minority = majority
                                         random_state = i)
        # Combine
        df_upsampled = pd.concat([df_majority, df_minority_upsampled])
        df_upsampled.fraud_label.value_counts()
        dataTest[i]['X_train_upsampled'] = df_upsampled.iloc[:,:-1]
        dataTest[i]['y_train_upsampled'] = df_upsampled.iloc[:,-1]
    return dataTest

In [146]:
#defining evaluation metric - FDR 3%
def scoring(model, X_train, y_train, X_test, y_test, ootX, ootY, top_percent = 3):
    train_pred = model.predict_proba(X_train)[:, 1]
    train_actual_predict = pd.DataFrame({'pred': train_pred, 'actual': y_train})
    fdr_train = (train_actual_predict.sort_values('pred', ascending=False) > 0.5).head(int(round(len(train_actual_predict) * 0.01 * top_percent)))['actual'].sum() / y_train.sum()
    
    test_pred = model.predict_proba(X_test)[:, 1]
    test_actual_predict = pd.DataFrame({'pred': test_pred, 'actual': y_test})
    fdr_test = (test_actual_predict.sort_values('pred', ascending=False) > 0.5).head(int(round(len(test_actual_predict) * 0.01 * top_percent)))['actual'].sum() / y_test.sum()
    
    oot_pred = model.predict_proba(X_oot)[:, 1]
    oot_actual_predict = pd.DataFrame({'pred': oot_pred, 'actual': y_oot}) 
    fdr_oot = (oot_actual_predict.sort_values('pred', ascending=False) > 0.5).head(int(round(len(oot_actual_predict) * 0.01 * top_percent)))['actual'].sum() / y_oot.sum()
    
    scoring_df = {'Name': type(model).__name__, 'Training FDR': fdr_train, 'Test FDR': fdr_test, 'OOT_FDR': fdr_oot}
    
    return scoring_df

# Define Models

In [171]:
# Logistic
mlog = LogisticRegression()

# Random Forest - Coggeshall Params
mrf1 = RandomForestClassifier(n_estimators=50, max_depth = 20, max_features = 10, verbose = True)

# Random Forest - Andrew Params
mrf2 = RandomForestClassifier(bootstrap = False, ccp_alpha = 0.0, class_weight = None, criterion = 'gini', 
                             max_depth = 5, max_features = 5, max_leaf_nodes = None, max_samples = None,
                             min_impurity_decrease = 0.0, min_impurity_split = None, min_samples_leaf = 2,
                             min_samples_split = 3, min_weight_fraction_leaf = 0.0, n_estimators = 100,
                             n_jobs = None, oob_score = False, random_state = None, verbose = 0, 
                             warm_start = False)

# Neural Net - Coggeshall Params
mnn1 = MLPClassifier(solver ='lbfgs', alpha=1e-5, hidden_layer_sizes=(6,), random_state=1, verbose = True)

# Neural Net - Andrew Params
mnn2 = MLPClassifier(solver = 'adam', learning_rate = 'adaptive', 
                          hidden_layer_sizes = (15,20), alpha = 0.0001, activation = 'tanh')

# Decision Tree
mt = DecisionTreeClassifier(min_samples_split = 300, min_samples_leaf = 60,random_state = 1)

# Boosted Trees
mada1 = AdaBoostClassifier(learning_rate = 0.001, random_state = 1)
mada2 = AdaBoostClassifier(learning_rate = 0.002, random_state = 1)

# Test Models

In [167]:
def test_model(model, data):
    main_start = time.time()
    # Change n to the number of different test/train data sets you want to create
    avg_train, avg_test, avg_oot = 0, 0, 0
    n = len(data)
    for i,v in data.items():
        print(f'Running Iteration: {i}')
        inter_start = time.time()
        model.fit(v['X_train_upsampled'], v['y_train_upsampled'])
        scores = scoring(model = model, top_percent = 3, 
                      X_train = v['X_train'], y_train = v['y_train'],
                      X_test = v['X_test'], y_test = v['y_test'],
                      ootX = X_oot, ootY = y_oot)
        avg_train += scores['Training FDR']
        avg_test += scores['Test FDR']
        avg_oot += scores['OOT_FDR']
        inter_end = time.time()
        print(f' -Done: {round(inter_end-inter_start, 5)}s')
    main_end = time.time()
    print()
    print('Average FDRs:')
    print(f' -Train: {round(avg_train/n,4)}')
    print(f' -Test: {round(avg_test/n,4)}')
    print(f' -OOT: {round(avg_oot/n,4)}')
    print(f'Total Runtime: {round(main_end-main_start, 5)}s')

In [175]:
# Gen 3 Diff Train/Test splits
testdata = gen_data(n=3)

In [170]:
# Run models

In [177]:
test_model(mlog, testdata)

Running Iteration: 0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


 -Done: 10.91377s
Running Iteration: 1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


 -Done: 10.73528s
Running Iteration: 2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


 -Done: 10.52385s

Average FDRs:
 -Train: 0.5302
 -Test: 0.5183
 -OOT: 0.5095
Total Runtime: 32.17689s


In [None]:
test_model(mrf1, testdata)

Running Iteration: 0


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
test_model(mrf2, testdata)

In [None]:
test_model(mnn1, testdata)

In [None]:
test_model(mnn2, testdata)

In [None]:
test_model(mt, testdata)

In [None]:
test_model(mada1, testdata)

In [None]:
test_model(mada2, testdata)