In [184]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import scipy.stats as sps
import random

In [2]:
data = pd.read_csv('all_candidate_variabels.csv')
data = data.drop(columns = ['Unnamed: 0'])

In [3]:
vars_final = pd.read_csv('final_vars.csv')

In [7]:
train_test = data[data.date < '2016-11-01']
oot = data[data.date > '2016-11-01']

In [8]:
y_train_test = train_test['fraud_label']
y_oot = oot['fraud_label']

In [10]:
x = data[list(vars_final['variable'].values)]

In [11]:
x_train_test = train_test[list(vars_final['variable'].values)]
x_oot = oot[list(vars_final['variable'].values)]

In [14]:
scaler = StandardScaler().fit(x)
x_train_test = scaler.transform(x_train_test)
x_oot = scaler.transform(x_oot)

In [76]:
def FDR(df):
    topRows = int(round(len(df)*0.03))
    bads = df.loc[df['fraud_label'] == 1]
    numbads = len(bads)
    
    temp = df[['fraud_proba','fraud_label']].copy()
    temp0 = temp.sort_values('fraud_proba',ascending=False)
    temp1 = temp0.head(topRows)
    temp2 = temp0.tail(topRows)
    needed1 = temp1.loc[:,'fraud_label']
    needed2 = temp2.loc[:,'fraud_label']
    FDR1 = sum(needed1)/numbads
    FDR2 = sum(needed2)/numbads
    FDRate = np.maximum(FDR1,FDR2) 
    
    return FDRate

**Logistic Model**

In [140]:
result_log = []

for size in [0.3,0.4]:
    X_train, X_test, y_train, y_test = train_test_split(
        x_train_test, y_train_test, test_size = size, random_state = 0)
    for c in [0.001,0.01,0.1,1,10]:
        model = LogisticRegression(penalty = 'l2', solver='lbfgs', class_weight = 'balanced', C = c)
        clf = model.fit(X_train, y_train)

        y_train_hat = clf.predict_proba(X_train)
        train = pd.DataFrame(X_train, columns = list(vars_final['variable'].values))
        train['fraud_label'] = y_train.values
        train['fraud_proba'] = y_train_hat[:,1]
        train_fdr = FDR(train)

        y_test_hat = clf.predict_proba(X_test)
        test = pd.DataFrame(X_test, columns = list(vars_final['variable'].values))
        test['fraud_label'] = y_test.values
        test['fraud_proba'] = y_test_hat[:,1]
        test_fdr = FDR(test)

        y_oot_hat = clf.predict_proba(x_oot)
        oot = pd.DataFrame(x_oot, columns = list(vars_final['variable'].values))
        oot['fraud_label'] = y_oot.values
        oot['fraud_proba'] = y_oot_hat[:,1]
        oot_fdr = FDR(oot)
        
        dict_log = dict(test_size = size, regularization_para = c, TRAIN = train_fdr, TEST = test_fdr, OOT = oot_fdr)
        result_log.append(dict_log)

In [141]:
result_log = pd.DataFrame(result_log)

In [142]:
result_log

Unnamed: 0,test_size,regularization_para,TRAIN,TEST,OOT
0,0.3,0.001,0.563342,0.574392,0.554797
1,0.3,0.01,0.563818,0.576881,0.556077
2,0.3,0.1,0.564891,0.575498,0.556077
3,0.3,1.0,0.564176,0.574945,0.556077
4,0.3,10.0,0.564057,0.574945,0.556077
5,0.4,0.001,0.563575,0.571017,0.554797
6,0.4,0.01,0.566093,0.572046,0.55565
7,0.4,0.1,0.564554,0.571017,0.55565
8,0.4,1.0,0.564554,0.571429,0.555224
9,0.4,10.0,0.565254,0.571634,0.555224


In [122]:
#result_log.to_csv('result_log.csv')

**Neural Network**

In [127]:
result_nn = []

for size in [0.3,0.4]:
    for h in [(100,1),(100,2),(100,3)]:
        for m in [50,100,200]:
            X_train, X_test, y_train, y_test = train_test_split(
                x_train_test, y_train_test, test_size = size, random_state = 0)
            model = MLPClassifier(hidden_layer_sizes = h, max_iter = m, solver = 'adam')
            clf = model.fit(X_train, y_train)

            y_train_hat = clf.predict_proba(X_train)
            train = pd.DataFrame(X_train, columns = list(vars_final['variable'].values))
            train['fraud_label'] = y_train.values
            train['fraud_proba'] = y_train_hat[:,1]
            train_fdr = FDR(train)

            y_test_hat = clf.predict_proba(X_test)
            test = pd.DataFrame(X_test, columns = list(vars_final['variable'].values))
            test['fraud_label'] = y_test.values
            test['fraud_proba'] = y_test_hat[:,1]
            test_fdr = FDR(test)

            y_oot_hat = clf.predict_proba(x_oot)
            oot = pd.DataFrame(x_oot, columns = list(vars_final['variable'].values))
            oot['fraud_label'] = y_oot.values
            oot['fraud_proba'] = y_oot_hat[:,1]
            oot_fdr = FDR(oot)

            dict_nn = dict(test_size = size, hidden_layer = h, epoch = m,\
                            TRAIN = train_fdr, TEST = test_fdr, OOT = oot_fdr)
            result_nn.append(dict_nn)

In [128]:
result_nn = pd.DataFrame(result_nn)

In [129]:
result_nn

Unnamed: 0,test_size,hidden_layer,epoch,TRAIN,TEST,OOT
0,0.3,"(100, 1)",50,0.562746,0.572456,0.550533
1,0.3,"(100, 1)",100,0.565725,0.575221,0.553518
2,0.3,"(100, 1)",200,0.56644,0.575498,0.554797
3,0.3,"(100, 2)",50,0.566798,0.574668,0.554371
4,0.3,"(100, 2)",100,0.56799,0.576881,0.554797
5,0.3,"(100, 2)",200,0.567394,0.576327,0.55693
6,0.3,"(100, 3)",50,0.567274,0.576327,0.557356
7,0.3,"(100, 3)",100,0.567394,0.577434,0.555224
8,0.3,"(100, 3)",200,0.567394,0.575774,0.555224
9,0.4,"(100, 1)",50,0.566233,0.569988,0.554797


In [130]:
#result_nn.to_csv('result_nn.csv')

**Random Forest**

In [131]:
result_rf = []

for size in [0.3,0.4]:
    for n in [50,100,150]:
        for m in [5,7,9]:
            X_train, X_test, y_train, y_test = train_test_split(
                x_train_test, y_train_test, test_size = size, random_state = 0)
            model = RandomForestClassifier(n_estimators = n, max_depth = m)
            clf = model.fit(X_train, y_train)

            y_train_hat = clf.predict_proba(X_train)
            train = pd.DataFrame()
            train['fraud_label'] = y_train.values
            train['fraud_proba'] = y_train_hat[:,1]
            train_fdr = FDR(train)

            y_test_hat = clf.predict_proba(X_test)
            test = pd.DataFrame()
            test['fraud_label'] = y_test.values
            test['fraud_proba'] = y_test_hat[:,1]
            test_fdr = FDR(test)

            y_oot_hat = clf.predict_proba(x_oot)
            oot = pd.DataFrame()
            oot['fraud_label'] = y_oot.values
            oot['fraud_proba'] = y_oot_hat[:,1]
            oot_fdr = FDR(oot)

            dict_rf = dict(test_size = size, num_of_tree = n, max_depth = m,\
                            TRAIN = train_fdr, TEST = test_fdr, OOT = oot_fdr)
            result_rf.append(dict_rf)

In [132]:
result_rf = pd.DataFrame(result_rf)

In [133]:
result_rf

Unnamed: 0,test_size,num_of_tree,max_depth,TRAIN,TEST,OOT
0,0.3,50,5,0.553093,0.565265,0.550959
1,0.3,50,7,0.559171,0.571903,0.551812
2,0.3,50,9,0.559528,0.571903,0.552665
3,0.3,100,5,0.557979,0.57052,0.548401
4,0.3,100,7,0.558932,0.57135,0.550107
5,0.3,100,9,0.559766,0.571626,0.551812
6,0.3,150,5,0.558217,0.570796,0.550107
7,0.3,150,7,0.558217,0.57135,0.550107
8,0.3,150,9,0.560124,0.572456,0.551812
9,0.4,50,5,0.554203,0.56196,0.550107


In [190]:
#result_rf.to_csv('result_rf.csv')

**Gradient Boosting**

In [137]:
result_gb = []

for size in [0.3,0.4]:
    for n in [500,700,900]:
        for m in [1,2]:
            for l in [0.01,0.1]:
                X_train, X_test, y_train, y_test = train_test_split(
                    x_train_test, y_train_test, test_size = size, random_state = 0)
                model = GradientBoostingClassifier(n_estimators = n, max_depth = m, learning_rate = l)
                clf = model.fit(X_train, y_train)

                y_train_hat = clf.predict_proba(X_train)
                train = pd.DataFrame(X_train, columns = list(vars_final['variable'].values))
                train['fraud_label'] = y_train.values
                train['fraud_proba'] = y_train_hat[:,1]
                train_fdr = FDR(train)

                y_test_hat = clf.predict_proba(X_test)
                test = pd.DataFrame(X_test, columns = list(vars_final['variable'].values))
                test['fraud_label'] = y_test.values
                test['fraud_proba'] = y_test_hat[:,1]
                test_fdr = FDR(test)

                y_oot_hat = clf.predict_proba(x_oot)
                oot = pd.DataFrame(x_oot, columns = list(vars_final['variable'].values))
                oot['fraud_label'] = y_oot.values
                oot['fraud_proba'] = y_oot_hat[:,1]
                oot_fdr = FDR(oot)

                dict_gb = dict(test_size = size, num_of_tree = n, max_depth = m, learning_rate = l,\
                                TRAIN = train_fdr, TEST = test_fdr, OOT = oot_fdr)
                result_gb.append(dict_gb)

In [138]:
result_gb = pd.DataFrame(result_gb)

In [139]:
result_gb

Unnamed: 0,test_size,num_of_tree,max_depth,learning_rate,TRAIN,TEST,OOT
0,0.3,500,1,0.01,0.559409,0.57135,0.544563
1,0.3,500,1,0.1,0.561197,0.57135,0.548827
2,0.3,500,2,0.01,0.558217,0.568584,0.551386
3,0.3,500,2,0.1,0.564653,0.576051,0.551386
4,0.3,700,1,0.01,0.558694,0.572456,0.548401
5,0.3,700,1,0.1,0.560958,0.572179,0.548827
6,0.3,700,2,0.01,0.561197,0.572179,0.551386
7,0.3,700,2,0.1,0.564772,0.575221,0.553945
8,0.3,900,1,0.01,0.558455,0.571903,0.551386
9,0.3,900,1,0.1,0.561077,0.573285,0.550959


In [167]:
#result_gb.to_csv('result_gb.csv')

**AdaBoost**

In [185]:
result_ada = []

for size in [0.3,0.4]:
    for n in [500,700,900]:
        for l in [0.01,0.1,1]:
            X_train, X_test, y_train, y_test = train_test_split(
                x_train_test, y_train_test, test_size = size, random_state = 0)
            model = AdaBoostClassifier(n_estimators = n, learning_rate = l)
            clf = model.fit(X_train, y_train)

            y_train_hat = clf.predict_proba(X_train)
            train = pd.DataFrame()
            train['fraud_label'] = y_train.values
            train['fraud_proba'] = y_train_hat[:,1]
            train_fdr = FDR(train)

            y_test_hat = clf.predict_proba(X_test)
            test = pd.DataFrame()
            test['fraud_label'] = y_test.values
            test['fraud_proba'] = y_test_hat[:,1]
            test_fdr = FDR(test)

            y_oot_hat = clf.predict_proba(x_oot)
            oot = pd.DataFrame()
            oot['fraud_label'] = y_oot.values
            oot['fraud_proba'] = y_oot_hat[:,1]
            oot_fdr = FDR(oot)

            dict_ada = dict(test_size = size, num_of_tree = n, learning_rate = l,\
                            TRAIN = train_fdr, TEST = test_fdr, OOT = oot_fdr)
            result_ada.append(dict_ada)

In [186]:
result_ada = pd.DataFrame(result_ada)

In [187]:
result_ada

Unnamed: 0,test_size,num_of_tree,learning_rate,TRAIN,TEST,OOT
0,0.3,500,0.01,0.560481,0.573285,0.547548
1,0.3,500,0.1,0.563818,0.576327,0.551386
2,0.3,500,1.0,0.563699,0.576327,0.550107
3,0.3,700,0.01,0.56215,0.573285,0.550959
4,0.3,700,0.1,0.563818,0.576881,0.550533
5,0.3,700,1.0,0.564891,0.575221,0.54968
6,0.3,900,0.01,0.562984,0.574392,0.550959
7,0.3,900,0.1,0.563699,0.576604,0.550533
8,0.3,900,1.0,0.564176,0.576881,0.54968
9,0.4,500,0.01,0.559938,0.567929,0.547122


In [189]:
#result_ada.to_csv('result_ada.csv')