In [144]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
import warnings
warnings.filterwarnings('ignore')

In [145]:
# Load the input table from 'final_selected_variables.csv' into `df`.
df = pd.read_csv('final_selected_variables.csv')

In [146]:
# Recnum boundaries of the two populations. Rows with Recnum below
# MODEL_FIRST_RECNUM are excluded from both; rows from OOT_FIRST_RECNUM onward
# are held out as the out-of-time (oot) set.
MODEL_FIRST_RECNUM = 3338
MODEL_LAST_RECNUM = 84299
OOT_FIRST_RECNUM = 84300
# Columns that are not model inputs: the target and the leftover CSV index column.
NON_FEATURE_COLS = ['Fraud', 'Unnamed: 0']
# Fixed seed so the train/test split (and the CSVs written from it) is
# identical on every Restart & Run All.
SPLIT_SEED = 42

df1 = df[(df.Recnum >= MODEL_FIRST_RECNUM) & (df.Recnum <= MODEL_LAST_RECNUM)].set_index('Recnum')
df2 = df[df.Recnum >= OOT_FIRST_RECNUM].set_index('Recnum')

# Modelling population: features / target.
X = df1.drop(NON_FEATURE_COLS, axis=1)
Y = df1['Fraud']

# Out-of-time population: features / target.
x_oot = df2.drop(NON_FEATURE_COLS, axis=1)
y_oot = df2['Fraud']

# 70/30 random split of the modelling population.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=SPLIT_SEED
)

# Sanity check: the split neither loses nor duplicates rows.
assert len(x_train) + len(x_test) == len(X)

In [147]:
# Persist every split to CSV, keeping the index and the header row so the
# files can be read back and re-indexed later.
for _frame, _path in (
    (x_train, 'train_x.csv'),
    (x_test, 'test_x.csv'),
    (x_oot, 'oot_x.csv'),
    (y_train, 'train_y.csv'),
    (y_test, 'test_y.csv'),
    (y_oot, 'oot_y.csv'),
):
    _frame.to_csv(_path, index=True, header=True)

### Logistic Regression

In [148]:
# Baseline classifier: liblinear solver, allowing up to 1000 iterations.
lr = LogisticRegression(max_iter=1000, solver='liblinear')

In [149]:
# Fit on the training split; the fitted estimator is the cell's displayed value.
lr.fit(X=x_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [150]:
# Column 1 of predict_proba is the probability of the second class in
# lr.classes_ (Fraud == 1 when the labels are 0/1).
train_proba = lr.predict_proba(x_train)
y_train_pred = train_proba[:, 1]

In [151]:
# Display the training-set scores (NumPy abbreviates long arrays in the repr).
y_train_pred

array([0.00279667, 0.00195871, 0.00354005, ..., 0.0051212 , 0.00068419,
       0.00266347])

In [152]:
def FDR(x, y, model, fraction=0.03, label_col='Fraud'):
    """Return the fraud detection rate (FDR) within the top-scored records.

    Records are ranked by ``model.predict_proba(x)[:, 1]`` in descending
    order; the result is the share of all records labelled 1 that fall
    inside the top ``fraction`` of that ranking.

    Parameters
    ----------
    x : feature table accepted by ``model.predict_proba``.
    y : labels for the rows of ``x``, in the same row order. May be a Series,
        a DataFrame containing ``label_col`` (or exactly one column), or any
        1-D array-like. It is never modified.
    model : fitted estimator exposing ``predict_proba``.
    fraction : float in (0, 1], default 0.03
        Share of records to keep from the top of the ranking. The row count
        is ``int(len(y) * fraction)`` (truncated).
    label_col : str, default 'Fraud'
        Column holding the labels when ``y`` is a DataFrame.

    Returns
    -------
    float
        Labelled-1 records in the top slice divided by all labelled-1
        records; NaN when ``y`` contains no record labelled 1.

    Raises
    ------
    ValueError
        If ``fraction`` is outside (0, 1] or ``y`` and the scores differ in length.
    KeyError
        If ``y`` is a multi-column DataFrame without ``label_col``.
    """
    if not 0 < fraction <= 1:
        raise ValueError('fraction must be in (0, 1], got %r' % (fraction,))

    scores = np.asarray(model.predict_proba(x))[:, 1]

    # Pull the labels out as a plain 1-D array so the caller's object is never
    # touched (adding a column to pd.DataFrame(y) can leak into y itself).
    if isinstance(y, pd.DataFrame):
        if label_col in y.columns:
            labels = y[label_col]
        elif y.shape[1] == 1:
            labels = y.iloc[:, 0]
        else:
            raise KeyError(label_col)
    else:
        labels = y
    labels = np.asarray(labels).ravel()

    if len(labels) != len(scores):
        raise ValueError(
            'y has %d rows but the model produced %d scores' % (len(labels), len(scores))
        )

    is_bad = labels == 1
    numbads = int(is_bad.sum())
    if numbads == 0:
        # No fraud to detect: the rate is undefined.
        return float('nan')

    top = int(len(labels) * fraction)
    ranked = pd.DataFrame({'bad': is_bad, 'pred': scores}).sort_values(
        by='pred', ascending=False
    )
    return ranked['bad'].head(top).sum() / numbads

In [153]:
# FDR of the baseline model on its own training split.
FDR(x=x_train, y=y_train, model=lr)

0.6671974522292994

In [155]:
%%time
lr_FDR = []

solver=['newton-cg','lbfgs','liblinear','sag','saga',]

for s in solver:
    x_train = pd.read_csv('train_x.csv')
    x_train = x_train.set_index('Recnum')
    y_train = pd.read_csv('train_y.csv')
    y_train = y_train.set_index('Recnum')
    x_test = pd.read_csv('test_x.csv')
    x_test = x_test.set_index('Recnum')
    y_test = pd.read_csv('test_y.csv')
    y_test = y_test.set_index('Recnum')
    x_oot = pd.read_csv('oot_x.csv')
    x_oot = x_oot.set_index('Recnum')
    y_oot = pd.read_csv('oot_y.csv')
    y_oot = y_oot.set_index('Recnum')

    lr = LogisticRegression(solver = s)
    lr.fit(x_train, y_train)
    train_pred = lr.predict_proba(x_train)[:,1]
    test_pred = lr.predict_proba(x_test)[:,1]
    oot_pred = lr.predict_proba(x_oot)[:,1]
            
    # FDR train
    train_data = x_train
    train_data['Fraud'] = y_train
    train_data['pred'] = train_pred
    topRows = int(round(len(train_data)*0.03)) 
    temp = train_data.sort_values('pred', ascending = False).head(topRows)
    needed = temp.loc[:,'Fraud']
    FDR_train = sum(needed)/sum(train_data['Fraud'])

    # FDR test
    test_data = x_test
    test_data['Fraud'] = y_test
    test_data['pred'] = test_pred
    topRows = int(round(len(test_data)*0.03)) 
    temp = test_data.sort_values('pred', ascending = False).head(topRows)
    needed = temp.loc[:,'Fraud']
    FDR_test = sum(needed)/sum(test_data['Fraud'])
            
    # FDR oot
    oot_data = x_oot
    oot_data['Fraud'] = y_oot
    oot_data['pred'] = oot_pred
    topRows = int(round(len(oot_data)*0.03)) 
    temp = oot_data.sort_values('pred', ascending = False).head(topRows)
    needed = temp.loc[:,'Fraud']
    FDR_oot = sum(needed)/sum(oot_data['Fraud'])

    lr_FDR.append([s, FDR_train, FDR_test, FDR_oot])

CPU times: user 9.56 s, sys: 493 ms, total: 10.1 s
Wall time: 7.56 s


In [156]:
# Turn the per-solver result rows into a labelled table and display it.
lr_FDR = pd.DataFrame(
    data=lr_FDR,
    columns=['solver', 'train', 'test', 'oot'],
)
lr_FDR

Unnamed: 0,solver,train,test,oot
0,newton-cg,0.667197,0.6875,0.346369
1,lbfgs,0.667197,0.6875,0.351955
2,liblinear,0.667197,0.6875,0.346369
3,sag,0.665605,0.6875,0.351955
4,saga,0.664013,0.6875,0.351955


In [142]:
# Export the solver comparison table, keeping the row index and column headers.
lr_FDR.to_excel(
    'lr_FDR2.xlsx',
    header=True,
    index=True,
)