In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import calendar
from timer import timer
import datetime as dt
import scipy.stats as sps
import random

In [40]:
data = pd.read_csv('all_candidate_variabels.csv')

In [41]:
data = data.drop(columns = ['Unnamed: 0'])

**Select by Filters**

In [42]:
random_num = []
for i in range(data.shape[0]):
    random_num.append(random.uniform(0,1))

In [43]:
data['RANDOM'] = random_num

In [44]:
#train_test excludes last two months and first two weeks
df = data[(data.date > '2016-01-14') & (data.date < '2016-11-01')]

In [45]:
df = df.iloc[:,np.r_[9,11,13,33:384]]

In [46]:
goods = df.loc[df['fraud_label'] == 0]
bads = df.loc[df['fraud_label'] == 1]

In [47]:
KSFDR = pd.DataFrame({'Variable number': [i for i in range(354)], 'Variable': df.columns})

In [48]:
KSFDR['KS'] = np.nan

**Calculate Univariate KS**

In [49]:
i = 0
for column in df:
    KSFDR['KS'][i] = sps.ks_2samp(goods[column],bads[column])[0]
    i = i+1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


**Create Univariate FDR@3%**

In [50]:
topRows = int(round(len(df)*0.03))
numbads = len(bads)
listvars = list(df.columns)
listvars.remove('fraud_label')

In [51]:
j = 1
for column in listvars:
    temp = df[[column,'fraud_label']].copy()
    temp0 = temp.sort_values(column,ascending=False)
    temp1 = temp0.head(topRows)
    temp2 = temp0.tail(topRows)
    needed1 = temp1.loc[:,'fraud_label']
    needed2 = temp2.loc[:,'fraud_label']
    FDR1 = sum(needed1)/numbads
    FDR2 = sum(needed2)/numbads
    FDRate = np.maximum(FDR1,FDR2) 
    KSFDR.loc[j, 'FDR at 3%'] = FDRate
    
    j = j + 1

In [52]:
KSFDR = KSFDR.fillna(1)

In [53]:
sort_ks = KSFDR.sort_values(by = 'KS', ascending = False)

In [54]:
sort_ks.head(10)

Unnamed: 0,Variable number,Variable,KS,FDR at 3%
0,0,fraud_label,1.0,1.0
10,10,address_day_since,0.334096,0.355302
38,38,fulladdress_day_since,0.33321,0.358349
16,16,address_count_30,0.332725,0.3533
44,44,fulladdress_count_30,0.332032,0.354954
15,15,address_count_14,0.322252,0.345812
43,43,fulladdress_count_14,0.321756,0.34233
14,14,address_count_7,0.301445,0.320999
42,42,fulladdress_count_7,0.301368,0.319955
341,341,num_of_ssn_for_each_fulladdress,0.30019,0.320564


In [55]:
sort_fdr = KSFDR.sort_values(by = 'FDR at 3%', ascending = False)

In [56]:
sort_fdr.head(10)

Unnamed: 0,Variable number,Variable,KS,FDR at 3%
0,0,fraud_label,1.0,1.0
38,38,fulladdress_day_since,0.33321,0.358349
10,10,address_day_since,0.334096,0.355302
44,44,fulladdress_count_30,0.332032,0.354954
16,16,address_count_30,0.332725,0.3533
15,15,address_count_14,0.322252,0.345812
43,43,fulladdress_count_14,0.321756,0.34233
14,14,address_count_7,0.301445,0.320999
341,341,num_of_ssn_for_each_fulladdress,0.30019,0.320564
42,42,fulladdress_count_7,0.301368,0.319955


In [57]:
#sort_ks.to_csv('vars_ksfdr.csv')

In [58]:
#sort_fdr.to_csv('fdr.csv')

In [59]:
KSFDR['rank_ks'] = KSFDR['KS'].rank(ascending = True)
KSFDR['rank_FDR'] = KSFDR['FDR at 3%'].rank(ascending = True)
KSFDR['average_rank'] = (KSFDR['rank_ks'] + KSFDR['rank_FDR']) / 2
KSFDR.sort_values(by=['average_rank'], ascending=False, inplace=True)

In [60]:
vars_filter = KSFDR[:80]

In [61]:
vars_filter

Unnamed: 0,Variable number,Variable,KS,FDR at 3%,rank_ks,rank_FDR,average_rank
0,0,fraud_label,1.000000,1.000000,354.0,354.0,354.00
38,38,fulladdress_day_since,0.333210,0.358349,352.0,353.0,352.50
10,10,address_day_since,0.334096,0.355302,353.0,352.0,352.50
44,44,fulladdress_count_30,0.332032,0.354954,350.0,351.0,350.50
16,16,address_count_30,0.332725,0.353300,351.0,350.0,350.50
...,...,...,...,...,...,...,...
263,263,ssn_firstname_count_0_by_14,0.193164,0.218353,277.0,278.5,277.75
271,271,ssn_lastname_count_0_by_14,0.192824,0.218614,273.0,282.5,277.75
98,98,ssn_firstname_count_7,0.192673,0.218440,272.0,280.0,276.00
327,327,ssn_name_dob_count_0_by_14,0.192902,0.218179,274.0,277.0,275.50


**Select by Wrapper**

In [65]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

In [69]:
Y = df['fraud_label']
data = df[list(vars_filter['Variable'].values)].drop(columns = ['fraud_label'])

In [72]:
model = LogisticRegression(penalty='l2', class_weight='balanced')
rfecv = RFECV(estimator=model, step=1, cv=2, verbose=2, n_jobs=1, scoring="roc_auc")
rfecv.fit(data, Y)

Fitting estimator with 79 features.
Fitting estimator with 78 features.
Fitting estimator with 77 features.
Fitting estimator with 76 features.
Fitting estimator with 75 features.
Fitting estimator with 74 features.
Fitting estimator with 73 features.
Fitting estimator with 72 features.
Fitting estimator with 71 features.
Fitting estimator with 70 features.
Fitting estimator with 69 features.
Fitting estimator with 68 features.
Fitting estimator with 67 features.
Fitting estimator with 66 features.
Fitting estimator with 65 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.
Fitting estimator with 55 features.
Fitting estimator with 54 features.
Fitting estimator with 53 features.
Fitting estimator with 52 fe

RFECV(cv=2,
      estimator=LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                                   fit_intercept=True, intercept_scaling=1,
                                   l1_ratio=None, max_iter=100,
                                   multi_class='auto', n_jobs=None,
                                   penalty='l2', random_state=None,
                                   solver='lbfgs', tol=0.0001, verbose=0,
                                   warm_start=False),
      min_features_to_select=1, n_jobs=1, scoring='roc_auc', step=1, verbose=2)

In [73]:
vars_wrapper = pd.DataFrame(sorted(zip(map(lambda x: round(x), rfecv.ranking_), data.columns)),
                            columns = ['ranking', 'variable'])

In [74]:
vars_wrapper[vars_wrapper['ranking'] ==1]

Unnamed: 0,ranking,variable
0,1,address_count_0_by_3
1,1,address_count_0_by_7
2,1,address_count_1
3,1,address_count_14
4,1,address_count_3
5,1,address_count_30
6,1,address_count_7
7,1,fulladdress_count_0_by_3
8,1,fulladdress_count_0_by_30
9,1,fulladdress_count_0_by_7


In [28]:
vars_wrapper2 = vars_wrapper[:50]

In [29]:
data = df[list(vars_wrapper2['variable'].values)]

In [30]:
model = DecisionTreeClassifier(max_depth = 6, class_weight = 'balanced')
rfecv = RFECV(estimator=model, step=1, cv=2, verbose=2, n_jobs=1, scoring="roc_auc")
rfecv.fit(data, Y)

Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 fe

RFECV(cv=2,
      estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced',
                                       criterion='gini', max_depth=6,
                                       max_features=None, max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       presort='deprecated', random_state=None,
                                       splitter='best'),
      min_features_to_select=1, n_jobs=1, scoring='roc_auc', step=1, verbose=2)

In [31]:
vars_final = pd.DataFrame(sorted(zip(map(lambda x: round(x), rfecv.ranking_), data.columns)),
                            columns = ['ranking', 'variable'])

In [32]:
vars_final = vars_final[:30]

In [33]:
#vars_final.to_csv('final_vars.csv')

In [34]:
vars_final

Unnamed: 0,ranking,variable
0,1,fulladdress_count_30
1,1,homephone_count_7
2,1,num_of_fulladdress_for_each_ssn
3,1,num_of_ssn_for_each_fulladdress
4,2,ssn_dob_day_since
5,3,fulladdress_count_7
6,4,num_of_name_dob_for_each_fulladdress_homephone
7,5,num_of_homephone_for_each_name_dob
8,6,ssn_name_dob_day_since
9,7,fulladdress_homephone_count_30
