In [1]:
import pandas as pd

### Data Loading

In [2]:
# Read the 31 daily transaction files for January 2017 (JSON-lines format) and
# stack them into one frame. A dot is printed before each file as a progress
# indicator. pd.concat is called without ignore_index, so each file's own row
# labels are kept and index values repeat across days.
dfList = []
for day in range(1, 32):
    print(".", end='', flush=True)
    daily_path = "./data/fraud-data/2017-01-{0:02d}.txt".format(day)
    daily_frame = pd.read_json(daily_path, lines=True)
    dfList.append(daily_frame)
print("done!")
df = pd.concat(dfList)

...............................done!


In [3]:
# Class balance of the target column, then a peek at the first raw rows.
label_counts = df["fraudLabel"].value_counts()
print(label_counts)
df.head()

0    30069
1      931
Name: fraudLabel, dtype: int64


Unnamed: 0,basket,fraudLabel,totalAmount,transactionId,zipCode
0,"[0, 0, 0, 3]",0,44,2809246745,2835
1,"[3, 2, 1, 1, 1]",0,290,8092680577,5147
2,"[1, 1]",0,92,7628460010,1570
3,"[3, 3, 5, 3, 0]",0,480,6041993990,2259
4,"[4, 1, 1, 4]",0,248,6572762316,1962


### Feature Engineering

In [4]:
# For every value 0..5, count how often it occurs in each row's basket list and
# store the result in columns c_0 .. c_5 (created in that order).
for code in range(6):
    # `code=code` binds the current value into the lambda explicitly.
    df['c_{}'.format(code)] = df.basket.map(lambda items, code=code: items.count(code))

In [5]:
df.head()

Unnamed: 0,basket,fraudLabel,totalAmount,transactionId,zipCode,c_0,c_1,c_2,c_3,c_4,c_5
0,"[0, 0, 0, 3]",0,44,2809246745,2835,3,0,0,1,0,0
1,"[3, 2, 1, 1, 1]",0,290,8092680577,5147,0,3,1,1,0,0
2,"[1, 1]",0,92,7628460010,1570,0,2,0,0,0,0
3,"[3, 3, 5, 3, 0]",0,480,6041993990,2259,1,0,0,3,0,1
4,"[4, 1, 1, 4]",0,248,6572762316,1962,0,2,0,0,2,0


In [6]:
# Encode zipCode as a categorical over the full 1000..9999 range so that
# get_dummies emits one indicator column per possible code (9000 columns),
# including codes that never occur in the data. Values outside the range
# become missing and get an all-zero indicator row.
#
# The categorical is built explicitly with pd.Categorical: the older
# `Series.astype('category', categories=...)` spelling was deprecated and later
# removed from pandas, while this form works on old and new versions alike.
# Assigning the Categorical is positional, so df's index is left untouched and
# `dummies` keeps exactly the same index as `df`.
df["zipCode"] = pd.Categorical(df["zipCode"], categories=list(range(1000, 10000)))
dummies = pd.get_dummies(df.zipCode)

  if __name__ == '__main__':


In [7]:
# Sanity check: one row per transaction, one column per zip-code category.
dummies.shape

(31000, 9000)

In [8]:
# Append the zip-code indicator columns to the right of the transaction frame.
# Both frames carry the same index because `dummies` was derived from `df`.
df2 = pd.concat([df, dummies], axis="columns")

In [9]:
# Remove the columns that are not fed to the models: the raw basket list, the
# zip code (already expanded into indicator columns) and the transaction id.
unused_columns = ["basket", "zipCode", "transactionId"]
df3 = df2.drop(columns=unused_columns)

In [10]:
# Preview the modelling frame: label, amount, basket counts, zip indicators.
df3.head()

Unnamed: 0,fraudLabel,totalAmount,c_0,c_1,c_2,c_3,c_4,c_5,1000,1001,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0,44,3,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,290,0,3,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,92,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,480,1,0,0,3,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,248,0,2,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Model training

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; random_state is fixed so the split is
# the same on every run.
train, test = train_test_split(df3, test_size=0.3, random_state=0)
# Optional: persist the split as pickle files (left disabled).
# train.to_pickle("./data/train.pickle")
# test.to_pickle("./data/test.pickle")
train.head()

Unnamed: 0,fraudLabel,totalAmount,c_0,c_1,c_2,c_3,c_4,c_5,1000,1001,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
281,0,204,1,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57,0,200,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
273,0,240,2,0,1,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
725,0,105,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
55,0,115,0,1,0,1,1,2,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Split each partition into features (X) and target (y).
# The label is excluded by name instead of by position (`columns[1:]`): the
# positional form silently assumes `fraudLabel` is the first column, and would
# leak the label into the features if the column order ever differs.
label_col = "fraudLabel"
feature_cols = [col for col in train.columns if col != label_col]
X_train = train[feature_cols]
y_train = train[label_col]
X_test = test[feature_cols]
y_test = test[label_col]
X_train.shape

(21700, 9007)

In [13]:
from sklearn.linear_model import LogisticRegression
# Baseline: logistic regression with scikit-learn's default settings.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so
# the stored notebook does not show this fit completing; later cells that use
# `logreg` need it to finish.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient-boosted trees with default hyper-parameters.
# random_state is pinned (as it is for the train/test split) so that repeated
# runs of the notebook produce the same model and the same scores.
gbt = GradientBoostingClassifier(random_state=0)
gbt.fit(X_train, y_train)

### Model evaluation

In [None]:
# Class-probability estimates of both models on the held-out test features.
# Later cells read column 1 of these arrays as the fraud score.
test_scores_reg = logreg.predict_proba(X_test)
test_scores_gbt = gbt.predict_proba(X_test)
print(test_scores_reg)

In [None]:
# NOTE(review): this cell recomputes the same two arrays as the cell directly
# above it (minus the print); it looks redundant and could probably be removed.
test_scores_reg = logreg.predict_proba(X_test)
test_scores_gbt = gbt.predict_proba(X_test)



In [None]:
from sklearn import metrics


def _roc_summary(scores):
    """Return (fpr, tpr, auc) of the test labels against column 1 of `scores`."""
    fpr, tpr, _thresholds = metrics.roc_curve(y_test, scores[:, 1])
    return fpr, tpr, metrics.auc(fpr, tpr)


# ROC curve points and area under the curve for each model on the test set.
fpr_reg, tpr_reg, auc_reg = _roc_summary(test_scores_reg)
fpr_gbt, tpr_gbt, auc_gbt = _roc_summary(test_scores_gbt)

In [None]:
# Save the test labels together with both models' fraud scores (column 1 of
# predict_proba) to CSV.
# The frame gets its own name instead of rebinding `df`: `df` holds the loaded
# transactions that the earlier feature-engineering cells read, and overwriting
# it would make those cells fail when re-run.
predictions_test = pd.DataFrame(data={
    'label': y_test,
    'predReg': test_scores_reg[:, 1],
    'predGbt': test_scores_gbt[:, 1],
})
predictions_test.to_csv("./data/predictions.csv", index=False)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.figure()
plt.plot(fpr_reg, tpr_reg, color = 'blue', label = "reg:{:.2f}".format(auc_reg))
plt.plot(fpr_gbt, tpr_gbt, color = 'green', label = "gbt:{:.2f}".format(auc_gbt))
plt.legend()
plt.grid()
plt.show()

### Save predictions on train data for modelCalibration

In [None]:
# Class-probability estimates of both models on the training features
# (the same rows the models were fitted on).
train_scores_reg = logreg.predict_proba(X_train)
train_scores_gbt = gbt.predict_proba(X_train)

In [None]:
# Save the training labels together with both models' fraud scores (column 1
# of predict_proba) to CSV.
# The frame gets its own name instead of rebinding `df`: `df` holds the loaded
# transactions that the earlier feature-engineering cells read, and overwriting
# it would make those cells fail when re-run.
predictions_train = pd.DataFrame(data={
    'label': y_train,
    'predReg': train_scores_reg[:, 1],
    'predGbt': train_scores_gbt[:, 1],
})
predictions_train.to_csv("./data/predictions_train.csv", index=False)

In [None]:
# Persist the fitted classifiers with joblib.
# - The standalone `joblib` package is tried first; `sklearn.externals.joblib`
#   is kept only as a fallback for old scikit-learn releases that still bundle
#   it (newer releases no longer ship that module).
# - NOTE(review): the original snippet was pasted with `>>>` prompts and dumped
#   an undefined name `clf` to the placeholder path 'filename.joblib'. Saving
#   both trained models under ./data/ is an assumption; confirm which model(s)
#   and file names are actually wanted.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

for model_name, model in (("logreg", logreg), ("gbt", gbt)):
    joblib.dump(model, "./data/{}.joblib".format(model_name))