In [1]:
#import packages
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
tips = sns.load_dataset("tips")
palette = sns.color_palette()
import pandas as pd
import numpy as np
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from itertools import cycle
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix
from scipy import interp
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from scipy.stats import ttest_ind
from bisect import bisect

In [51]:
#import fraud data
url = 'Fraud_Data.csv'
df = pd.read_csv(url)
nr_datarows, nr_datacols = df.shape
print 'nr_datarows = %s, nr_datacols = %s' % (nr_datarows, nr_datacols)
#import ip address to country data
url = 'IpAddress_to_Country.csv'
ipdata = pd.read_csv(url)
ipdata = ipdata.as_matrix()
nr_ipdatarows = ipdata.shape[0]
print 'nr_ipdatarows = %s' % nr_ipdatarows
ip_low = ipdata[:,0].astype(float)
ip_hi = ipdata[:,1].astype(float)
ip_low = np.r_[ip_low, ip_hi[-1]]
ip_country = ipdata[:,2].astype(str)
# check sorted
print 'It is %r that ipdata is sorted' % all(ip_low[i] < ip_low[i+1] and ip_hi[i] < ip_hi[i+1] for i in range(nr_ipdatarows - 1))
print 'It is %r that user_id differs for every row' % (len(df['user_id']) == len(set(df['user_id'])))
df = df.drop('user_id', axis=1)
set_devices = set(df['device_id'])
print 'nr_devices = %r' % len(set_devices)
set_ip = set(df['ip_address'])
print 'nr_ip = %r' % len(set_ip)
print 'nr_non_fraud = %r' % sum(df['class'] == 0)
print 'nr_fraud = %r' % sum(df['class'])
print 'P(fraud) = %r' % (float(sum(df['class']))/nr_datarows)

# add country to table
country = []
for ip in df['ip_address']:
    ip_row = bisect(ip_low, ip) - 1
    if ip_row == nr_ipdatarows or ip > ip_hi[ip_row]:
        country.append('None')
    else:
        country.append(ip_country[ip_row])
df['country'] = country

#create new features
#time since signup
df['time'] = (pd.to_datetime(df['purchase_time'])-pd.to_datetime(df['signup_time']))/np.timedelta64(1, 'h')
df = df.drop('purchase_time', axis=1)
df = df.drop('signup_time', axis=1)

# nr_users per device
nr_device_users = {}
for index, row in df.iterrows():
    dev_id = row['device_id']
    if dev_id not in nr_device_users:
        nr_device_users[dev_id] = 0
    nr_device_users[dev_id] += 1
nr_users = []
for dev_id in df['device_id']:
    nr_users.append(nr_device_users[dev_id])
df['nr_users'] = nr_users
nr_device_users.clear()

# drop variables not used for classification
df = df.drop('device_id', axis=1)
df = df.drop('source', axis=1)
df = df.drop('browser', axis=1)
df = df.drop('ip_address', axis=1)

# change categorical to integer
df['sex'] = df['sex'].astype('category')
df['sex'] = df['sex'].cat.codes
df['country'] = df['country'].astype('category')
df['country'] = df['country'].cat.codes

print df.dtypes

df.head()

nr_datarows = 151112, nr_datacols = 11
nr_ipdatarows = 138846
It is True that ipdata is sorted
It is True that user_id differs for every row
nr_devices = 137956
nr_ip = 143512
nr_non_fraud = 136961
nr_fraud = 14151
P(fraud) = 0.09364577267192546


Unnamed: 0,purchase_value,sex,age,class,country,time,nr_users
0,34,1,39,0,84,1251.856111,1
1,16,0,53,0,172,4.984444,1
2,15,1,53,1,172,0.000278,12
3,44,1,41,0,124,136.690278,1
4,39,1,45,0,172,1211.516944,1


In [57]:
df.groupby('class').mean()

Unnamed: 0_level_0,purchase_value,sex,age,country,time,nr_users
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,36.929418,0.583078,33.122356,122.1448,1441.994052,1.120071
1,36.993004,0.596,33.318281,122.557628,673.289542,7.145926


<b> 
Split Data 
</b>

I randomly split data into independent training and test sets. The split was 70/30 between the training and test sets.

In [None]:
Y = df['class']
X = df.drop(['class'], axis=1)
X = X.as_matrix()

#split data on unique ids
rs = ShuffleSplit(n_splits=2, test_size=.30, random_state=0)
for train_index, test_index in rs.split(X):
            X_train, X_test = X[train_index,:], X[test_index,:]
            Y_train, Y_test = Y[train_index], Y[test_index]


<b>
Model selection
</b>

I built two models: logistic regression + lasso and random forest. I changed the class weights to account for the imbalanced data set.

The area under the curve for logistic regression + lasso (AUC = 0.77) is higher than that of random forest (AUC = 0.77). The two models perform similarly.

<br>
<b>
With more time, I would do the following to improve model performance.
</b>

1) Optimize parameters <br><br>
2) Feature selection for random forest <br><br>
3) Test more models <br><br>

In [None]:
#Logistic Regression + Lasso 
clf = linear_model.LogisticRegression(C=1, penalty='l1', tol=1e-6, class_weight='balanced')
tmp = clf.fit(X_train, Y_train)
clf_y_score = clf.fit(X_train, Y_train).predict_proba(X_test)[:, 1]
y_pred = clf.predict(X_test)
auc = roc_auc_score(Y_test, y_pred)
cm = confusion_matrix(Y_test, y_pred)
print("Logistic Regression + Lasso")
print("AUC: "+str(auc))
print("\nConfusion Matrix: ")
print(cm)


In [None]:
#Random Forest
rf = RandomForestClassifier(class_weight='balanced')
tmp = rf.fit(X_train, Y_train)
rf_y_score = rf.fit(X_train, Y_train).predict_proba(X_test)[:, 1]
y_pred = rf.predict(X_test)
auc = roc_auc_score(Y_test, y_pred)
cm = confusion_matrix(Y_test, y_pred)
print("Random Forest")
print("AUC: "+str(auc))
print("\nConfusion Matrix: ")
print(cm)


<b>Feature Importance </b>

The most important variable for predicting fraud is the source. It may be that some sources are easier to commit fraud through. It may be useful to have different thresholds for declaring fraud for different sources.
<br><br>
The second most important variables for predicting fraud are the number of transactions from the same ip address (with different user ids). This is also correlated with device id (with different user ids). The average number of these transactions for those flagged as fradulent is significant higher than that of the non-flagged grouped according to a t-test with a p-value of less than 0.05. 


In [None]:
#get variable importance from random forest
importance = [abs(x) for x in clf.coef_[0]]
importance = list(zip(list(df.columns[1:]),importance))
importance = sorted(importance, key=lambda x: x[1])
importance = importance[::-1]
print('Importance:')
for imp in importance:
    print(str(imp[0])+' = '+str(imp[1]))

#test significance
dfsplit = [rows for _, rows in df.groupby('class')]
t, p = ttest_ind(dfsplit[0]['ip_count'], dfsplit[1]['ip_count'], equal_var=False)
print('\n\n==========================\n\n')
print("T-Test:")
print("p: "+str(p))
print("\n\n")

#plot most important variables
plt.figure()
sns.barplot(x="class", y="ip_count", data=df)
sns.plt.title('Transaction Frequency from Same IP Address')
plt.show()
plt.figure()


In [None]:
#plot ROC curve
clf_fpr, clf_tpr, clf_auc = roc_curve(Y_test, clf_y_score)
rf_fpr, rf_tpr, rf_auc = roc_curve(Y_test, rf_y_score)

plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(clf_fpr, clf_tpr, 'r', label='Logistic + Lasso')
plt.plot(rf_fpr, rf_tpr, 'b', label='Random Forest')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()