In [1]:
import os
import pandas as pd
import numpy as np
import sklearn
import sklearn.preprocessing
import sklearn.model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

In [3]:
#load the data
fraud=pd.read_csv('Fraud_Data.csv')
country_ip=pd.read_csv('IpAddress_to_Country.csv')

In [4]:
#change time format
fraud['purchase_time']=pd.to_datetime(fraud['purchase_time'])
fraud['signup_time']=pd.to_datetime(fraud['signup_time'])

In [5]:
#create a new variable that measures the time between account creation and purchase
fraud['time_to_purchase']=fraud['purchase_time']-fraud['signup_time']
fraud['time_to_purchase']=fraud['time_to_purchase'].dt.seconds

In [6]:
#convert IP addresses to their country of origin
country_id=[]
for i in fraud['ip_address']:
    country_id.append(country_ip['country'][(i>=country_ip['lower_bound_ip_address']) \
                                            & (i<=country_ip['upper_bound_ip_address'])])

In [7]:
#convert Pandas Series from the previous step into list
country_id_final=[]
for i in range(0, len(country_id)):
    if country_id[i].empty==False:
        country_id_final.append(country_id[i].iloc[0])
    else:
        country_id_final.append(np.nan)
fraud['country']=country_id_final

In [8]:
#create lists of the number of times devices and IP addresses were associated with purchases
fraud['device_usage']=fraud['device_id'].map(fraud['device_id'].value_counts())
fraud['ip_usage']=fraud['ip_address'].map(fraud['ip_address'].value_counts())

In [10]:
#choose which columns to use in our dataset
cols_to_use=['purchase_value', 'source', 'browser', 'sex', 'age', 'time_to_purchase', 'country', 'device_usage', 'ip_usage']

In [11]:
#switch to numerical IDs of the values we want to use
for i in cols_to_use:
    if fraud[i].dtype=='object':
        fraud[i+'_codes']=pd.Categorical(fraud[i]).codes
        cols_to_use[cols_to_use.index(i)]=i+'_codes'

In [12]:
#create test and train datasets
train,test=sklearn.model_selection.train_test_split(fraud, test_size=.2, stratify=fraud['class'])

In [13]:
#partition testing and training datasets for modelling
train_X=train[cols_to_use]
train_Y=train['class']
test_X=test[cols_to_use]
test_Y=test['class']

In [14]:
#try a logistic regression
model=LogisticRegression()
fit=model.fit(train_X, train_Y)

In [15]:
lreg_prediction=fit.predict(test_X)

In [16]:
#check the confusion matrix
pd.DataFrame(sklearn.metrics.confusion_matrix(test_Y, lreg_prediction), \
             columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])

Unnamed: 0,Pred -,Pred +
Actual -,27283,110
Actual +,1392,1438


In [17]:
#check logistic regression coefficients
print('Logistic Regression Coefficients')
for i in range(0, len(test_X.keys())): print(fit.coef_[0][i], test_X.keys()[i])

Logistic Regression Coefficients
-0.0126433114369 purchase_value
-0.00681250616596 source_codes
-0.00990322503985 browser_codes
-0.00282149388985 sex_codes
-0.0535014846943 age
-2.09611682956e-05 time_to_purchase
-0.00228060037724 country_codes
0.191836930879 device_usage
0.180113785283 ip_usage


In [18]:
#try a random forest prediction
model=RandomForestClassifier(150, oob_score=True, n_jobs=-1)
fit=model.fit(train_X, train_Y)
RF_prediction=fit.predict(test_X)

In [19]:
#check the confusion matrix again
pd.DataFrame(sklearn.metrics.confusion_matrix(test_Y, RF_prediction), \
             columns=['Pred -', 'Pred +'], index=['Actual -', 'Actual +'])

Unnamed: 0,Pred -,Pred +
Actual -,27349,44
Actual +,1298,1532


In [20]:
#check the Random Forest Coefficients
print('Random Forest Feature Weights')
for i in range(0, len(test_X.keys())): print(fit.feature_importances_[i], test_X.keys()[i])

Random Forest Feature Weights
0.0982602030595 purchase_value
0.0117631246508 source_codes
0.0232715143556 browser_codes
0.0103312459691 sex_codes
0.0814820919614 age
0.392400417877 time_to_purchase
0.0515329735745 country_codes
0.15665205883 device_usage
0.174306369722 ip_usage
