In [1]:
# Importing necessary files

import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.preprocessing import LabelEncoder

In [2]:
# All the paths to be used in the notebook

transactions_path = './transactions.csv'
fraudsters_path = './fraudsters.csv'
users_path = './users.csv'
new_fraud_file_path = './updated_fraudsters.csv'

In [3]:
# Read all data

transactions_df = pd.read_csv(transactions_path)
fraudsters_df = pd.read_csv(fraudsters_path)
users_df = pd.read_csv(users_path)

In [4]:
# Encode labels for categorical data -> Assigns numbers to categories

le = LabelEncoder()
transactions_df['CURRENCY'] = le.fit_transform(transactions_df['CURRENCY'].astype('str'))
transactions_df['STATE'] = le.fit_transform(transactions_df['STATE'].astype('str'))
transactions_df['MERCHANT_CATEGORY'] = le.fit_transform(transactions_df['MERCHANT_CATEGORY'].astype('str'))
transactions_df['ENTRY_METHOD'] = le.fit_transform(transactions_df['ENTRY_METHOD'].astype('str'))
transactions_df['TYPE'] = le.fit_transform(transactions_df['TYPE'].astype('str'))
transactions_df['SOURCE'] = le.fit_transform(transactions_df['SOURCE'].astype('str'))
transactions_df['MERCHANT_COUNTRY'] = le.fit_transform(transactions_df['MERCHANT_COUNTRY'].astype('str'))

In [5]:
# Extract more features from dates
"""
1. Transaction's time during the day (create a relation between time of day and fraud i.e. 
                                    fraud could happen more often around midnight)
2. Transaction's day during the week (frequent during the week, at the start or over the weeknd)
3. Transaction's month (time of year where fraud might be frequent)
"""
from datetime import datetime
#date = lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S:%f")
month = lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S.%f" ).month
week_day = lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S.%f" ).weekday()
times_of_day = [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7 ]
time_of_day = lambda x: times_of_day[datetime.strptime(x, "%Y-%m-%d %H:%M:%S.%f").hour]

transactions_df['month'] = transactions_df['CREATED_DATE'].map(month)
transactions_df['day_of_week'] = transactions_df['CREATED_DATE'].map(week_day)
transactions_df['time_of_day'] = transactions_df['CREATED_DATE'].map(month)

In [6]:
# Merge fraudsters data with the transactions they have done to derive a pattern for each transaction
fraud_transactions = pd.merge(fraudsters_df, transactions_df, left_on = 'user_id', right_on = 'USER_ID', how='left')

In [7]:
fraud_transactions.head()

Unnamed: 0,Unnamed: 0_x,user_id,Unnamed: 0_y,CURRENCY,AMOUNT,STATE,CREATED_DATE,MERCHANT_CATEGORY,MERCHANT_COUNTRY,ENTRY_METHOD,USER_ID,TYPE,SOURCE,ID,month,day_of_week,time_of_day
0,0,5270b0f4-2e4a-4ec9-8648-2135312ac1c4,,,,,,,,,,,,,,,
1,1,848fc1b1-096c-40f7-b04a-1399c469e421,599236.0,9.0,59700.0,1.0,2018-06-29 12:34:41.413000,73.0,362.0,5.0,848fc1b1-096c-40f7-b04a-1399c469e421,4.0,8.0,062feaed-978a-4e04-b167-accf58d4f51e,6.0,4.0,6.0
2,1,848fc1b1-096c-40f7-b04a-1399c469e421,600652.0,9.0,10000.0,1.0,2018-06-29 16:22:12.830000,73.0,362.0,5.0,848fc1b1-096c-40f7-b04a-1399c469e421,1.0,8.0,0baebd76-a069-46a0-b566-2b3c98779415,6.0,4.0,6.0
3,1,848fc1b1-096c-40f7-b04a-1399c469e421,608847.0,9.0,10000.0,1.0,2018-07-01 13:56:12.490000,73.0,362.0,5.0,848fc1b1-096c-40f7-b04a-1399c469e421,1.0,2.0,a40d8b2a-8f54-4422-b8e3-9d6fc485b6a6,7.0,6.0,7.0
4,1,848fc1b1-096c-40f7-b04a-1399c469e421,609296.0,9.0,10000.0,1.0,2018-07-02 20:02:34.118000,73.0,362.0,5.0,848fc1b1-096c-40f7-b04a-1399c469e421,1.0,8.0,cb6d9f7c-f53b-40c7-9cde-e9925bb2f704,7.0,0.0,7.0


In [8]:
# After feature engineering, extract the columns needed for training

X_train = fraud_transactions[['CURRENCY', 'AMOUNT', 'STATE', 'MERCHANT_CATEGORY', 'MERCHANT_COUNTRY', 'ENTRY_METHOD', \
                             'TYPE', 'SOURCE', 'month', 'day_of_week', 'time_of_day']]

In [9]:
X_train.shape

(14544, 11)

In [10]:
X_train = X_train.dropna()

In [11]:
# Since this is one class classification, OneClassSVM is perfect for this situation, derives a pattern for fraud transactions
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(X_train)

OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma=0.1, kernel='rbf',
      max_iter=-1, nu=0.1, random_state=None, shrinking=True, tol=0.001,
      verbose=False)

In [12]:
y_train = clf.predict(X_train)

In [13]:
np.array(np.unique(y_train, return_counts=True)).T

array([[   -1,  4226],
       [    1, 10317]])

In [14]:
X_test = transactions_df[['CURRENCY', 'AMOUNT', 'STATE', 'MERCHANT_CATEGORY', 'MERCHANT_COUNTRY', 'ENTRY_METHOD', \
         'TYPE', 'SOURCE', 'month', 'day_of_week', 'time_of_day']]

In [15]:
y_test = clf.predict(X_test)

In [16]:
all_users = transactions_df['USER_ID']

In [18]:
# Save all the users whose transactions were tagged by the SVM as fraudster
new_fraud_users = []
for i, user in enumerate(all_users):
    if y_test[i] == 1:
        new_fraud_users.append(user)

In [19]:
new_fraud_users = list(set(new_fraud_users))

In [20]:
len(new_fraud_users)

6139

In [21]:
new_fraud_users_df = pd.DataFrame(new_fraud_users)

In [22]:
# Save the data to a csv file 
new_fraud_users_df.to_csv(new_fraud_file_path)