# Loading Dataset

In [2]:
import numpy as np 
import pandas as pd 

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
%%time
train_transaction = pd.read_csv('train_transaction.csv', index_col='TransactionID')
test_transaction = pd.read_csv('test_transaction.csv', index_col='TransactionID')
train_identity = pd.read_csv('train_identity.csv', index_col='TransactionID')
test_identity = pd.read_csv('test_identity.csv', index_col='TransactionID')
print ("Data is loaded!")

FileNotFoundError: [Errno 2] No such file or directory: 'train_transaction.csv'

In [5]:
train_transaction.head()

NameError: name 'train_transaction' is not defined

In [22]:
len(np.unique(train_transaction['card1']))

13553

In [25]:
train_transaction.shape

(590540, 393)

In [27]:
# avg. amount of transactions per card

train_transaction.shape[0] / len(np.unique(train_transaction['card1']))

43.572640743746774

In [7]:
train_identity.head()

Unnamed: 0_level_0,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987004,0.0,70787.0,,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
2987008,-5.0,98945.0,,,0.0,-5.0,,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
2987011,-5.0,221832.0,,,0.0,-6.0,,,,,...,chrome 62.0,,,,F,F,T,T,desktop,
2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,0.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS


In [28]:
# Merge Datasets on TransactionID

train_df = pd.merge(train_transaction,train_identity,how="left",on="TransactionID")
test_df = pd.merge(test_transaction,test_identity,how="left",on="TransactionID")

# Modeling

In [29]:
from sklearn import preprocessing
import warnings
warnings.simplefilter('ignore')
from sklearn.linear_model import LogisticRegression

In [31]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.calibration import CalibratedClassifierCV

from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

In [32]:
train_num_df = train_df.select_dtypes(include=['float64', 'int64'])

In [33]:
# creating target and feature variables 

y_train = train_df['isFraud']

# select feature by type
X_train = train_num_df.drop('isFraud', axis=1)

In [34]:
y_train.unique()

array([0, 1], dtype=int64)

## Create smaller dataset for investigation 

In [35]:
# Create a smaller dataset for investigation purpose only
sample_size = 20000

X_small = X_train.sample(sample_size, random_state=0)
y_small = y_train.sample(sample_size, random_state=0)

## Simple Model (first iteration)

In [36]:
# create Basemodel: SGDClassifier Logistic Regression 

base_model = SGDClassifier(loss='log', alpha=0.5, class_weight='balanced')
cv_results_base_model = cross_validate(base_model, X_small, y_small, cv=5, n_jobs=1, scoring=['recall', 'f1_macro'])
cv_results_base_model['test_f1_macro'].mean()

nan

In [41]:
# Logistic Regression Model 

log_model = LogisticRegression(class_weight='balanced')
cv_results_log_model = cross_val_score(log_model, X_small, y_small, cv=5, scoring='recall')
cv_results_log_model

array([nan, nan, nan, nan, nan])

## Feature Permutation 

In [None]:
# Base Model 

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Define X and y
X = train_df.drop('isFraud')
y = train_df['isFraud']

# split dataset 

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3,random_state=2)

log_model = LogisticRegression()

log_model.fit(X_train,y_train)

log_model.score(X_test,y_test)


# Permutation 

from sklearn.inspection import permutation_importance

permutation_score = permutation_importance(log_model, X_train, y_train, n_repeats=100)

np.vstack((X.columns, permutation_score.importances_mean)).T

## Feature Engineering 

### New feature: day of the week 

In [None]:
# create new feature day of the week, encoded as 0-6 
# found a good offset is 0.58

def make_day_feature(df, offset=0, column_name='TransactionDT'):

    days = df[tname] / (3600*24)        
    encoded_days = np.floor(days-1+offset) % 7
    return encoded_days



### New feature: hour of the day

In [None]:
# create new feature hour of the day, encoded as 0-23

def make_hour_feature(df, column_name='TransactionDT'):

    hours = df[column_name] / (3600)        
    encoded_hours = np.floor(hours) % 24
    return encoded_hours

In [None]:
# visualize periodicity: number of transactions per time interval 

vals = plt.hist(train['TransactionDT'] / (3600*24), bins=1800)
plt.xlim(70, 78)
plt.xlabel('Days')
plt.ylabel('Number of transactions')
plt.ylim(0,1000)

In [None]:
# create new feature weekday
# offset to define start of the day: 0.85

train_df['weekday'] = make_day_feature(train_df, offset=0.58)

plt.plot(train_df.groupby('weekday').mean()['isFraud'])

plt.ylim(0, 0.04)
plt.xlabel('Encoded day')
plt.ylabel('Fraction of fraudulent transactions')

# outputs fraction of fraudulent transactions per weekday 

In [None]:
# create a feature which encodes the (relative) hour of the day

train_df['hours'] = make_hour_feature(train_df)

plt.plot(train.groupby('hours').mean()['isFraud'], color='k')

ax = plt.gca()
ax2 = ax.twinx()
_ = ax2.hist(train['hours'], alpha=0.3, bins=24)
ax.set_xlabel('Encoded hour')
ax.set_ylabel('Fraction of fraudulent transactions')

ax2.set_ylabel('Number of transactions')

In [None]:
# https://thispointer.com/pandas-find-duplicate-rows-in-a-dataframe-based-on-all-or-selected-columns-using-dataframe-duplicated-in-python/#:~:text=To%20find%20%26%20select%20the%20duplicate,argument%20is%20'first').
# Select all duplicate rows based on multiple column names in list
# duplicateRowsDF = dfObj[dfObj.duplicated(['Age', 'City'])]


## Advanced Models 

In [None]:
models = []

models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('SVM', SVC()))
models.append(('XGB', XGBClassifier()))
models.append(('RF', RandomForestClassifier()))

#testing models

results = []
names = []

for name, model in models:
    kfold = KFold(n_splits=10, random_state=42)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='roc_auc')
    results.append(cv_results)
    names.append(name)
    msg = '%s: %f (%f)' % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Simple Decision Tree 

from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(class_weight='balanced')
cv_result_dt_model = cross_val_score(dt_model, X_train, y_train, cv=3, scoring=['recall', 'f1_macro'])

### Random Forest Classifier 

In [None]:
# Simple Random Forest 

from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(class_weight='balanced', random_state=0)
cv_results_forest = cross_validate(forest, X_train, y_train, cv=5, scoring='f1_macro')
print(cv_results_forest['test_score'].mean())

In [None]:
forest.fit(X_train, y_train)

In [None]:
# Do we overfit on our train set?
f1_score(forest.predict(X_train_small), y_train_small, average='macro')

In [None]:
print(classification_report(forest.predict(X_test_small), y_test_small))

Hyperparameter Tuning 

In [None]:
# Wide RandomSearch

model = RandomForestClassifier(class_weight='balanced')

search_space = {'n_estimators': [int(x) for x in np.linspace(50, 1000, num=20)],
                'max_depth': [int(x) for x in np.linspace(10, 100, num=10)] + [None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'criterion': ['gini'],
                'bootstrap': [True, False]
                }

cv_model = RandomizedSearchCV(model,
                              scoring='f1_macro',
                              param_distributions=search_space,
                              n_jobs=-1,
                              cv=3,
                              n_iter=30,
                              verbose=1)

search = cv_model.fit(X_train_small, y_train_small)

In [None]:
search.best_score_

In [None]:
search.best_params_

In [None]:
best_forest = RandomForestClassifier(**{'n_estimators': 300, 'max_depth': 15, 'bootstrap': False})
cross_validate(best_forest, X_train_small, y_train_small, cv=5, scoring='f1_macro')['test_score'].mean()

In [None]:
# Final Evaluation!
print(classification_report(best_forest.fit(X_train_small, y_train_small).predict(X_test_small), y_test_small))

### Naive Bayes