In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import seaborn as sns

In [2]:
from project_fraud.lib import drop_many_missing_values


In [3]:
# Load the working DataFrame through the project helper.
# NOTE(review): judging by its name the helper presumably drops columns with many
# missing values — confirm against project_fraud.lib.
data = drop_many_missing_values()

In [4]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Convert the e-mail domain column

In [5]:
# Lookup table that groups raw e-mail domains into coarser provider buckets.
# It is applied with Series.map below, so any domain that is not a key here
# ends up as NaN in the derived column.
emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 
          'scranton.edu': 'other', 'optonline.net': 'other', 'hotmail.co.uk': 'microsoft',
          'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo',
          'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 
          'aim.com': 'aol', 'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink',
          'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 'gmx.de': 'other',
          'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 
          'protonmail.com': 'other', 'hotmail.fr': 'microsoft', 'windstream.net': 'other', 
          'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo', 'yahoo.de': 'yahoo',
          'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other',
          'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft',
          'verizon.net': 'yahoo', 'msn.com': 'microsoft', 'q.com': 'centurylink', 
          'prodigy.net.mx': 'att', 'frontier.com': 'yahoo', 'anonymous.com': 'other', 
          'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo', 
          'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 
          'bellsouth.net': 'other', 'embarqmail.com': 'centurylink', 'cableone.net': 'other', 
          'hotmail.es': 'microsoft', 'mac.com': 'apple', 'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 
          'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other', 'cox.net': 'other',
          'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}

# Derive two features from each e-mail domain column:
#   <col>_bin    - provider group looked up in `emails` (NaN when the domain is
#                  missing or not listed)
#   <col>_suffix - last dot-separated part of the domain (e.g. 'com', 'net', 'mx')
for c in ['P_emaildomain']:
    data[c + '_bin'] = data[c].map(emails)
    # na_action='ignore' leaves missing domains as NaN. Without it str(NaN) produces
    # the literal string 'nan', which dropna() and imputers no longer treat as missing.
    data[c + '_suffix'] = data[c].map(lambda x: str(x).split('.')[-1], na_action='ignore')

# New feature: day of the week

In [6]:
# Create a new feature: day of the week, encoded as 0-6.
# An offset of 0.58 was found to work well.

def make_day_feature(data, offset=0, column_name='TransactionDT'):
    """Encode a seconds-based time column as a day index in the range 0-6.

    Parameters
    ----------
    data : DataFrame-like
        Container indexed by column name; ``data[column_name]`` holds a time
        delta in seconds.
    offset : float, default 0
        Shift, in days, added before flooring.
    column_name : str, default 'TransactionDT'
        Name of the column to encode.

    Returns
    -------
    ``floor(seconds / 86400 - 1 + offset) % 7`` computed element-wise.
    """
    seconds_per_day = 3600 * 24
    elapsed_days = data[column_name] / seconds_per_day
    shifted_days = elapsed_days - 1 + offset
    return np.floor(shifted_days) % 7

# New feature: hour of the day

In [7]:
# create new feature hour of the day, encoded as 0-23

def make_hour_feature(data, column_name='TransactionDT'):
    """Encode a seconds-based time column as an hour index in the range 0-23.

    Parameters
    ----------
    data : DataFrame-like
        Container indexed by column name; ``data[column_name]`` holds a time
        delta in seconds.
    column_name : str, default 'TransactionDT'
        Name of the column to encode.

    Returns
    -------
    ``floor(seconds / 3600) % 24`` computed element-wise.
    """
    seconds_per_hour = 3600
    elapsed_hours = data[column_name] / seconds_per_hour
    return np.floor(elapsed_hours) % 24

# Create new features: weekday and hour of day

In [9]:
# Create the weekday feature (encoded 0-6).
# The offset of 0.58 days shifts the point at which one encoded day ends and the next begins.
data['weekday'] = make_day_feature(data, offset=0.58)

# Fraud rate per encoded weekday. 'isFraud' is selected before aggregating so that
# only that column is averaged; averaging the whole frame is wasteful and raises a
# TypeError on pandas >= 2.0 because the frame also holds non-numeric columns.
plt.plot(data.groupby('weekday')['isFraud'].mean())

plt.ylim(0, 0.04)
plt.xlabel('Encoded day')
plt.ylabel('Fraction of fraudulent transactions')

outputs fraction of fraudulent transactions per weekday

In [11]:
# Create a feature which encodes the (relative) hour of the day (0-23).
data['hours'] = make_hour_feature(data)

# Fraud rate per encoded hour. 'isFraud' is selected before aggregating so that
# only that column is averaged; averaging the whole frame is wasteful and raises a
# TypeError on pandas >= 2.0 because the frame also holds non-numeric columns.
plt.plot(data.groupby('hours')['isFraud'].mean(), color='k')

# Overlay the transaction volume per hour on a secondary y-axis.
ax = plt.gca()
ax2 = ax.twinx()
_ = ax2.hist(data['hours'], alpha=0.3, bins=24)
ax.set_xlabel('Encoded hour')
ax.set_ylabel('Fraction of fraudulent transactions')

ax2.set_ylabel('Number of transactions')

# New features on each credit card

In [18]:
def function(row):
    """Build a composite card identifier from the six card attribute columns.

    Parameters
    ----------
    row : mapping (for example one DataFrame row)
        Must provide the keys 'card1' through 'card6'.

    Returns
    -------
    str or float
        The six values converted with ``str`` and joined with '_', or ``np.nan``
        when any of them is missing.
    """
    card_columns = ('card1', 'card2', 'card3', 'card4', 'card5', 'card6')
    parts = [row[column] for column in card_columns]

    # Without all six attributes no identifier is built.
    if any(pd.isna(part) for part in parts):
        return np.nan

    # A separator is required: plain concatenation is ambiguous, e.g. 12 + 345.0 and
    # 123 + 45.0 would both yield "12345.0" and merge two different cards into one ID.
    return '_'.join(str(part) for part in parts)

In [None]:
# Build the composite card identifier row by row (NaN when any card attribute is missing).
data['cardID'] = data.apply(function, axis=1)

In [None]:
# Per-card summary statistics of the transaction amount, indexed by cardID.
# Named aggregation: each keyword is the output column, each tuple is (source column, aggfunc).
credit_cards = data.groupby('cardID').agg(
    mean=('TransactionAmt', 'mean'),
    min=('TransactionAmt', 'min'),
    max=('TransactionAmt', 'max'),
    median=('TransactionAmt', 'median'),
)

In [None]:
# Attach the per-card statistics to every transaction; rows whose cardID is NaN get NaN statistics.
# Note: `data` is rebound here, so running this cell a second time merges the statistics again
# and pandas then suffixes the duplicated column names.
data = data.merge(right=credit_cards, on='cardID', how='left')


# Create new features:
- Difference between the current transaction amount and the mean transaction amount of the same credit card
- Difference between the current transaction amount and the median transaction amount of the same credit card
- The difference from the card's mean, divided by that mean (relative distance)
- The difference from the card's median, divided by that median (relative distance)

In [None]:
def dist_from_mean(row, metric):
    """Signed difference between a row's transaction amount and a reference column.

    Parameters
    ----------
    row : mapping (for example one DataFrame row)
        Must provide 'TransactionAmt' and the key named by ``metric``.
    metric : str
        Name of the reference column (the notebook passes 'mean' and 'median').

    Returns
    -------
    ``row['TransactionAmt'] - row[metric]``, or ``np.nan`` when either value is missing.
    """
    amount = row['TransactionAmt']
    if pd.isna(amount):
        return np.nan

    reference = row[metric]
    if pd.isna(reference):
        return np.nan

    return amount - reference

In [None]:
# Signed distance of each transaction amount from its card's mean / median amount.
# Column arithmetic yields the same values as applying dist_from_mean to every row
# (a NaN in either operand propagates to NaN) but avoids a Python-level pass over
# each row, which is very slow on a large frame.
data['dist_mean'] = data['TransactionAmt'] - data['mean']
data['dist_median'] = data['TransactionAmt'] - data['median']

In [None]:
def dist_from_median_rel(row, metric):
    """Relative difference between a row's transaction amount and a reference column.

    Despite the name, the reference is whichever column ``metric`` names
    (the notebook passes both 'mean' and 'median').

    Parameters
    ----------
    row : mapping (for example one DataFrame row)
        Must provide 'TransactionAmt' and the key named by ``metric``.
    metric : str
        Name of the reference column.

    Returns
    -------
    ``(row['TransactionAmt'] - row[metric]) / row[metric]``, or ``np.nan`` when
    either value is missing.
    """
    amount = row['TransactionAmt']
    if pd.isna(amount):
        return np.nan

    reference = row[metric]
    if pd.isna(reference):
        return np.nan

    return (amount - reference) / reference

In [None]:
# Distance from the card's mean amount, relative to that mean.
# Column arithmetic gives the same values as applying dist_from_median_rel(row, 'mean')
# to every row (NaN propagates) without the slow row-wise apply; a zero mean produces
# inf/NaN instead of aborting the whole computation.
data['dist_mean_rel'] = (data['TransactionAmt'] - data['mean']) / data['mean']

In [None]:
# Distance from the card's median amount, relative to that median.
# Column arithmetic gives the same values as applying dist_from_median_rel(row, 'median')
# to every row (NaN propagates) without the slow row-wise apply; a zero median produces
# inf/NaN instead of aborting the whole computation.
data['dist_median_rel'] = (data['TransactionAmt'] - data['median']) / data['median']

In [None]:
# Inspect the first 20 rows, including the newly added feature columns.
data.head(20)

# IMPORT 

In [None]:
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer 
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LassoCV
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression

# IMPUTER AND ENCODING 

In [None]:
# Keep only the rows that have no missing value in any column.
data_new = data.dropna(axis=0, how='any')

In [None]:
# Check how many rows and columns remain after dropping rows with missing values.
data_new.shape

In [None]:
# Feature matrix and target taken from the full frame (rows with missing values included).
# NOTE(review): the next cell rebinds X and y from data_new, so this selection only
# takes effect when that cell is skipped.
X = data[[
    'TransactionID', 'P_emaildomain_suffix', 'P_emaildomain_bin',
    'card1', 'card2', 'addr1', 'TransactionAmt', 'card5',
    'D15', 'C13', 'D2', 'D10', 'D4',
    'TransactionDT', 'weekday', 'hours', 'cardID',
    'mean', 'max', 'median', 'min',
    'dist_mean', 'dist_median', 'dist_mean_rel', 'dist_median_rel',
]]
y = data['isFraud']

### New data without NaN

In [None]:
# Feature matrix and target taken from the NaN-free frame; this selection leaves out
# the e-mail, weekday, hours and cardID columns.
X = data_new[[
    'TransactionID',
    'card1', 'card2', 'addr1', 'TransactionAmt', 'card5',
    'D15', 'C13', 'D2', 'D10', 'D4',
    'TransactionDT',
    'mean', 'max', 'median', 'min',
    'dist_mean', 'dist_median', 'dist_mean_rel', 'dist_median_rel',
]]
y = data_new['isFraud']

In [None]:
#data_new.where(data_new == 'nan')

In [None]:
# Split the non-object columns of X by their share of missing values:
#   below 15 %         -> low_missing_num_cols
#   15 % to 60 %       -> medium_missing_num_cols
#   above 60 %         -> in neither list
# Note that the share is measured on `data`, not on X.
n = (X.dtypes != 'object')
num_cols = n[n].index.tolist()

missing_pct = {
    col: data[col].isnull().sum() * 100 / len(data[col])
    for col in num_cols
}
low_missing_num_cols = [col for col in num_cols if missing_pct[col] < 15]
medium_missing_num_cols = [col for col in num_cols if 15 <= missing_pct[col] <= 60]

In [None]:
# Numeric columns with few missing values: mean imputation, then standardisation.
num_transformer_low = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])
# Numeric columns with a moderate share of missing values: median imputation, then standardisation.
num_transformer_medium = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill gaps with an explicit 'Unknown' level, then one-hot encode.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('one_hot', OneHotEncoder())
])

# Only request the categorical columns that X actually contains. ColumnTransformer
# raises when a listed column is absent, which is the case for the NaN-free X
# selection that omits the e-mail, weekday, hours and cardID columns.
categorical_candidates = ['P_emaildomain_suffix', 'P_emaildomain_bin', 'weekday', 'hours', 'cardID']
cat_cols = [col for col in categorical_candidates if col in X.columns]

# weekday and hours have a numeric dtype, so they also appear in the numeric lists;
# keep them out of the numeric branches so that no column is transformed twice.
low_num_cols = [col for col in low_missing_num_cols if col not in cat_cols]
medium_num_cols = [col for col in medium_missing_num_cols if col not in cat_cols]

preprocessor = ColumnTransformer([
    ('low_num_imputer', num_transformer_low, low_num_cols),
    ('medium_num_imputer', num_transformer_medium, medium_num_cols),
    ('cat_transformer', cat_pipeline, cat_cols)],
    remainder='passthrough')

pd.DataFrame(preprocessor.fit_transform(X)).head()

In [None]:
# Count the missing values per column of the NaN-free frame.
data_new.isnull().sum()

## Create Baseline Model 

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30 % of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=414,
)

### Create smaller dataset for investigation 

In [None]:
# Create a smaller dataset for investigation purpose only
sample_size = 20000

# Never request more rows than X holds: sampling without replacement raises otherwise,
# and the NaN-free selection can be much smaller than the full frame.
X_small = X.sample(min(sample_size, len(X)), random_state=0)
# Pick the targets by index label so they are aligned with X_small by construction
# rather than by relying on an identical seed in a second, independent sample() call.
y_small = y.loc[X_small.index]

### Baseline Model 

In [None]:
import os,sys
from scipy import stats

In [None]:
# Check the dtype of every feature column before fitting the models.
X.dtypes

## Baseline Model 

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier: logistic regression with class weights balanced against the
# class frequencies, fitted on the training split (fit returns the estimator itself).
log_model = LogisticRegression(class_weight='balanced').fit(X_train, y_train)

# Score of the fitted model on the held-out test split.
log_model.score(X_test, y_test)

In [None]:
# Create base model: logistic regression trained with stochastic gradient descent.
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

# random_state is fixed because SGD shuffles the training data; without a seed the
# cross-validated score changes from run to run.
# NOTE(review): newer scikit-learn releases renamed loss='log' to 'log_loss' — confirm
# against the installed version.
sgd_model = SGDClassifier(loss='log', alpha=0.5, class_weight='balanced', random_state=0)
sgd_model.fit(X_train, y_train)

# Mean recall over 5 cross-validation folds of the training split.
cv_results_sgd_model = cross_val_score(sgd_model, X_train, y_train, cv=5, n_jobs=1, scoring='recall').mean()


In [None]:
# Display the mean cross-validated recall of the SGD baseline.
cv_results_sgd_model 

In [None]:
# Simple random forest baseline

from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(random_state=0, class_weight='balanced')
forest.fit(X_train, y_train)

# Macro-averaged F1 over 5 cross-validation folds of the training split.
cv_results_forest = cross_validate(
    forest,
    X_train,
    y_train,
    scoring='f1_macro',
    cv=5,
)
print(cv_results_forest['test_score'].mean())

## Feature Permutation 

In [None]:
# Permutation importance of every feature for the fitted logistic-regression baseline.

from sklearn.inspection import permutation_importance

# random_state is fixed because the feature shuffles are random; without a seed the
# importances differ on every run.
permutation_score = permutation_importance(log_model, X_train, y_train, n_repeats=10, random_state=0)

# Pair each feature name with its mean importance; the names are taken from the
# frame that was actually permuted.
np.vstack((X_train.columns, permutation_score.importances_mean)).T