# Modeling

In [20]:
import sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, PrecisionRecallDisplay, precision_recall_curve
import seaborn as sns
import matplotlib.pyplot as plt
sys.path.append("../src")
import utils

# Seeded NumPy random Generator (seed 10) for reproducible random draws.
# NOTE(review): none of the cells visible in this section reference `rng`; the
# train/test split passes random_state=10 directly — confirm it is still needed.
rng = np.random.default_rng(10)

## 1. Preprocessing

TODO:
1. Join income and credit limit features from users data
1. Calculate "home city", "home state", "home country", user age, and retired features
1. Calculate card vintage and time to expiration features from card data
1. **What to do with MCCs?** There are 109 of them: too many to just make dummies? Perhaps they could be grouped according to fraud rates, though this would need to be done using only training data.
1. 70/30 train/test split. Because there are user-level measures, I think the split should be made at the user level to avoid contamination between the training and test sets. This will definitely be necessary for alternative modeling methods that explicitly consider the user level.

Transactions

In [21]:
# Load the sampled transactions. The first CSV column becomes the index and
# `timestamp` is parsed to datetime64.
# NOTE(review): utils.prepend_dir presumably resolves the file name against the
# project's data directory — confirm in ../src/utils.
tx_data = pd.read_csv(
    utils.prepend_dir('tx_sample.csv'),
    index_col=0,
    parse_dates=['timestamp'],
)
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None" line.
tx_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5214807 entries, 28882 to 24270533
Data columns (total 11 columns):
 #   Column          Dtype         
---  ------          -----         
 0   user            int64         
 1   card            int64         
 2   amount          float64       
 3   merchant_city   object        
 4   merchant_state  object        
 5   zip             float64       
 6   mcc             int64         
 7   errors          object        
 8   is_fraud        bool          
 9   timestamp       datetime64[ns]
 10  tx_type         object        
dtypes: bool(1), datetime64[ns](1), float64(2), int64(3), object(4)
memory usage: 442.6+ MB
None


Cards

In [22]:
# Load the sampled cards. The first CSV column becomes the index and the two
# date columns are parsed to datetime64.
card_data = pd.read_csv(
    utils.prepend_dir('cards_sample.csv'),
    index_col=0,
    parse_dates=['expires', 'acct_open_date'],
)
# DataFrame.info() prints its own summary and returns None, so no print() wrapper.
card_data.info()

# Card columns carried into the merged frame. `user` and `card_index` are the
# join keys used in the merge cell; the rest are candidate features.
card_cols = [
    'user', 'card_index', 'card_brand', 'card_type', 'expires',
    'has_chip', 'cards_issued', 'credit_limit', 'acct_open_date',
]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1241 entries, 10 to 6109
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   user                   1241 non-null   int64         
 1   card_index             1241 non-null   int64         
 2   card_brand             1241 non-null   object        
 3   card_type              1241 non-null   object        
 4   card_number            1241 non-null   int64         
 5   expires                1241 non-null   datetime64[ns]
 6   cvv                    1241 non-null   int64         
 7   has_chip               1241 non-null   bool          
 8   cards_issued           1241 non-null   int64         
 9   credit_limit           1241 non-null   float64       
 10  acct_open_date         1241 non-null   datetime64[ns]
 11  year_pin_last_changed  1241 non-null   int64         
 12  num_transactions       1241 non-null   int64         
 13  to

Users

In [23]:
# Load the sampled users; the first CSV column becomes the index, which the
# merge cell joins against the transactions' `user` column.
user_data = pd.read_csv(
    utils.prepend_dir('users_sample.csv'),
    index_col=0,
)
# Only birth year and month are used, so every birthdate is pinned to the
# first day of its month.
user_data['birthdate'] = pd.to_datetime({
    'year': user_data.birth_year,
    'month': user_data.birth_month,
    'day': 1,
})
# DataFrame.info() prints its own summary and returns None, so no print() wrapper.
user_data.info()

# User columns carried into the merged frame.
user_cols = [
    'birthdate', 'retirement_age', 'gender', 'city', 'state', 'zipcode',
    'latitude', 'longitude', 'per_capita_income_zipcode',
    'yearly_income_person', 'total_debt', 'fico_score', 'num_credit_cards',
]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 2 to 1987
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   person                     400 non-null    object        
 1   current_age                400 non-null    int64         
 2   retirement_age             400 non-null    int64         
 3   birth_year                 400 non-null    int64         
 4   birth_month                400 non-null    int64         
 5   gender                     400 non-null    object        
 6   address                    400 non-null    object        
 7   apartment                  102 non-null    float64       
 8   city                       400 non-null    object        
 9   state                      400 non-null    object        
 10  zipcode                    400 non-null    int64         
 11  latitude                   400 non-null    float64       
 12  longitu

Merging data

In [24]:
# Attach card attributes, then user attributes, to every transaction.
# validate='many_to_one' makes pandas raise MergeError if the card or user keys
# are not unique on the right-hand side, which would otherwise silently
# duplicate transactions.
merged_df = (
    tx_data
    .merge(
        card_data[card_cols],
        left_on=['user', 'card'],
        right_on=['user', 'card_index'],
        validate='many_to_one',
    )
    .drop(columns='card_index')  # identical to `card` after the join
    .merge(
        user_data[user_cols],
        left_on='user',
        right_index=True,
        validate='many_to_one',
    )
)

# Both merges are inner joins, so a transaction without a matching card or
# user would be dropped without notice; fail loudly if that ever happens.
assert len(merged_df) == len(tx_data), (
    f"merge changed the row count: {len(tx_data)} -> {len(merged_df)}"
)

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5214807 entries, 0 to 5214806
Data columns (total 31 columns):
 #   Column                     Dtype         
---  ------                     -----         
 0   user                       int64         
 1   card                       int64         
 2   amount                     float64       
 3   merchant_city              object        
 4   merchant_state             object        
 5   zip                        float64       
 6   mcc                        int64         
 7   errors                     object        
 8   is_fraud                   bool          
 9   timestamp                  datetime64[ns]
 10  tx_type                    object        
 11  card_brand                 object        
 12  card_type                  object        
 13  expires                    datetime64[ns]
 14  has_chip                   bool          
 15  cards_issued               int64         
 16  credit_limit               float64  

Constructing features

In [25]:
# location-based features
# Bracket access is used throughout: attribute access such as `merged_df.zip`
# only works while the column name does not collide with a DataFrame attribute.
not_online = merged_df['tx_type'] != 'online'
same_state = merged_df['state'] == merged_df['merchant_state']

# A matching city name only counts as the home city when the state matches too.
merged_df['home_city'] = (merged_df['city'] == merged_df['merchant_city']) & same_state
merged_df['home_state'] = same_state
merged_df['home_zip'] = merged_df['zip'] == merged_df['zipcode']
# Non-online transactions without a ZIP are flagged as overseas.
# NOTE(review): assumes a missing ZIP on an in-person transaction means a
# foreign merchant — confirm against the data dictionary.
merged_df['overseas'] = not_online & merged_df['zip'].isna()

# Drop non-online transactions that have no merchant state at all.
# .copy() gives the following cells an independent frame, so adding columns to
# it does not raise SettingWithCopyWarning.
merged_df = merged_df[~(merged_df['merchant_state'].isna() & not_online)].copy()

In [26]:
# time-based features
# Average calendar-year length including leap days. Dividing by a flat 365
# overstates age by about one day every four years, which flips `retired`
# roughly two weeks early for a user in their mid-sixties.
DAYS_PER_YEAR = 365.25

tx_time = merged_df['timestamp']
# Age in (fractional) years at the time of the transaction.
merged_df['user_age'] = (tx_time - merged_df['birthdate']).dt.days / DAYS_PER_YEAR
merged_df['retired'] = merged_df['user_age'] >= merged_df['retirement_age']
# Whole days until the card expires (negative once it has expired) and since
# the account was opened.
merged_df['until_expired'] = (merged_df['expires'] - tx_time).dt.days
merged_df['since_opened'] = (tx_time - merged_df['acct_open_date']).dt.days

In [27]:
# dropping unneeded columns
# These raw location and date columns have been replaced by the derived
# location-based and time-based features built in the cells above.
redundant_cols = [
    'merchant_city', 'merchant_state', 'zip',
    'city', 'state', 'zipcode',
    'birthdate', 'retirement_age',
    'expires', 'acct_open_date', 'timestamp',
]
# Reassign instead of inplace=True: an in-place drop on a frame obtained by
# boolean filtering can emit SettingWithCopyWarning and has no speed benefit.
merged_df = merged_df.drop(columns=redundant_cols)

Converting categories to dummies

In [28]:
dummy_df = merged_df.copy()

# A transaction's `errors` value may list several error names separated by
# commas; collect the distinct names.
errors = {
    error
    for entry in merged_df.errors.dropna().unique()
    for error in entry.split(',')
}
# Iterate in sorted order: the iteration order of a set of strings changes
# between kernel sessions, which would shuffle the feature columns on re-run.
error_names = sorted(errors)
for error in error_names:
    # regex=False matches the name literally; na=False maps "no error" (NaN)
    # straight to False so the column is a plain bool.
    dummy_df[error] = dummy_df.errors.str.contains(error, regex=False, na=False)

# Sanity check: per original `errors` string, how many rows set each flag.
print(dummy_df.groupby('errors').agg({error: 'sum' for error in error_names}))
dummy_df = dummy_df.drop(columns='errors')

                                       Bad Expiration  Bad Zipcode  Bad CVV  \
errors                                                                        
Bad CVV                                             0            0     2455   
Bad CVV,Insufficient Balance                        0            0       23   
Bad CVV,Technical Glitch                            0            0        5   
Bad Card Number                                     0            0        0   
Bad Card Number,Bad CVV                             0            0       12   
Bad Card Number,Bad Expiration                      6            0        0   
Bad Card Number,Insufficient Balance                0            0        0   
Bad Card Number,Technical Glitch                    0            0        0   
Bad Expiration                                   2404            0        0   
Bad Expiration,Bad CVV                             14            0       14   
Bad Expiration,Insufficient Balance                1

In [29]:
# One-hot encode the remaining non-numeric columns, dropping the first level of
# each to avoid perfectly collinear dummies.
dummy_df = pd.get_dummies(dummy_df, drop_first=True)
# Preview one row, transposed so all columns are readable; seeded (same seed as
# the rest of the notebook) so the displayed row is reproducible on re-run.
dummy_df.sample(random_state=10).T

Unnamed: 0,3472821
user,1287
card,1
amount,-90.0
mcc,5541
is_fraud,False
has_chip,True
cards_issued,2
credit_limit,21177.0
latitude,35.15
longitude,-89.75


TODO: deal with MCC codes

Training and test data

In [30]:
# Hold out a test set, stratified on the label so both parts keep the same
# fraud rate. `mcc` is left out of the features (MCC handling is still a TODO)
# and `is_fraud` is the target.
# NOTE(review): no test_size is passed, so scikit-learn's default hold-out
# fraction applies, whereas the preprocessing TODO mentions 70/30 — confirm.
# NOTE(review): the split is made per row and the `user` / `card` identifier
# columns stay in the feature matrix — confirm this is intended.
target = dummy_df['is_fraud']
X_train, X_test, y_train, y_test = train_test_split(
    dummy_df.drop(columns=['mcc', 'is_fraud']),
    target,
    stratify=target,
    random_state=10,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((3909902, 36), (1303301, 36), (3909902,), (1303301,))

## Logistic regression

In [31]:
# Baseline: standardise every feature, then fit a logistic regression with
# scikit-learn's default hyper-parameters.
logit_pipe_default = make_pipeline(StandardScaler(), LogisticRegression())
logit_pipe_default.fit(X_train, y_train)

# Hard class predictions for both splits.
y_pred_train = logit_pipe_default.predict(X_train)
y_pred_test = logit_pipe_default.predict(X_test)

# Per-class precision / recall / F1 on the training data and on the held-out data.
for split_label, y_true, y_pred in (
    ('Training:', y_train, y_pred_train),
    ('Validation:', y_test, y_pred_test),
):
    print(split_label)
    print(classification_report(y_true, y_pred))

: 

: 

With such an imbalanced data set, it is not surprising that the model gives very few transactions a majority probability of being fraudulent. We can consider whether the model is at all better than guessing by looking at the precision-recall curve.

In [None]:
# Predicted probability of the second class returned by predict_proba
# (column index 1), for the training and test features respectively.
train_fraud_prob, test_fraud_prob = (
    logit_pipe_default.predict_proba(features)[:, 1]
    for features in (X_train, X_test)
)

In [None]:
# Precision and recall on the test set at every candidate decision threshold.
precision, recall, thresholds = precision_recall_curve(y_test, test_fraud_prob)

# The last precision/recall entry is dropped so that both columns line up
# one-to-one with `thresholds`, which is used as the index.
logit_default_prc = pd.DataFrame(
    {'precision': precision[:-1], 'recall': recall[:-1]},
    index=thresholds,
)

# Last five rows of the table among those that still recall more than half of
# the fraud cases.
logit_default_prc.loc[logit_default_prc['recall'] > .5].tail()

In [None]:
# Precision-recall curve of the baseline model on the test set, zoomed in on
# the region with recall above 0.5 and precision below 0.2.
prc_display = PrecisionRecallDisplay.from_estimator(logit_pipe_default, X_test, y_test)
ax = prc_display.ax_
# NOTE(review): the horizontal line at 0.01 looks like a chance-level reference;
# confirm it matches the actual fraud rate in y_test.
ax.axhline(0.01)
ax.axis([0.5, 1., 0., 0.2])
ax.legend(loc='upper right')
ax.set_title("Precision-Recall Curve for Default Logistic Regression");

A naive logistic regression model is better than pure chance, but not by much. To catch about half of the fraudulent transactions, a threshold of <1% is needed. At that level, over 40 legitimate transactions would be flagged as fraudulent.