# Modeling

In [1]:
import sys
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
sys.path.append("../src")
import utils

# Seeded NumPy generator shared by the notebook, so that any random step that
# draws from it gives the same result after Restart Kernel -> Run All.
rng = np.random.default_rng(seed=10)

## 1. Preprocessing

TODO:
1. Join income features from users data and credit limit features from card data
1. Calculate "home city", "home state", "home country", user age, and retired features
1. Calculate card vintage and time to expiration features from card data
1. **What to do with MCCs?** There are 109 of them: too many to just make dummies? Perhaps they could be grouped according to fraud rates, though this would need to be done using only training data.
1. 70/30 train/test split, made at the user level. Because there are user-level measures, each user's transactions should land entirely in either the training set or the test set to avoid contamination. A user-level split will also definitely be necessary for alternative modeling methods that explicitly consider the user level.

Transactions

In [2]:
# Load the sampled transactions. The first CSV column becomes the index and
# `timestamp` is parsed to datetime64 at read time.
# The listed columns are dropped right away because nothing below uses them
# (year/month/day/time are presumably already captured by `timestamp` --
# TODO confirm against the script that produced tx_sample.csv).
tx_data = pd.read_csv(
    utils.prepend_dir('tx_sample.csv'),
    index_col=0,
    parse_dates=['timestamp'],
).drop(columns=['year', 'month', 'day', 'time', 'merchant_name', 'use_chip'])

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare: wrapping it in print() only appends a stray "None" line.
tx_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5214807 entries, 28882 to 24270533
Data columns (total 11 columns):
 #   Column          Dtype         
---  ------          -----         
 0   user            int64         
 1   card            int64         
 2   amount          float64       
 3   merchant_city   object        
 4   merchant_state  object        
 5   zip             float64       
 6   mcc             int64         
 7   errors          object        
 8   is_fraud        bool          
 9   timestamp       datetime64[ns]
 10  tx_type         object        
dtypes: bool(1), datetime64[ns](1), float64(2), int64(3), object(4)
memory usage: 442.6+ MB
None


Cards

In [3]:
# Load the sampled card records. The first CSV column becomes the index and
# both date columns are parsed to datetime64 at read time.
card_data = pd.read_csv(
    utils.prepend_dir('cards_sample.csv'),
    index_col=0,
    parse_dates=['expires', 'acct_open_date'],
)

# Card columns carried into the merged modeling frame; `user` and `card_index`
# are the join keys used when merging with the transactions.
card_cols = [
    'user', 'card_index', 'card_brand', 'card_type', 'expires', 'has_chip',
    'cards_issued', 'credit_limit', 'acct_open_date',
]

# DataFrame.info() prints its own report and returns None, so it is called
# bare: wrapping it in print() only appends a stray "None" line.
card_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1241 entries, 10 to 6109
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   user                   1241 non-null   int64         
 1   card_index             1241 non-null   int64         
 2   card_brand             1241 non-null   object        
 3   card_type              1241 non-null   object        
 4   card_number            1241 non-null   int64         
 5   expires                1241 non-null   datetime64[ns]
 6   cvv                    1241 non-null   int64         
 7   has_chip               1241 non-null   bool          
 8   cards_issued           1241 non-null   int64         
 9   credit_limit           1241 non-null   float64       
 10  acct_open_date         1241 non-null   datetime64[ns]
 11  year_pin_last_changed  1241 non-null   int64         
 12  num_transactions       1241 non-null   int64         
 13  total_t

Users

In [4]:
# Load the sampled user records; the first CSV column becomes the index, which
# is what the later merge joins the transactions' `user` column against.
user_data = pd.read_csv(
    utils.prepend_dir('users_sample.csv'),
    index_col=0,
)

# Only birth year and month are available, so the birthdate is pinned to the
# first day of the birth month.
user_data['birthdate'] = pd.to_datetime({
    'year': user_data.birth_year,
    'month': user_data.birth_month,
    'day': 1,
})

# User columns carried into the merged modeling frame.
user_cols = [
    'birthdate', 'retirement_age', 'gender', 'city', 'state', 'zipcode',
    'latitude', 'longitude', 'per_capita_income_zipcode',
    'yearly_income_person', 'total_debt', 'fico_score', 'num_credit_cards',
]

# DataFrame.info() prints its own report and returns None, so it is called
# bare: wrapping it in print() only appends a stray "None" line.
user_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 400 entries, 2 to 1987
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   person                     400 non-null    object        
 1   current_age                400 non-null    int64         
 2   retirement_age             400 non-null    int64         
 3   birth_year                 400 non-null    int64         
 4   birth_month                400 non-null    int64         
 5   gender                     400 non-null    object        
 6   address                    400 non-null    object        
 7   apartment                  102 non-null    float64       
 8   city                       400 non-null    object        
 9   state                      400 non-null    object        
 10  zipcode                    400 non-null    int64         
 11  latitude                   400 non-null    float64       
 12  longitude   

Merging data

In [10]:
# Attach card attributes (matched on user + card index) and then user
# attributes (matched on the user_data index) to every transaction.
#
# Both merges use the default inner join, so a transaction without a matching
# card or user row is dropped. validate='many_to_one' makes pandas raise a
# MergeError if the right-hand keys are not unique, instead of silently
# duplicating transactions.
merged_df = (
    tx_data
    .merge(
        card_data[card_cols],
        left_on=['user', 'card'],
        right_on=['user', 'card_index'],
        validate='many_to_one',
    )
    # `card_index` duplicates the transaction's `card` column after the join.
    .drop(columns='card_index')
    .merge(
        user_data[user_cols],
        left_on='user',
        right_index=True,
        validate='many_to_one',
    )
)

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5214807 entries, 0 to 5214806
Data columns (total 31 columns):
 #   Column                     Dtype         
---  ------                     -----         
 0   user                       int64         
 1   card                       int64         
 2   amount                     float64       
 3   merchant_city              object        
 4   merchant_state             object        
 5   zip                        float64       
 6   mcc                        int64         
 7   errors                     object        
 8   is_fraud                   bool          
 9   timestamp                  datetime64[ns]
 10  tx_type                    object        
 11  card_brand                 object        
 12  card_type                  object        
 13  expires                    datetime64[ns]
 14  has_chip                   bool          
 15  cards_issued               int64         
 16  credit_limit               float64       

Constructing features

In [11]:
# location-based features
# Each flag compares the merchant's location with the cardholder's address.
same_state = merged_df.state == merged_df.merchant_state
# A city match only counts when the state matches as well.
merged_df['home_city'] = (merged_df.city == merged_df.merchant_city) & same_state
merged_df['home_state'] = same_state
# Comparisons against a missing `zip` evaluate to False.
merged_df['home_zip'] = merged_df.zip == merged_df.zipcode
# Non-online transactions that carry no zip code are flagged as `overseas`.
merged_df['overseas'] = (merged_df.tx_type != 'online') & merged_df.zip.isna()

# Discard non-online transactions that have no merchant state at all.
# .copy() makes the filtered frame independent, so the column assignments and
# in-place drop in later cells do not trigger SettingWithCopyWarning.
missing_state_offline = merged_df.merchant_state.isna() & (merged_df.tx_type != 'online')
merged_df = merged_df.loc[~missing_state_offline].copy()

In [12]:
# time-based features
# Age in years at transaction time. Dividing by 365.25 accounts for leap days;
# a flat 365 overstates age by roughly one day every four years.
merged_df['user_age'] = (merged_df.timestamp - merged_df.birthdate).dt.days / 365.25
merged_df['retired'] = merged_df.user_age >= merged_df.retirement_age
# Both columns below are differences of datetime64 columns, i.e. timedelta64.
# NOTE(review): they will likely need converting to a numeric unit (e.g.
# .dt.days) before being passed to an estimator -- confirm downstream.
merged_df['until_expired'] = merged_df.expires - merged_df.timestamp
merged_df['since_opened'] = merged_df.timestamp - merged_df.acct_open_date

In [13]:
# Raw location and date columns that the engineered features above were built
# from; they are removed from the modeling frame.
redundant_cols = [
    'merchant_city',
    'merchant_state',
    'zip',
    'city',
    'state',
    'zipcode',
    'birthdate',
    'retirement_age',
    'expires',
    'acct_open_date',
]
merged_df.drop(columns=redundant_cols, inplace=True)

In [17]:
# Preview the one-hot encoded layout on a 1,000-row sample (transposed so the
# wide frame reads as rows). The sample draws from the notebook's seeded `rng`
# so the preview is reproducible under Restart Kernel -> Run All.
pd.get_dummies(merged_df.sample(1000, random_state=rng)).head(4).T

Unnamed: 0,2319280,5060984,3709465,4847906
user,841,1950,1361,1865
card,2,1,2,2
amount,13.02,6.47,22.0,20.85
mcc,5813,5411,7349,5541
is_fraud,False,False,False,False
timestamp,2009-07-18 10:56:00,2014-12-26 16:49:00,1998-11-19 13:17:00,2013-11-15 07:14:00
has_chip,True,True,True,True
cards_issued,1,1,2,1
credit_limit,24819.0,26222.0,24459.0,43287.0
latitude,39.98,43.04,29.43,41.8
