In [84]:
import pandas as pd
import numpy as np
from datetime import datetime

# Load the raw credit-card transaction export. The path is relative, so it is
# resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer='../data/credit_card_fraud.csv')

# Rename and Drop Columns

In [85]:
# Drop the columns we won't use: they are either obfuscated or of little interest.
# errors='ignore' skips labels that are no longer present, so re-running this
# cell after the columns are gone does not raise a KeyError.
df.drop(
    columns=[
        'Cardholder Name',
        'Card Number (Hashed or Encrypted)',
        'CVV Code (Hashed or Encrypted)',
        'Transaction ID',
        'IP Address',
        'Transaction Notes',
        'User Account Information',
    ],
    inplace=True,
    errors='ignore',
)

# Give the remaining columns concise snake_case names so they can be reached
# with attribute access (e.g. df.amount).
# 'Cardholder Name' and 'User Account Information' are dropped above, so they
# have no entry here. 'Card Expiration Date' is listed once: a duplicated dict
# key silently keeps only its last value, which was 'card_exp_date'.
df.rename(
    columns={
        'Transaction Date and Time'                 : 'datetime',
        'Transaction Amount'                        : 'amount',
        'Merchant Name'                             : 'merchant',
        'Merchant Category Code (MCC)'              : 'mcc',
        'Transaction Location (City or ZIP Code)'   : 'location',
        'Transaction Currency'                      : 'currency',
        'Card Type'                                 : 'card_type',
        'Card Expiration Date'                      : 'card_exp_date',
        'Transaction Response Code'                 : 'response_code',
        'Fraud Flag or Label'                       : 'flagged_as_fraud',
        'Previous Transactions'                     : 'previous_transactions',
        'Transaction Source'                        : 'source',
        'Device Information'                        : 'device',
    },
    inplace=True,
)

# Fix Data Types

In [86]:
# Parse the transaction timestamp; pandas infers the format.
df['datetime'] = pd.to_datetime(df['datetime'])
df['flagged_as_fraud'] = df['flagged_as_fraud'].astype('bool')
# Encode the transaction-history bucket as an ordinal: '3 or more' becomes 3,
# and the None key is meant to send missing entries to 0.
df['previous_transactions'] = df['previous_transactions'].replace({None:0,'1':1,'2':2,'3 or more':3})
# Replace numeric response codes with readable labels.
df['response_code'] = df['response_code'].replace({0 : 'Accepted', 12 : 'Declined', 5 : 'Invalid' })
# Vectorised MM/YY parse (day defaults to the 1st). Unlike a row-wise
# datetime.strptime, this yields NaT for missing values instead of raising,
# and leaves an already-converted datetime column untouched on re-run.
df['card_exp_date'] = pd.to_datetime(df['card_exp_date'], format="%m/%y")

print(df.dtypes)
df.head(5)

datetime                 datetime64[ns]
amount                          float64
merchant                         object
mcc                               int64
location                         object
currency                         object
card_type                        object
card_exp_date            datetime64[ns]
response_code                     int64
flagged_as_fraud                   bool
previous_transactions             int64
source                           object
device                           object
dtype: object


Unnamed: 0,datetime,amount,merchant,mcc,location,currency,card_type,card_exp_date,response_code,flagged_as_fraud,previous_transactions,source,device
0,2022-09-24 13:54:27,285.88,"Rajagopalan, Ghose and Kant",3590,Khammam,INR,MasterCard,2029-04-01,5,True,0,Online,Tablet
1,2020-07-24 11:20:13,1777.32,Sule PLC,7277,Vasai-Virar,EUR,American Express,2026-03-01,12,True,0,Online,Mobile
2,2023-03-18 01:05:36,3939.01,Badal PLC,9297,Nangloi Jat,USD,Visa,2029-11-01,5,True,3,In-Person,Mobile
3,2021-01-07 21:53:04,376.44,Konda-Sodhi,5686,Ramagundam,USD,Visa,2025-09-01,5,False,2,In-Person,Tablet
4,2021-12-16 06:22:24,1687.33,Dua Ltd,2940,Adoni,INR,MasterCard,2031-03-01,5,True,2,Online,Desktop


# Creating MCC groups

In [87]:
# Lookup table of MCC ranges; the lookup function below reads its 'start',
# 'end' and 'Business type' columns.
mcc_df = pd.read_csv(filepath_or_buffer='../data/mcc_grouped.csv')

def get_mcc_name(code, ranges=None):
    """Return the business type whose MCC range contains ``code``.

    Parameters
    ----------
    code : int
        Merchant category code to look up.
    ranges : pandas.DataFrame, optional
        Lookup table with 'start' and 'end' columns (both bounds inclusive)
        and a 'Business type' label column. Defaults to the notebook-level
        ``mcc_df`` so existing ``df['mcc'].apply(get_mcc_name)`` calls keep
        working unchanged.

    Returns
    -------
    The 'Business type' of the first row whose range contains ``code``, or
    the string 'Unknown' when no row matches.
    """
    if ranges is None:
        ranges = mcc_df

    # Inclusive on both ends: start <= code <= end.
    in_range = (ranges['start'] <= code) & (code <= ranges['end'])
    matches = ranges.loc[in_range, 'Business type']

    # If several ranges overlap, the first one in table order wins.
    return 'Unknown' if matches.empty else matches.iloc[0]
    
# Label every transaction with its MCC business type, then drop the raw code.
# The guard keeps the cell re-runnable: once 'mcc' has been dropped, running
# it again is a no-op instead of a KeyError on df['mcc'].
if 'mcc' in df.columns:
    df['mcc_group'] = df['mcc'].apply(get_mcc_name)
    df.drop(
        columns=['mcc'],
        inplace=True
    )

Business type    object
start             int64
end               int64
dtype: object

# Outlier Removal

In [89]:
# Check the column dtypes before starting outlier removal.
# NOTE(review): the saved output below still lists `mcc`, although the MCC
# cell drops that column; the output looks stale, so re-run to confirm.
df.dtypes


datetime                 datetime64[ns]
amount                          float64
merchant                         object
mcc                               int64
location                         object
currency                         object
card_type                        object
card_exp_date            datetime64[ns]
response_code                     int64
flagged_as_fraud                   bool
previous_transactions             int64
source                           object
device                           object
mcc_group                        object
dtype: object