In [24]:
from datetime import datetime
from functools import lru_cache

import matplotlib.pyplot as plt
import pandas as pd
# Read the raw transactions table; merchant_zip is explicitly read as str.
# NOTE(review): the preview lists an 'Unnamed: 0' column — presumably an index
# saved by an earlier export; confirm whether it should be dropped.
df: pd.DataFrame = pd.read_csv(
    "../data/credit_transactions.csv",
    dtype={"merchant_zip": str},
)

# Only the last expression of a cell is rendered, so df.columns is evaluated
# here but not shown; df.head() is what appears in the output.
df.columns
df.head()

Unnamed: 0,user_id,year,month,day,time,amount,card_method,merchant,merchant_city,merchant_state,...,user_apartment,user_city,user_state,user_zip,user_lat,user_long,user_income,user_debt,user_rating,user_number_of_cards
0,0,2015,11,15,12:55,$287.13,Online Transaction,-8194607650924472520,ONLINE,,...,,La Verne,CA,91750,34.15,-117.76,$59696,$127613,787,5
1,0,2015,11,15,13:19,$2.41,Online Transaction,-7759074308363763111,ONLINE,,...,,La Verne,CA,91750,34.15,-117.76,$59696,$127613,787,5
2,0,2015,11,16,09:41,$50.81,Online Transaction,-551332107213382088,ONLINE,,...,,La Verne,CA,91750,34.15,-117.76,$59696,$127613,787,5
3,0,2015,11,16,09:46,$248.36,Online Transaction,4872340518840476610,ONLINE,,...,,La Verne,CA,91750,34.15,-117.76,$59696,$127613,787,5
4,0,2015,11,16,11:20,$473.00,Online Transaction,-8566951830324093739,ONLINE,,...,,La Verne,CA,91750,34.15,-117.76,$59696,$127613,787,5


# Dropping Useless Columns

In [25]:
# Discard the columns this notebook does not need.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' keeps the cell re-runnable: executing it a second time no
# longer raises KeyError for columns that are already gone.
df = df.drop(
    columns=[
        'merchant',        # It's been obfuscated
        'merchant_zip',    # We don't need it
        'user_address',    # We don't need it
        'user_apartment',  # We don't need it
        'user_zip',        # We don't need it
        'user_lat',        # We don't need it
        'user_long',       # We don't need it
    ],
    errors='ignore',
)

# Fixing mm/yyyy date formats

In [26]:
# Look at the raw card-date columns before converting them.
# The value_counts() result is computed but not rendered, because only the
# last expression of a cell is displayed.
df['card_expiry_date'].value_counts()
df.dtypes.loc[['card_expiry_date', 'card_issue_date', 'card_last_pin']]

card_expiry_date    object
card_issue_date     object
card_last_pin        int64
dtype: object

In [27]:
# Parse the card date columns with vectorized pd.to_datetime instead of
# calling datetime.strptime once per row through .apply.
# "%m/%Y" strings become the first day of that month. Missing values in these
# two string columns become NaT instead of raising inside strptime.
df['card_expiry_date'] = pd.to_datetime(df['card_expiry_date'], format="%m/%Y")
df['card_issue_date'] = pd.to_datetime(df['card_issue_date'], format="%m/%Y")
# card_last_pin is an int64 year, so it is cast to str before being parsed
# with "%Y" (which yields January 1st of that year).
df['card_last_pin'] = pd.to_datetime(df['card_last_pin'].astype(str), format="%Y")

# Confirm that all three columns now hold datetimes.
df[['card_expiry_date', 'card_issue_date', 'card_last_pin']].dtypes

card_expiry_date    datetime64[ns]
card_issue_date     datetime64[ns]
card_last_pin       datetime64[ns]
dtype: object

# Fixing Easy Dtypes

In [28]:
# Turn the 'Yes'/'No' labels into booleans with .map, which builds the boolean
# values directly; .replace relied on object-dtype downcasting that pandas has
# deprecated, which would leave the column as object dtype.
# True/False map to themselves so re-running the cell leaves the column intact.
# Any value other than these four becomes NaN rather than being left untouched.
df['flagged_as_fraud'] = df['flagged_as_fraud'].map(
    {'Yes': True, 'No': False, True: True, False: False}
)

# Condensing Transaction Datetime

In [29]:
from datetime import datetime

def condense_time(row):
    """Build one datetime from a row's separate date and clock fields.

    Reads ``row.year``, ``row.month`` and ``row.day`` (each passed through
    ``int``) and ``row.time``, a string whose first two ':'-separated pieces
    are used as hour and minute.

    Returns:
        datetime: the combined timestamp; seconds are left at zero.
    """
    year, month, day = int(row.year), int(row.month), int(row.day)
    clock_parts = row.time.split(':')
    hour = int(clock_parts[0])
    minute = int(clock_parts[1])
    return datetime(year, month, day, hour, minute)

# Collapse the separate date/time part columns into a single purchase_date
# column, then remove the parts that it replaces.
# NOTE(review): apply(axis=1) calls condense_time once per row — this may be
# slow on a large table.
date_part_columns = ['year', 'month', 'day', 'time']
df['purchase_date'] = df.apply(condense_time, axis=1)
df.drop(columns=date_part_columns, inplace=True)

# Fixing Dollar Amounts

In [30]:
def dedollarizer(s):
    """Convert a currency-formatted value such as '$1,234.50' to a float.

    Args:
        s: a string that may carry a '$' sign, ',' thousands separators and
            surrounding whitespace, or a value that is already an int/float.

    Returns:
        float: the numeric amount. int/float inputs are returned as float
        unchanged, so applying the function to an already-converted column
        does not fail.

    Raises:
        ValueError: if the remaining text is not a valid number.
    """
    if isinstance(s, (int, float)):
        return float(s)
    # Remove the formatting characters wherever they occur, so '-$77.00'
    # and '$-77.00' both parse.
    cleaned = str(s).strip().replace("$", "").replace(",", "")
    return float(cleaned)

# Strip the currency formatting from each money column, one column at a time,
# then display the converted columns.
money_columns = ['amount', 'user_income', 'user_debt', 'card_limit']
for money_column in money_columns:
    df[money_column] = df[money_column].apply(dedollarizer)

df[money_columns]


ValueError: could not convert string to float: '$287.13'

# Adding External MCC Category Names

In [None]:
# Lookup table of MCC ranges; get_mcc_name reads its 'start', 'end' and
# 'Business type' columns.
mcc_df = pd.read_csv('../data/mcc_grouped.csv')


def get_mcc_name(code):
    """Return the 'Business type' of the first MCC range containing ``code``.

    A row of ``mcc_df`` matches when ``start <= code <= end`` (both ends
    inclusive). When no row matches, the string 'Unknown' is returned.
    """
    lower_ok = mcc_df['start'] <= code
    upper_ok = code <= mcc_df['end']
    hits = mcc_df.loc[lower_ok & upper_ok, 'Business type']
    if hits.empty:
        return 'Unknown'
    return hits.iloc[0]
    
# Label every transaction with its MCC group name.
# get_mcc_name filters the whole MCC table on each call, so it is wrapped in
# an unbounded cache: each distinct merchant_code is looked up once and
# repeated codes reuse the stored answer. The resulting column is the same as
# applying get_mcc_name directly.
cached_mcc_name = lru_cache(maxsize=None)(get_mcc_name)
df['mcc_group'] = df['merchant_code'].apply(cached_mcc_name)

# The raw code is dropped once the group name has been attached.
df.drop(
    columns=['merchant_code'], 
    errors='ignore',
    inplace=True
)

In [None]:
# Write the cleaned table to disk; index=False leaves the row index out of the file.
df.to_csv(
    path_or_buf='../data/cleaned.csv',
    index=False,
)