In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
# Load the raw credit-card transaction export into the working frame.
raw_transactions_path = "../data/credit_transactions.csv"
df: pd.DataFrame = pd.read_csv(raw_transactions_path)

# Print the column labels, then preview the first rows as the cell output.
column_labels = df.columns
print(column_labels)
df.head()

# Dropping Useless Columns

In [None]:
# Columns that are not carried forward into the cleaned dataset.
unused_columns = [
    'merchant',        # Obfuscated in the source data
    'merchant_zip',    # Not needed
    'user_address',    # Not needed
    'user_apartment',  # Not needed
    'user_zip',        # Not needed
    'user_lat',
    'user_long',
]
df.drop(columns=unused_columns, inplace=True)

# Fixing mm/yyyy date formats

In [None]:
# Frequency of each raw expiry-date string. Only the final expression of a cell is
# rendered automatically, so this intermediate result is printed explicitly.
print(df['card_expiry_date'].value_counts())

# dtypes of the three card date columns before conversion (rendered as the cell output).
df[['card_expiry_date', 'card_issue_date', 'card_last_pin']].dtypes

In [None]:
# Parse the "mm/yyyy" strings into timestamps; with no day in the format, the day
# defaults to the 1st of the month. pd.to_datetime converts the whole column in one
# call instead of invoking datetime.strptime once per row.
# NOTE: missing values in these two columns now become NaT rather than raising.
for month_year_column in ('card_expiry_date', 'card_issue_date'):
    df[month_year_column] = pd.to_datetime(df[month_year_column], format="%m/%Y")

# card_last_pin is matched against "%Y" only, so it is cast to str first
# (month and day default to January 1st).
df['card_last_pin'] = pd.to_datetime(df['card_last_pin'].astype(str), format="%Y")

# Confirm the converted dtypes (rendered as the cell output).
df[['card_expiry_date', 'card_issue_date', 'card_last_pin']].dtypes

# Fixing Easy Dtypes

In [None]:
# Convert the 'Yes'/'No' labels to booleans. Series.map is used instead of
# Series.replace because replace relies on implicit object -> bool downcasting,
# which pandas has deprecated (FutureWarning since 2.2). Any label other than
# 'Yes'/'No' becomes NaN, so unexpected values stay visible instead of passing through.
df['flagged_as_fraud'] = df['flagged_as_fraud'].map({'Yes': True, 'No': False})

# Condensing Transaction Datetime

In [None]:
from datetime import datetime

def condense_time(row):
    """Build one ``datetime`` from a row's ``year``, ``month``, ``day`` and ``time`` fields.

    ``row.time`` is split on ":"; the first piece is used as the hour and the
    second as the minute. Any further pieces are ignored.
    """
    year, month, day = int(row.year), int(row.month), int(row.day)
    clock_parts = row.time.split(':')
    return datetime(year, month, day, int(clock_parts[0]), int(clock_parts[1]))

# Collapse the separate date/time component columns into a single datetime column.
purchase_timestamps = df.apply(condense_time, axis="columns")
df['purchase_date'] = purchase_timestamps

# The component columns are redundant once purchase_date exists.
date_part_columns = ['year', 'month', 'day', 'time']
df.drop(columns=date_part_columns, inplace=True)

# Fixing Dollar Amounts

In [None]:
def dedollarizer(s):
    """Convert a dollar-prefixed amount string to a float by stripping leading "$" characters."""
    without_symbol = s.lstrip("$")
    return float(without_symbol)

# Columns stored as "$"-prefixed strings; each is converted to float in turn.
money_columns = ['amount', 'user_income', 'user_debt', 'card_limit']
for money_column in money_columns:
    df[money_column] = df[money_column].apply(dedollarizer)

# Show the converted columns as the cell output.
df[money_columns]


# Adding External MCC Category Names

In [None]:
# Read the additional dataset that groups merchant codes into business types.
mcc_df = pd.read_csv('../data/mcc_grouped.csv')


def get_mcc_name(code):
    """Return the business type whose [start, end] range contains ``code``.

    The lookup runs against the module-level ``mcc_df`` (columns ``start``,
    ``end`` and ``Business type``). When several ranges match, the first
    matching row wins; when none match, ``'Unknown'`` is returned.
    """
    matching_code = mcc_df.loc[(mcc_df['start'] <= code) & (code <= mcc_df['end'])]
    if not matching_code.empty:
        return matching_code['Business type'].iloc[0]
    return 'Unknown'


# Every get_mcc_name call scans all of mcc_df, so resolve each distinct merchant
# code once and broadcast the result with Series.map instead of re-scanning the
# lookup table for every transaction row.
mcc_group_by_code = {code: get_mcc_name(code) for code in df['merchant_code'].unique()}
df['mcc_group'] = df['merchant_code'].map(mcc_group_by_code)


In [None]:
# Feature: True where the merchant's state equals the user's state.
df['state_match'] = df['merchant_state'].eq(df['user_state'])

# Check the result: print the flag beside the two compared columns,
# then show how many rows match / do not match as the cell output.
selected_columns = ['state_match', 'merchant_state', 'user_state']
result_df = df.loc[:, selected_columns]
print(result_df)
df['state_match'].value_counts()

In [None]:
# Feature: True where the merchant's city equals the user's city.
df['city_match'] = df['merchant_city'] == df['user_city']

# Check the result: print the flag beside the two compared columns. The earlier
# mid-cell value_counts() call was discarded (only the last expression of a cell is
# rendered) and duplicated the final line, so it is computed once, at the end.
selected_columns = ['city_match', 'merchant_city', 'user_city']
result_df = df[selected_columns]
print(result_df)
df['city_match'].value_counts()

In [None]:
# Display the initial distribution of raw error strings.
print(df['errors'].value_counts())

# Overall flag: True when the transaction has any error recorded.
# NOTE(review): assumes rows without errors hold NaN (the read_csv default for empty
# fields); an empty-string entry would now count as an error — confirm.
df['error'] = df['errors'].notna()

# One 0/1 indicator column per distinct comma-separated error label.
errors_dummies = df['errors'].str.get_dummies(sep=',')

# Give the indicator columns concise names so they can be accessed via attribute notation.
# Labels that are absent from the data are simply ignored by rename.
errors_dummies = errors_dummies.rename(
    columns={
        'Insufficient Balance' : 'insuf_balance',
        'Bad PIN'              : 'bad_pin',
        'Bad CVV'              : 'bad_cvv',
        'Bad Card Number'      : 'bad_card_number',
        'Bad Expiration'       : 'bad_expir',
        'Bad Zipcode'          : 'bad_zip',
        'Technical Glitch'     : 'tech_glitch',
    }
)

# Append the indicator columns to the main frame.
df = pd.concat([df, errors_dummies], axis=1)

# The raw errors column is superseded by 'error' and the indicators.
# A list (not a set) is passed because `columns` expects ordered list-like labels.
df = df.drop(columns=['errors'])

# Display the result for bad_pin to check.
df['bad_pin'].value_counts()

In [None]:
# Difference between the user's retirement age and current age, computed once and
# reused below. Despite its name, the 'retirement_age' column stores this
# difference (as int), not the retirement age itself.
age_until_retirement = df['user_retirement'] - df['user_age']
df['retirement_age'] = age_until_retirement.astype(int)

# Sanity check: printed explicitly, because a bare expression in the middle of a
# cell is evaluated but never rendered.
print(df['retirement_age'].sort_values())

# Categorical version: True when the retirement age has been reached or passed.
df['is_retired'] = age_until_retirement <= 0

In [None]:
# Drop the raw retirement-age column. A list (not a set literal) is passed because
# `columns` expects a label or ordered list-like of labels.
df.drop(columns=['user_retirement'], inplace=True)

# Card Age at time of purchase

In [None]:
# Feature: elapsed time between the card's issue date and the purchase.
df['card_age'] = df['purchase_date'].sub(df['card_issue_date'])

# A negative card age makes no sense (purchase before issue), so keep only the
# rows whose card_age is zero or positive.
non_negative_age = df['card_age'].ge(pd.Timedelta(0))
df = df[non_negative_age]

# Result check: select any remaining rows dated before the card was issued ...
issued_after_purchase = df['purchase_date'].lt(df['card_issue_date'])
filtered_rows = df[issued_after_purchase]

# ... and print both date columns for them.
print(filtered_rows['purchase_date'], filtered_rows['card_issue_date'])

# Looking at purchase date vs Expiry date

In [None]:
# Columns of interest when inspecting purchases against expiry dates.
display_cols = ['purchase_date', 'card_expiry_date', 'flagged_as_fraud', 'error']

# Flag purchases dated on or after the card's expiry timestamp.
# NOTE(review): if card_expiry_date holds the first day of the expiry month,
# purchases made during that month are flagged as expired — confirm this is intended.
df['is_card_expired'] = df['purchase_date'].ge(df['card_expiry_date'])

# Last Pin Change

In [None]:
# Feature: elapsed time between the last PIN change and the purchase.
# The difference is not clamped, so it can come out negative.
df['since_last_pin'] = df['purchase_date'].sub(df['card_last_pin'])

# Log Amount

In [None]:
import math

def logarithm(val):
    """Return the natural logarithm of ``abs(val)``.

    Inputs for which the logarithm cannot be computed are returned unchanged:
    zero (``math.log(0)`` raises ``ValueError``) and values that do not support
    ``abs``/``math.log`` (``TypeError``).
    """
    try:
        return math.log(abs(val))
    except (ValueError, TypeError):
        # Narrowed from a bare ``except`` so KeyboardInterrupt, SystemExit and
        # unrelated errors are no longer silently swallowed.
        return val

# Element-wise log transform of the transaction amount.
df['log_amount'] = df['amount'].map(logarithm)

In [None]:
# Persist the cleaned frame; the row index is not written to the file.
cleaned_output_path = '../data/cleaned.csv'
df.to_csv(cleaned_output_path, index=False)