In [31]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the raw transaction export; the path is relative to this notebook's directory.
df = pd.read_csv(filepath_or_buffer="../data/credit_transactions.csv")

# Preview the first five rows to confirm the load and see which columns exist.
df.head(n=5)

Unnamed: 0,user_id,year,month,day,time,amount,card_method,merchant,merchant_city,merchant_state,...,user_apartment,user_city,user_state,user_zip,user_lat,user_long,user_income,user_debt,user_rating,user_number_of_cards
0,0,2015,11,15,12:55,$287.13,Online Transaction,-8194607650924472520,ONLINE,,...,,La Verne,CA,91750,34.15,-117.76,$59696,$127613,787,5
1,0,2015,11,15,13:19,$2.41,Online Transaction,-7759074308363763111,ONLINE,,...,,La Verne,CA,91750,34.15,-117.76,$59696,$127613,787,5
2,0,2015,11,16,09:41,$50.81,Online Transaction,-551332107213382088,ONLINE,,...,,La Verne,CA,91750,34.15,-117.76,$59696,$127613,787,5
3,0,2015,11,16,09:46,$248.36,Online Transaction,4872340518840476610,ONLINE,,...,,La Verne,CA,91750,34.15,-117.76,$59696,$127613,787,5
4,0,2015,11,16,11:20,$473.00,Online Transaction,-8566951830324093739,ONLINE,,...,,La Verne,CA,91750,34.15,-117.76,$59696,$127613,787,5


# Condensing Transaction Datetime

In [32]:
from datetime import datetime

def condense_time(row):
    """Combine a row's split date/time fields into a single ``datetime``.

    Parameters
    ----------
    row : object exposing ``year``, ``month``, ``day`` and ``time`` attributes
        ``year``/``month``/``day`` may be anything ``int()`` accepts.
        ``time`` is a colon-separated string whose first two fields are the
        hour and the minute (e.g. ``"12:55"``); any further fields are ignored.

    Returns
    -------
    datetime
        Timezone-naive timestamp at minute resolution.
    """
    date_parts = (int(row.year), int(row.month), int(row.day))
    # Split once and pick hour/minute by position.
    clock = row.time.split(':')
    hour, minute = int(clock[0]), int(clock[1])
    return datetime(*date_parts, hour, minute)

# Collapse the four date/time component columns into one timestamp per transaction.
df['purchase_date'] = df.apply(condense_time, axis='columns')

# The component columns are now redundant, so remove them from the frame in place.
df.drop(['year', 'month', 'day', 'time'], axis=1, inplace=True)

# Fixing Dollar Amounts

In [33]:
# Strip the literal "$" prefix and convert the amounts to float.
# regex=False is passed explicitly because "$" is also the regex end-of-string
# anchor and the default of `regex` differs between pandas versions.
df['amount'] = df['amount'].str.replace("$", "", regex=False).astype(float)

# Adding External MCC Category Names

In [34]:
# Load the external lookup table used below to group merchant codes into business types.
mcc_df = pd.read_csv(filepath_or_buffer='../data/mcc_grouped.csv')

# create the custom function to identify the matching business type for merchant code
def get_mcc_name(code):
    """Return the business type whose code range contains ``code``.

    Scans the notebook-level ``mcc_df`` for rows where
    ``start <= code <= end`` and returns the ``'Business type'`` of the first
    such row. Returns ``'Unknown'`` when no range contains ``code``.
    """
    in_range = mcc_df['start'].le(code) & mcc_df['end'].ge(code)
    hits = mcc_df.loc[in_range, 'Business type']
    if hits.empty:
        return 'Unknown'
    return hits.iloc[0]
    
# Label every transaction with the business type that matches its merchant code.
df['mcc_group'] = df['merchant_code'].map(get_mcc_name)

#df.drop(
#    columns=['mcc'], 
#    inplace=True
#)

In [35]:
# Distribution of the derived business-type labels; 'Unknown' counts codes that matched no range.
df['mcc_group'].value_counts()

mcc_group
Retail outlet services                                44586
Miscellaneous stores                                  23902
Transportation services                                9178
Utility services                                       8140
Business services                                      5797
Unknown                                                5098
Clothing stores                                        1671
Professional services and membership organizations      777
Government services                                     598
Contracted services                                      10
Name: count, dtype: int64

In [36]:
# Feature engineering: flag transactions whose merchant state equals the user's state.
# Rows with a missing merchant_state compare as False.
df['state_match'] = df['merchant_state'].eq(df['user_state'])

# Sanity check: print the flag beside the two columns it is derived from,
# then show how the flag is distributed.
selected_columns = ['state_match', 'merchant_state', 'user_state']
result_df = df.loc[:, selected_columns]
print(result_df)
df['state_match'].value_counts()

       state_match merchant_state user_state
0            False            NaN         CA
1            False            NaN         CA
2            False            NaN         CA
3            False            NaN         CA
4            False            NaN         CA
...            ...            ...        ...
99752         True             CA         CA
99753         True             VA         VA
99754        False            NaN         MD
99755         True             CA         CA
99756         True             NY         NY

[99757 rows x 3 columns]


state_match
True     59622
False    40135
Name: count, dtype: int64

In [37]:
# Feature engineering: flag transactions whose merchant city equals the user's city.
# NOTE(review): only city names are compared, so identically named cities in
# different states would also be flagged as a match - confirm this is intended.
df['city_match'] = df['merchant_city'] == df['user_city']

# Sanity check: print the flag beside the two columns it is derived from,
# then show how the flag is distributed. (A value_counts() call that used to sit
# here mid-cell was removed: its result was discarded, so it displayed nothing.)
selected_columns = ['city_match', 'merchant_city', 'user_city']
result_df = df[selected_columns]
print(result_df)
df['city_match'].value_counts()


       city_match merchant_city     user_city
0           False        ONLINE      La Verne
1           False        ONLINE      La Verne
2           False        ONLINE      La Verne
3           False        ONLINE      La Verne
4           False        ONLINE      La Verne
...           ...           ...           ...
99752       False       Burbank      Van Nuys
99753        True        Lorton        Lorton
99754       False        ONLINE  Gaithersburg
99755       False      San Jose   Watsonville
99756       False      Lynbrook      Freeport

[99757 rows x 3 columns]


city_match
False    59278
True     40479
Name: count, dtype: int64

In [38]:
# Show the raw distribution of the 'errors' column before it is transformed.
print(df['errors'].value_counts())

# One-hot encode 'errors': every comma-separated label becomes its own 0/1 column.
# The labels are renamed to short snake_case identifiers so the new columns can
# be reached with attribute access (e.g. df.bad_pin).
errors_dummies = (
    df['errors']
    .str.get_dummies(sep=',')
    .rename(
        columns={
            'Insufficient Balance' : 'insuf_balance',
            'Bad PIN'              : 'bad_pin',
            'Bad CVV'              : 'bad_cvv',
            'Bad Card Number'      : 'bad_card_number',
            'Bad Expiration'       : 'bad_expir',
            'Bad Zipcode'          : 'bad_zip',
            'Technical Glitch'     : 'tech_glitch',
        }
    )
)

# Replace the original 'errors' column with its indicator columns. Dropping it
# before the concat gives the same column order as concatenating and then dropping.
df = pd.concat([df.drop(columns=['errors']), errors_dummies], axis=1)

# Spot-check one indicator column.
df['bad_pin'].value_counts()


errors
Insufficient Balance                     1117
Bad PIN                                   475
Bad CVV                                   309
Technical Glitch                          193
Bad Card Number                           151
Bad Expiration                            141
Bad PIN,Insufficient Balance               11
Bad Zipcode                                 7
Bad CVV,Insufficient Balance                4
Bad Expiration,Bad CVV                      2
Bad Expiration,Technical Glitch             2
Bad Card Number,Insufficient Balance        2
Bad PIN,Technical Glitch                    2
Bad CVV,Technical Glitch                    2
Insufficient Balance,Technical Glitch       1
Bad Expiration,Insufficient Balance         1
Bad Zipcode,Insufficient Balance            1
Name: count, dtype: int64


bad_pin
0    99269
1      488
Name: count, dtype: int64