In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Columns we won't use: they are either obfuscated (hashed/encrypted) or of
# little interest for this analysis.
unused_columns = [
    'Cardholder Name',
    'Card Number (Hashed or Encrypted)',
    'CVV Code (Hashed or Encrypted)',
    'Transaction ID',
    'IP Address',
    'Card Expiration Date',
    'Transaction Notes',
    'User Account Information',
]

# Concise snake_case names, so columns can be reached via attribute access
# (e.g. df.amount). Columns listed in `unused_columns` are already gone when the
# rename runs, so they deliberately have no entry here.
column_names = {
    'Transaction Date and Time'               : 'datetime',
    'Transaction Amount'                      : 'amount',
    'Merchant Name'                           : 'merchant',
    'Merchant Category Code (MCC)'            : 'mcc',
    'Transaction Location (City or ZIP Code)' : 'location',
    'Transaction Currency'                    : 'currency',
    'Card Type'                               : 'card_type',
    'Transaction Response Code'               : 'response_code',
    'Fraud Flag or Label'                     : 'flagged_as_fraud',
    'Previous Transactions'                   : 'previous_transactions',
    'Transaction Source'                      : 'source',
    'Device Information'                      : 'device',
}

# Load, prune and rename in one chained expression rather than mutating the
# frame in place; each step returns a new DataFrame.
df = (
    pd.read_csv('../data/transactions_india.csv')
    .drop(columns=unused_columns)
    .rename(columns=column_names)
)

# Sanity check: frame dimensions, then the dtype of each remaining column.
print(df.shape)
df.dtypes

(8000, 12)


datetime                  object
amount                   float64
merchant                  object
mcc                        int64
location                  object
currency                  object
card_type                 object
response_code              int64
flagged_as_fraud           int64
previous_transactions     object
source                    object
device                    object
dtype: object

In [3]:
# Preview the first five rows of the cleaned frame.
# Observation: this data only covers cities in India over a period of three
# years, so we may want a broader dataset.
df.head(n=5)

Unnamed: 0,datetime,amount,merchant,mcc,location,currency,card_type,response_code,flagged_as_fraud,previous_transactions,source,device
0,2022-09-24 13:54:27,285.88,"Rajagopalan, Ghose and Kant",3590,Khammam,INR,MasterCard,5,1,,Online,Tablet
1,2020-07-24 11:20:13,1777.32,Sule PLC,7277,Vasai-Virar,EUR,American Express,12,1,,Online,Mobile
2,2023-03-18 01:05:36,3939.01,Badal PLC,9297,Nangloi Jat,USD,Visa,5,1,3 or more,In-Person,Mobile
3,2021-01-07 21:53:04,376.44,Konda-Sodhi,5686,Ramagundam,USD,Visa,5,0,2,In-Person,Tablet
4,2021-12-16 06:22:24,1687.33,Dua Ltd,2940,Adoni,INR,MasterCard,5,1,2,Online,Desktop


In [4]:

# The 'datetime' column is loaded as plain strings (dtype object); parse it into
# real timestamps. `assign` overwrites the existing column in its current position.
df = df.assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))

# Summary statistics for the frame.
df.describe()

Unnamed: 0,amount,mcc,response_code,flagged_as_fraud
count,8000.0,8000.0,8000.0,8000.0
mean,2496.356036,5484.150375,5.6375,0.498625
std,1451.221326,2608.164617,4.928147,0.500029
min,1.09,1000.0,0.0,0.0
25%,1242.58,3230.75,0.0,0.0
50%,2492.46,5455.0,5.0,0.0
75%,3739.5225,7761.0,12.0,1.0
max,4996.7,9999.0,12.0,1.0


In [5]:
# Number of transactions per location, most frequent first.
df.location.value_counts()

Ghaziabad     53
Aurangabad    51
Darbhanga     39
Bettiah       38
Malda         37
              ..
Hosur         15
Kumbakonam    15
Sagar         15
Rampur        13
Jaunpur       11
Name: location, Length: 316, dtype: int64

In [6]:
# Distribution of the fraud label (0 / 1) across all transactions.
df.flagged_as_fraud.value_counts()

0    4011
1    3989
Name: flagged_as_fraud, dtype: int64