Assessment notebook 1
Data source: https://www.kaggle.com/datasets/priyamchoksi/credit-card-transactions-dataset/data

In [15]:
# Import packages
import pandas as pd
import random
from datetime import datetime
import joblib
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [16]:
# Load the raw credit-card transactions from the CSV next to the notebook
DATA_PATH = "credit_card_transactions.csv"
df = pd.read_csv(DATA_PATH)

# Show (rows, columns) of the raw frame
df.shape

(1048575, 24)

In [17]:
# Working copy of the raw frame, so `df` itself is never modified.
# Set N_SAMPLES to an integer (e.g. 250000) to work on a reproducible random
# subset for faster iteration; None keeps every row.
N_SAMPLES = None
SAMPLE_SEED = 48

if N_SAMPLES is None:
    data = df.copy()
else:
    # DataFrame.sample already returns a new frame, so no extra copy is needed
    data = df.sample(n=N_SAMPLES, random_state=SAMPLE_SEED)

In [18]:
# Remove identifier, name, address and geo columns that are not used as features
unneeded_columns = [
    'Unnamed: 0', 'cc_num', 'first', 'last',
    'street', 'city', 'state', 'zip',
    'lat', 'long', 'city_pop',
    'trans_num', 'unix_time',
    'merch_lat', 'merch_long', 'merch_zipcode',
]
data = data.drop(columns=unneeded_columns)
data.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,gender,job,dob,is_fraud
0,1/1/2019 0:00,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,"Psychologist, counselling",3/9/1988,0
1,1/1/2019 0:00,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,Special educational needs teacher,6/21/1978,0
2,1/1/2019 0:00,fraud_Lind-Buckridge,entertainment,220.11,M,Nature conservation officer,1/19/1962,0
3,1/1/2019 0:01,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,Patent attorney,1/12/1967,0
4,1/1/2019 0:03,fraud_Keeling-Crist,misc_pos,41.96,M,Dance movement psychotherapist,3/28/1986,0


In [19]:
# Discard every row that has a missing value in any remaining column
data = data.dropna(axis=0, how='any')

In [20]:
# convert dob to age
def dob_to_age(dob, reference=None):
    """Return the age in calendar years for a date of birth.

    The age is the reference year minus the birth year. Month and day are
    ignored, so the result can be one higher than the exact age when the
    birthday has not yet occurred in the reference year.

    Parameters
    ----------
    dob : datetime-like
        Date of birth; any object exposing a ``year`` attribute.
    reference : datetime-like, optional
        Date the age is measured against. When omitted, the current date is
        used, which makes the result depend on when the code is run. Pass a
        fixed date to get reproducible ages.

    Returns
    -------
    int
        ``reference.year - dob.year``.
    """
    if reference is None:
        reference = datetime.now()
    return reference.year - dob.year

# Parse the birth dates, derive an `age` column from them, then discard `dob`
birth_dates = pd.to_datetime(data['dob'])
data = (
    data
    .assign(age=birth_dates.apply(dob_to_age))
    .drop(columns=['dob'])
)
data.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,gender,job,is_fraud,age
0,1/1/2019 0:00,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,"Psychologist, counselling",0,37
1,1/1/2019 0:00,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,Special educational needs teacher,0,47
2,1/1/2019 0:00,fraud_Lind-Buckridge,entertainment,220.11,M,Nature conservation officer,0,63
3,1/1/2019 0:01,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,Patent attorney,0,58
4,1/1/2019 0:03,fraud_Keeling-Crist,misc_pos,41.96,M,Dance movement psychotherapist,0,39


In [21]:
# Number of transactions per spending category, most frequent first
category_counts = data['category'].value_counts()
category_counts

category
gas_transport     106430
grocery_pos        99906
home               99578
shopping_pos       94353
kids_pets          91404
shopping_net       78899
entertainment      75981
food_dining        74041
personal_care      73498
health_fitness     69362
misc_pos           64492
misc_net           51082
grocery_net        36719
travel             32830
Name: count, dtype: int64

In [22]:
# Encode each spending category as an integer code (1-14).
# The position in this list determines the code assigned to the category.
category_order = [
    'gas_transport',
    'grocery_pos',
    'home',
    'shopping_pos',
    'kids_pets',
    'shopping_net',
    'entertainment',
    'food_dining',
    'personal_care',
    'health_fitness',
    'misc_pos',
    'misc_net',
    'grocery_net',
    'travel',
]
category_mapping = {name: code for code, name in enumerate(category_order, start=1)}

# Values missing from the mapping become NaN
data['category'] = data['category'].map(category_mapping)

In [23]:
# Numeric codes for the two gender labels present in the data
sex_mapping = dict(F=1, M=2)

# Overwrite the `gender` column with its numeric code (no new column is added)
data = data.assign(gender=data['gender'].map(sex_mapping))

In [24]:
# Parse the transaction timestamp once (unparseable values become NaT),
# then expose its month and day-of-month as separate feature columns
timestamps = pd.to_datetime(data['trans_date_trans_time'], errors='coerce')
data = data.assign(
    trans_date_trans_time=timestamps,
    transaction_month=timestamps.dt.month,
    transaction_day=timestamps.dt.day,
)

In [25]:
# Amount bands used as the classification target:
#   '1' -> (0, 150], '2' -> (150, 500], '3' -> (500, inf)
bins = [0, 150, 500, float('inf')]
labels = ['1', '2', '3']

# Bucket each amount into its band; amounts outside every band come back as
# NaN and those rows are removed
amount_bands = pd.cut(data['amt'], bins=bins, labels=labels)
data = data.assign(amt_class=amount_bands).dropna(subset=['amt_class'])
data.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,gender,job,is_fraud,age,transaction_month,transaction_day,amt_class
0,2019-01-01 00:00:00,"fraud_Rippin, Kub and Mann",12,4.97,1,"Psychologist, counselling",0,37,1,1,1
1,2019-01-01 00:00:00,"fraud_Heller, Gutmann and Zieme",2,107.23,1,Special educational needs teacher,0,47,1,1,1
2,2019-01-01 00:00:00,fraud_Lind-Buckridge,7,220.11,2,Nature conservation officer,0,63,1,1,2
3,2019-01-01 00:01:00,"fraud_Kutch, Hermiston and Farrell",1,45.0,2,Patent attorney,0,58,1,1,1
4,2019-01-01 00:03:00,fraud_Keeling-Crist,11,41.96,2,Dance movement psychotherapist,0,39,1,1,1


In [26]:
# Hold out 20% of the rows for validation and fit on the remaining 80%.
# The raw amount, the target itself and the unencoded text/timestamp columns
# are excluded from the feature matrix.
excluded_columns = ['amt', 'amt_class', 'merchant', 'job', 'trans_date_trans_time']
predictors = data.drop(columns=excluded_columns)
target = data["amt_class"]

x_train, x_val, y_train, y_val = train_test_split(
    predictors,
    target,
    test_size=0.20,
    random_state=48,
)

In [27]:
# Preview the first five rows of the feature matrix
predictors.head(n=5)

Unnamed: 0,category,gender,is_fraud,age,transaction_month,transaction_day
0,12,1,0,37,1,1
1,2,1,0,47,1,1
2,7,2,0,63,1,1
3,1,2,0,58,1,1
4,11,2,0,39,1,1


In [28]:
# Single-step pipeline wrapping a 100-tree random forest with a fixed seed
forest = RandomForestClassifier(n_estimators=100, random_state=48)
rfc_pipeline = Pipeline(steps=[('classifier', forest)])

# Fit on the training split
rfc_pipeline.fit(x_train, y_train)

# Score the held-out validation split
y_pred = rfc_pipeline.predict(x_val)
accuracy = accuracy_score(y_val, y_pred)

# Report predictions, ground truth and accuracy
report = (
    ("Predictions:", y_pred),
    ("Actual Labels:", y_val),
    ("Test Accuracy:", accuracy),
)
for heading, value in report:
    print(heading, value)

Predictions: ['1' '1' '1' ... '1' '1' '1']
Actual Labels: 202104    1
494572    1
130746    1
345877    1
205029    1
         ..
513090    1
382516    1
397664    2
433410    1
488037    1
Name: amt_class, Length: 209715, dtype: category
Categories (3, object): ['1' < '2' < '3']
Test Accuracy: 0.9093865484109387
