Data Collection

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
!pip install haversine
import haversine as hs
from sklearn.preprocessing import StandardScaler

In [None]:
import os

# Location of the transactions CSV. The original author's local path is kept as
# the default; set the FRAUD_DATA_PATH environment variable to run the notebook
# on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "FRAUD_DATA_PATH",
    r"C:\Users\Melvin Wong\DataScience\Springboard\Github\Dataset\archive\fraudTest.csv",
)

# Load the raw table and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Data Definition

In [None]:
# Column labels of the freshly loaded table.
df.columns

In [None]:
# Storage dtype pandas inferred for each column.
df.dtypes

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# For every card number, count how many distinct is_fraud values its rows contain.
# NOTE(review): this is NOT the number of cards with fraud. Presumably is_fraud is
# 0/1, so a result of 2 would mean the card has both fraudulent and legitimate
# rows — confirm, and use a filter + nunique on cc_num if a card count is wanted.
df.groupby('cc_num')['is_fraud'].nunique()

Data Cleaning

In [None]:
# Column labels before any columns are dropped.
df.columns

In [None]:
# Remove the listed columns from the working table.
unused_columns = [
    'Unnamed: 0',
    'street',
    'state',
    'first',
    'last',
    'trans_num',
    'unix_time',
]
df = df.drop(columns=unused_columns)

In [None]:
# Column labels that remain after the drop above.
df.columns

In [None]:
# Parse the date-of-birth strings; values that cannot be parsed become NaT.
dob_parsed = pd.to_datetime(df['dob'], errors='coerce')
df['dob'] = dob_parsed

# Parse the transaction timestamps, then overwrite the column with the weekday
# index (pandas convention: Monday=0 ... Sunday=6). The full timestamp is no
# longer available in df after this line.
txn_timestamps = pd.to_datetime(df['trans_date_trans_time'], errors='coerce')
df['trans_date_trans_time'] = txn_timestamps.dt.dayofweek

In [None]:
# Show the table after the date conversions.
df

In [None]:
from sklearn.preprocessing import LabelEncoder
# Map each category string to an integer code and keep the codes in a new column.
label_encoder = LabelEncoder()
category_codes = label_encoder.fit_transform(df['category'])
df['category_label'] = category_codes

# Number of rows in each category.
df['category'].value_counts()

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# (disabled) moving the date next to the trans_date_trans_time
#df['trans_date'] = pd.to_datetime(df['trans_date_trans_time'])
#df['trans_date'] = df['trans_date'].dt.date
#column_to_move = df['trans_date']
#df.insert(1, 'date', column_to_move)
#df

In [None]:
import pandas as pd
from datetime import datetime, date
def calculate_age(born, today=None):
    """Return the age in whole years of someone born on ``born``.

    Parameters
    ----------
    born : datetime.date, datetime.datetime or pandas.Timestamp
        Date of birth. A missing value (``NaT``, ``NaN`` or ``None``) yields
        ``nan`` so that missing birthdates stay visible downstream.
    today : datetime.date, optional
        Reference date the age is measured at. Defaults to the current date,
        which makes the result change from day to day; pass a fixed date for
        reproducible output.

    Returns
    -------
    int or float
        Completed years between ``born`` and ``today``; ``nan`` when ``born``
        is missing.
    """
    if pd.isna(born):
        return float('nan')
    if today is None:
        today = date.today()
    # One year is subtracted while this year's birthday is still ahead.
    had_birthday = (today.month, today.day) >= (born.month, born.day)
    return today.year - born.year - (not had_birthday)

# Make sure the date-of-birth values are datetimes and keep them in a helper column.
birthdates = pd.to_datetime(df['dob'])
df['birthdate'] = birthdates

# Age in whole years for every row, stored as integers.
df['age'] = birthdates.map(calculate_age).astype(int)


In [None]:
# The birthdate columns are removed once the age column exists.
df = df.drop(['birthdate', 'dob'], axis='columns')

In [None]:
# Show the table with the new age column.
df

In [None]:
#function to calculate distance
from math import radians, cos, sin, asin, sqrt

def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two latitude/longitude points.

    All four arguments are in degrees. They may be scalars, NumPy arrays or
    pandas Series; array inputs are processed element-wise.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in degrees.
    lat2, lon2 : latitude and longitude of the second point, in degrees.

    Returns
    -------
    Distance(s) in kilometres, with the same shape as the inputs.
    """
    R = 6371  # Radius of the Earth in kilometers
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    a = (
        np.sin(dlat / 2) ** 2
        + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon / 2) ** 2
    )
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would turn sqrt(1 - a) into NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

In [None]:
# Distance (km) between each customer's location and the merchant's location.
# haversine() is built from NumPy functions, so whole columns can be passed at
# once instead of looping over the rows with iterrows().
distances = haversine(df['lat'], df['long'], df['merch_lat'], df['merch_long'])
df['distance'] = distances

In [None]:
# Show the table with the new distance column.
df

In [None]:
# Mean of `amt` for each category.
df1 = df.loc[:, ['category', 'amt']]
df1.groupby('category').mean()

In [None]:
# Count of fraudulent rows per category. Only rows flagged is_fraud == 1 are
# counted; the all-rows count per category is already shown earlier in the notebook.
df.loc[df['is_fraud'] == 1, 'category'].value_counts()

Exploratory Data Analysis


In [None]:
# Bar chart of the number of rows in each category.
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=df, x='category', ax=ax)
# Tilt the category names so they do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
fig.tight_layout()
plt.show()

In [None]:
# Keep only the rows flagged as fraud and plot how many fall into each category.
fraud_df = df[df['is_fraud'] == 1]
ax1 = sns.countplot(data=fraud_df, x='category')
# Tilt the category names so they do not overlap.
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=40, ha="right")
ax1.set_title('Fraudulent transactions per category')
plt.tight_layout()
plt.show()

In [None]:
# Show the fraud-only subset.
fraud_df

In [None]:
# Summary statistics of the customer-to-merchant distance for the fraud rows.
fraud_df['distance'].describe()

In [None]:
# Fraud rows whose category is 'shopping_net'.
shopping_net_df = fraud_df.loc[fraud_df['category'].eq('shopping_net')]

In [None]:
# Distribution of distance for fraudulent 'shopping_net' rows (10 bins + KDE curve).
dist_fig, dist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(shopping_net_df['distance'], bins=10, kde=True, color='skyblue', ax=dist_ax)
dist_ax.set_xlabel('Distance (km)')
dist_ax.set_ylabel('Frequency')
dist_ax.set_title('Histogram of Distance for "shopping_net" Category')
dist_ax.grid(True)
plt.show()

In [None]:
# Fraud rows whose category is 'grocery_pos'.
grocery_pos_df = fraud_df.loc[fraud_df['category'].eq('grocery_pos')]

In [None]:
# Distribution of distance for fraudulent 'grocery_pos' rows (10 bins + KDE curve).
dist_fig, dist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(grocery_pos_df['distance'], bins=10, kde=True, color='skyblue', ax=dist_ax)
dist_ax.set(
    xlabel='Distance (km)',
    ylabel='Frequency',
    title='Histogram of Distance for "grocery" Category',
)
dist_ax.grid(True)
plt.show()

In [None]:
# Fraud rows whose category is 'misc_net'.
misc_net_df = fraud_df.loc[fraud_df['category'].eq('misc_net')]

In [None]:
# Distribution of distance for fraudulent 'misc_net' rows (10 bins + KDE curve).
plt.figure(figsize=(10, 6))
sns.histplot(misc_net_df['distance'], bins=10, kde=True, color='skyblue')
plt.xlabel('Distance (km)')
plt.ylabel('Frequency')
# The title names the category actually plotted; it previously said
# "shopping_net", copied over from an earlier cell.
plt.title('Histogram of Distance for "misc_net" Category')
plt.grid(True)
plt.show()

In [None]:
from datetime import datetime

def day_of_week(date_str):
    """Return the English weekday name for a date given as 'YYYY-MM-DD'.

    Raises ValueError (from ``datetime.strptime``) when ``date_str`` does not
    match that format.
    """
    weekday_names = (
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday",
    )
    parsed = datetime.strptime(date_str, '%Y-%m-%d')
    # weekday() numbers the days from Monday=0, matching the tuple order.
    return weekday_names[parsed.weekday()]

In [None]:
# Show the table as it stands at the end of the exploratory section.
df

In [None]:
# Keep only the rows that have no missing value in any column.
cleaned_df = df.dropna(axis=0, how='any')

In [None]:
# Column dtypes of the null-free table.
cleaned_df.dtypes

Pre-processing and Training Data Development

In [None]:
# Show the null-free table that pre-processing starts from.
cleaned_df

In [None]:
# Column dtypes before the feature columns are selected.
cleaned_df.dtypes

In [None]:
# Remove the listed columns from the modelling table.
columns_to_drop = [
    'trans_date_trans_time',
    'merchant',
    'category',
    'gender',
    'city',
    'zip',
    'job',
    'city_pop',
    'merch_lat',
    'merch_long',
]
cleaned_df = cleaned_df.drop(columns=columns_to_drop)


In [None]:
# Column dtypes of the columns that remain after the drop.
cleaned_df.dtypes

In [None]:
# Feature matrix: every remaining column except the target. Leaving `is_fraud`
# in X would leak the label into the train/test split and the scaler below.
X = cleaned_df.drop(columns=['is_fraud'])

In [None]:
# Target vector: the fraud flag of each row.
y = cleaned_df['is_fraud']

In [None]:
#Import ML models:

from sklearn.model_selection import train_test_split, learning_curve 
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Rescale every feature to the range [-1, 1]. The scaler learns its ranges from
# the training rows only and is then applied to both partitions.
scaler_x = MinMaxScaler(feature_range=(-1, 1))
scaler_x.fit(X_train)
X_train = scaler_x.transform(X_train)
X_test = scaler_x.transform(X_test)

In [None]:
# One-hot encode the `is_fraud` column, dropping the first level.
# NOTE(review): `is_fraud` is the target and appears to be 0/1 already, so this
# presumably just renames it to `is_fraud_1` — confirm this step is needed.
df_dummies = pd.get_dummies(cleaned_df, columns=['is_fraud'], drop_first=True)

In [None]:
# Standardising scaler, fitted in a later cell.
scaler = StandardScaler()

In [None]:
# Fit the scaler on every column of df_dummies and transform them in one step.
# NOTE(review): this runs on all rows (before any train/test split) and includes
# the encoded target column — confirm that is intended.
scaled_features = scaler.fit_transform(df_dummies)

In [None]:
# Wrap the scaled array in a DataFrame again. The original row index is passed
# through so df_scaled stays aligned with cleaned_df; without it the frame gets
# a fresh 0..n-1 index, which no longer matches once dropna() has removed rows.
df_scaled = pd.DataFrame(
    scaled_features,
    columns=df_dummies.columns,
    index=df_dummies.index,
)

In [None]:
# Split the modelling table into features (X) and target (y): copy the table,
# then pop the fraud flag out of the copy.
X = cleaned_df.copy()
y = X.pop('is_fraud')

In [None]:
# 70/30 train/test partition of the features and target, with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Preview the first rows of each partition.
partitions = (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
)
for label, part in partitions:
    print(f"{label}:\n", part.head())