Data Collection

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
!pip install haversine
import haversine as hs

In [None]:
import os

# Location of the fraud-transactions CSV. Set the FRAUD_DATA_PATH environment variable to
# read the file from somewhere else; when it is unset the original local path is used.
DATA_PATH = os.environ.get(
    "FRAUD_DATA_PATH",
    r"C:\Users\Melvin Wong\DataScience\Springboard\Github\Dataset\archive\fraudTest.csv",
)
df = pd.read_csv(DATA_PATH)
df.head()

Data Definition

In [None]:
# column labels of the frame as loaded from the CSV
df.columns

In [None]:
# dtype pandas assigned to each column
df.dtypes

In [None]:
# summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
df.describe()

In [None]:
# number of distinct credit cards that have at least one fraudulent transaction
# (grouping by card and calling nunique() on is_fraud only reports how many different
#  is_fraud values each card has, not how many cards were hit by fraud)
df.loc[df['is_fraud'] == 1, 'cc_num'].nunique()

Data Cleaning

In [None]:
# column labels before any columns are dropped
df.columns

In [None]:
# Remove columns that no later cell in this notebook reads.
# Reassigning (instead of inplace=True) and errors='ignore' keep the cell re-runnable:
# a second run finds the columns already gone and does nothing instead of raising KeyError.
df = df.drop(
    columns=['Unnamed: 0', 'street', 'state', 'first', 'last', 'trans_num', 'unix_time'],
    errors='ignore',
)

In [None]:
# column labels after the drop, to confirm which columns remain
df.columns

In [None]:
# number of missing values in every column
df.isna().sum()

In [None]:
# Add a 'date' column (calendar date only, as datetime.date objects) directly after
# trans_date_trans_time.
# The guard makes the cell safe to re-run: df.insert raises ValueError when the column
# already exists. No temporary 'trans_date' column is created, so the date is stored once.
if 'date' not in df.columns:
    date_position = df.columns.get_loc('trans_date_trans_time') + 1
    df.insert(date_position, 'date', pd.to_datetime(df['trans_date_trans_time']).dt.date)
df

In [None]:
#function to calculate distance
from math import radians, cos, sin, asin, sqrt

def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points on the Earth.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in degrees.
    lat2, lon2 : latitude and longitude of the second point, in degrees.
        Each argument may be a scalar or an array-like (NumPy array, pandas Series);
        only NumPy ufuncs are used, so array inputs are processed element-wise.

    Returns
    -------
    Distance(s) in kilometres, same shape as the inputs.
    """
    R = 6371  # Radius of the Earth in kilometers
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    sin_half_dlat = np.sin(dlat / 2)
    sin_half_dlon = np.sin(dlon / 2)
    a = (sin_half_dlat * sin_half_dlat
         + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * sin_half_dlon * sin_half_dlon)
    # Mathematically 0 <= a <= 1, but floating-point rounding can push it a hair outside
    # that range for (near-)antipodal points, which would make sqrt(1 - a) return NaN.
    a = np.minimum(1.0, np.maximum(0.0, a))
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

In [None]:
# Distance in km between the customer location (lat, long) and the merchant location
# (merch_lat, merch_long) for every transaction.
# haversine is written with NumPy ufuncs, so whole columns are passed at once instead of
# looping over the rows with iterrows().
df['distance'] = haversine(df['lat'], df['long'], df['merch_lat'], df['merch_long'])

In [None]:
# display the frame (Jupyter shows a truncated preview plus its shape)
df

In [None]:
# latest and earliest transaction date in the data
latest_date = df['date'].max()
earliest_date = df['date'].min()
print(latest_date)
print(earliest_date)

In [None]:
# mean transaction amount for each category
df1 = df.loc[:, ['category', 'amt']]
df1.groupby('category').mean()

In [None]:
# number of transactions per category -- this counts every row of df,
# not only the fraudulent ones (no is_fraud filter is applied here)
df['category'].value_counts()

Exploratory Data Analysis


In [None]:
# bar chart of the number of transactions in each category
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=df, x='category', ax=ax)
# slant the category names so the labels do not overlap
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
fig.tight_layout()
plt.show()

In [None]:
# number of fraudulent transactions per category (absolute counts, not percentages)
fraud_df = df[df['is_fraud'] == 1]
ax1 = sns.countplot(data=fraud_df, x='category')
# slant the category names so the labels do not overlap
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()

In [None]:
# display the fraud-only rows (Jupyter shows a truncated preview plus the shape)
fraud_df

In [None]:
# summary statistics of the distance column for the fraud-only rows
fraud_df['distance'].describe()

In [None]:
# fraud rows whose category is 'shopping_net'
shopping_net_df = fraud_df.loc[fraud_df['category'].eq('shopping_net')]

In [None]:
# distribution of the distance column for the 'shopping_net' fraud rows
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(shopping_net_df['distance'], bins=10, kde=True, color='skyblue', ax=hist_ax)
hist_ax.set_xlabel('Distance (km)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Histogram of Distance for "shopping_net" Category')
hist_ax.grid(True)
plt.show()

In [None]:
# fraud rows whose category is 'grocery_pos'
grocery_pos_df = fraud_df.loc[fraud_df['category'].eq('grocery_pos')]

In [None]:
# distribution of the distance column for the 'grocery_pos' fraud rows
plt.figure(figsize=(10, 6))
sns.histplot(grocery_pos_df['distance'], bins=10, kde=True, color='skyblue')
plt.xlabel('Distance (km)')
plt.ylabel('Frequency')
# title names the category that is actually plotted (grocery_pos_df)
plt.title('Histogram of Distance for "grocery_pos" Category')
plt.grid(True)
plt.show()

In [None]:
# fraud rows whose category is 'misc_net'
misc_net_df = fraud_df.loc[fraud_df['category'].eq('misc_net')]

In [None]:
# distribution of the distance column for the 'misc_net' fraud rows
plt.figure(figsize=(10, 6))
sns.histplot(misc_net_df['distance'], bins=10, kde=True, color='skyblue')
plt.xlabel('Distance (km)')
plt.ylabel('Frequency')
# title names the category that is actually plotted (misc_net_df)
plt.title('Histogram of Distance for "misc_net" Category')
plt.grid(True)
plt.show()

In [None]:
from datetime import datetime

def day_of_week(date_str):
    """Return the English weekday name ("Monday" ... "Sunday") for a date.

    Parameters
    ----------
    date_str : str, or any object with a ``weekday()`` method
        A string in 'YYYY-MM-DD' format, or a ``datetime.date`` / ``datetime.datetime``
        (or compatible) object, which is used as-is.

    Returns
    -------
    str
        The weekday name.

    Raises
    ------
    ValueError
        If a string is given that does not match 'YYYY-MM-DD'.
    """
    days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    if isinstance(date_str, str):
        date_object = datetime.strptime(date_str, '%Y-%m-%d')
    else:
        # already a date-like object; weekday() returns 0 for Monday ... 6 for Sunday
        date_object = date_str
    return days[date_object.weekday()]

In [None]:
import pandas as pd

# Weekday name ("Monday" ... "Sunday") of every fraudulent transaction, in row order.
day_of_week_list = pd.to_datetime(fraud_df['date']).dt.day_name().tolist()

# Attach the labels as a new column of fraud_df. assign() places a list positionally on
# fraud_df's own index, so each label stays on its row. (Building a separate frame with a
# fresh 0..n-1 index and joining it with pd.concat(axis=1) aligns on index labels, which
# do not match fraud_df's labels and yields NaN-padded, mismatched rows.)
# NOTE: this rebinds `df` to the fraud-only frame; the cells below read df['day_of_week'].
df = fraud_df.assign(day_of_week=day_of_week_list)

In [None]:
# display the frame that now carries the day_of_week column
df

In [None]:
# keep only the rows that have no missing value in any column
cleaned_df = df.dropna()

In [None]:
# display the rows that survived dropna()
cleaned_df

In [None]:
# Occurrences of each weekday in df['day_of_week'], listed Monday..Sunday.
# value_counts() alone orders by frequency and omits weekdays that never occur;
# reindexing puts them in calendar order and fills absent weekdays with 0.
WEEKDAY_ORDER = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
day_of_week_counts = df['day_of_week'].value_counts().reindex(WEEKDAY_ORDER, fill_value=0)

In [None]:
# bar chart of how often each weekday occurs
weekday_labels = day_of_week_counts.index
weekday_totals = day_of_week_counts.values
plt.bar(weekday_labels, weekday_totals)
plt.title('Occurrences of Each Day of the Week')
plt.xlabel('Day of Week')
plt.ylabel('Count')
# tilt the weekday names so they stay readable
plt.xticks(rotation=45)

plt.show()

Pre-processing and Training Data Development

In [None]:
# Split features/target into train and test sets: 30% of rows go to the test set and
# random_state=47 fixes the shuffle.
# NOTE(review): `train_test_split` is not imported and `ski_data` is not defined in any
# cell visible in this notebook, so this cell raises NameError on a fresh kernel.
# NOTE(review): `ski_data` / `AdultWeekend` look like leftovers from a different project;
# presumably the split should use this notebook's transaction data with `is_fraud` as the
# target -- TODO confirm and replace.
X_train, X_test, y_train, y_test = train_test_split(ski_data.drop(columns='AdultWeekend'), 
                                                    ski_data.AdultWeekend, test_size=0.3, 
                                                    random_state=47)