In [1]:
# Fraud Transaction Detection

# Library Imports

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

!pip install category_encoders
from category_encoders import WOEEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Silence every Python warning for the rest of the session.
# Note: this also hides library warnings raised while fitting the models below.
import warnings
warnings.simplefilter("ignore")

# Seaborn look-and-feel used by all figures: white grid background, pastel colours.
sns.set_style('whitegrid')
sns.set_palette('pastel')

# Data Loading

from google.colab import drive
drive.mount('/content/drive')

!ls /content/drive/My\ Drive/Colab/

file_path_train = '/content/drive/My Drive/Colab/fraudTrain.csv'
file_path_test = '/content/drive/My Drive/Colab/fraudTest.csv'

train_df = pd.read_csv(file_path_train, index_col='Unnamed: 0')
test_df = pd.read_csv(file_path_test, index_col='Unnamed: 0')

print(train_df.head(3))
print(train_df.info())
print(f"Shape of training data: {train_df.shape}")

fraud_counts = train_df["is_fraud"].value_counts()
print(f"Fraudulent Transactions: {fraud_counts[1]}")
print(f"Non-Fraudulent Transactions: {fraud_counts[0]}")

print(f"Total missing values: {train_df.isna().sum().sum()}")
print(f"Total duplicate rows: {train_df.duplicated().sum()}")

# Gender overview: share of transactions per gender (left) and
# per-gender counts split by fraud status (right).
fig, (ax_pie, ax_count) = plt.subplots(1, 2, figsize=(15, 8))

# Gender Distribution (number of rows per gender)
gender_sizes = train_df.groupby('gender')['is_fraud'].count()
# One explode entry per wedge, so the call does not depend on exactly two genders.
gender_sizes.plot.pie(explode=[0.1] * len(gender_sizes), autopct="%1.1f%%", ax=ax_pie)
ax_pie.set_title("Share of Transactions by Gender")
ax_pie.set_ylabel("")  # the default label would be the counted column name 'is_fraud'

# Count plot
sns.countplot(x="gender", hue="is_fraud", data=train_df, ax=ax_count)
for patch in ax_count.patches:
    height = patch.get_height()
    # Skip empty/NaN patches (some seaborn versions appear to add zero-sized
    # placeholder patches) so that no stray "0" labels are drawn.
    if not height > 0:
        continue
    ax_count.annotate(f'{int(height)}', (patch.get_x() + patch.get_width() / 2., height),
                      ha='center', va='center', xytext=(0, 10), textcoords='offset points')

ax_count.set_title("Gender Distribution with Fraud Status")
ax_count.set_xlabel("Gender")
ax_count.set_ylabel("Count")
plt.show()

print(f"Distribution of fraud cases: {fraud_counts}")

# Derive the wedge labels from the class values instead of relying on the
# order in which value_counts() happens to return them.
class_names = {0: "No Fraud", 1: "Fraud"}
fraud_labels = [class_names[cls] for cls in fraud_counts.index]

fig, ax_fraud = plt.subplots(figsize=(6, 6))
ax_fraud.pie(fraud_counts, labels=fraud_labels, autopct="%0.0f%%")
ax_fraud.set_title("Fraud Distribution")
fig.tight_layout()
plt.show()

# Report the measured share rather than a hard-coded figure.
non_fraud_pct = 100 * fraud_counts.get(0, 0) / fraud_counts.sum()
print(f"The dataset is highly imbalanced, with {non_fraud_pct:.0f}% non-fraudulent transactions.")

# Feature Engineering

# Derive identical time features on both frames so their columns stay aligned.
for frame in (train_df, test_df):
    # Convert the timestamp column to datetime, then extract hour and month.
    frame['trans_date_trans_time'] = pd.to_datetime(frame['trans_date_trans_time'])
    frame['hour'] = frame['trans_date_trans_time'].dt.hour
    frame['month'] = frame['trans_date_trans_time'].dt.month

print(train_df.head())

# Hour-of-day distribution, one panel per class, on a shared density axis.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5), sharey=True)

panels = [
    (ax1, 0, "orange", "Non-Fraudulent Transactions"),
    (ax2, 1, "green", "Fraudulent Transactions"),
]
for axis, fraud_flag, colour, title in panels:
    # discrete=True gives one unit-wide bar centred on each integer hour.
    # bins=24 over the range 0..23 would produce bins 23/24 wide, so the bars
    # would drift away from the hour ticks.
    sns.histplot(x='hour', data=train_df[train_df["is_fraud"] == fraud_flag],
                 stat="density", discrete=True, ax=axis, color=colour)
    axis.set_title(title)
    axis.set_xticks(np.arange(24))

plt.show()

print("Fraudulent transactions are predominantly occurring around midnight.")

# Number of distinct transaction ids (dropna=False counts the same values as
# len(Series.unique()) did).
unique_transactions = train_df['trans_num'].nunique(dropna=False)
print(f"Unique transaction count: {unique_transactions}")

# Drop irrelevant columns (same list for both frames)
columns_to_remove = ['first', 'unix_time', 'dob', 'cc_num', 'zip', 'city', 'street', 'state', 'trans_num', 'trans_date_trans_time']
train_df = train_df.drop(columns=columns_to_remove)
test_df = test_df.drop(columns=columns_to_remove)

# Clean merchant names on BOTH frames; if only the training frame were cleaned,
# the test frame's merchant values would not match the categories the encoder
# learns below.
for frame in (train_df, test_df):
    frame['merchant'] = frame['merchant'].str.replace('fraud_', '', regex=False)

# DataFrame.info() prints by itself and returns None, so it is not wrapped in print().
train_df.info()
print(train_df.head(2))

# Data Encoding

# Apply label encoding to 'gender' (both frames, so they stay comparable)
for frame in (train_df, test_df):
    frame['gender'] = frame['gender'].map({'F': 0, 'M': 1})

# Apply Weight of Evidence (WOE) encoding
# 'lat' used to be in this list, but it is a numeric column and WOEEncoder() with
# its default cols only encodes string columns (per the category_encoders docs),
# so it was passed through unchanged. It stays a plain numeric feature here.
woe_columns = ['job', 'merchant', 'category', 'last']
encoder = WOEEncoder(cols=woe_columns)
train_df[woe_columns] = encoder.fit_transform(train_df[woe_columns], train_df['is_fraud'])
# Re-use the mapping learned on the training data; the test target is never used.
test_df[woe_columns] = encoder.transform(test_df[woe_columns])

# NOTE(review): the encoder is fitted on every training row, including rows that
# later end up in the hold-out split carved out of train_df, so scores measured on
# that split are optimistic. test_df is the leak-free set for a final evaluation.

print(train_df.head(3))

# Resampling to address class imbalance

# One seed for both the down-sampling and the split, so a re-run selects the same
# rows and reproduces the same metrics.
SPLIT_SEED = 65

non_fraud_class = train_df[train_df["is_fraud"] == 0]
fraud_class = train_df[train_df["is_fraud"] == 1]

# Down-sample the majority class (without replacement) to the size of the fraud class.
non_fraud_class_downsampled = resample(non_fraud_class, replace=False,
                                       n_samples=len(fraud_class), random_state=SPLIT_SEED)
balanced_data = pd.concat([fraud_class, non_fraud_class_downsampled])

X = balanced_data.drop("is_fraud", axis=1)
y = balanced_data["is_fraud"]

# stratify=y keeps the 50/50 class ratio in both the train and the hold-out part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED, stratify=y)

# Fit the scaler on the training part only and apply the same scaling to the hold-out part.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display class distributions

# Human-readable name for each value of the 'is_fraud' target.
CLASS_NAMES = {0: 'No Fraud', 1: 'Fraud'}


def _plot_class_distribution(ax, counts, percentages, title):
    """Draw one bar per class on ``ax`` and write its percentage share above it.

    ``counts`` and ``percentages`` are Series indexed by class value, in the same
    order. Tick labels are looked up from the class value, so they cannot be
    swapped by the order of the index.
    """
    bars = ax.bar(counts.index, counts.values, color=['orange', 'green'])
    for bar, share in zip(bars, percentages):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 5,
                f'{share:.2f}%', ha='center', va='bottom')
    ax.set_title(title)
    ax.set_xlabel('Class')
    ax.set_ylabel('Count')
    ax.set_xticks(list(counts.index))
    ax.set_xticklabels([CLASS_NAMES[cls] for cls in counts.index])


# sort_index() fixes the order to class 0, class 1. value_counts() orders by
# frequency, and for the balanced frame (equal counts) the order could otherwise
# follow first appearance, which would swap the labels and colours.
original_counts = train_df["is_fraud"].value_counts().sort_index()
downsampled_counts = balanced_data["is_fraud"].value_counts().sort_index()

original_percentages = original_counts / len(train_df) * 100
downsampled_percentages = downsampled_counts / len(balanced_data) * 100

fig, (ax_original, ax_downsampled) = plt.subplots(1, 2, figsize=(12, 6))

# Original class distribution
_plot_class_distribution(ax_original, original_counts, original_percentages,
                         'Original Distribution')

# Downsampled class distribution
_plot_class_distribution(ax_downsampled, downsampled_counts, downsampled_percentages,
                         'Downsampled Distribution')

fig.tight_layout()
plt.show()

def fit_and_report(model, header):
    """Fit ``model`` on the scaled training split and print its hold-out metrics.

    Prints ``header``, the classification report and the accuracy (in percent)
    computed on ``X_test_scaled`` / ``y_test``.

    Returns the fitted model and its hold-out predictions as a tuple.
    """
    model.fit(X_train_scaled, y_train)
    predictions = model.predict(X_test_scaled)
    print(header)
    print(classification_report(y_test, predictions))
    print(f'Accuracy: {accuracy_score(y_test, predictions) * 100:.2f}%')
    return model, predictions


# Logistic Regression

lr_model, lr_predictions = fit_and_report(LogisticRegression(), "Logistic Regression Results:")

# Support Vector Machine (SVC)

# Seeded like the other stochastic models below so that re-runs are repeatable.
svc_model, svc_predictions = fit_and_report(LinearSVC(random_state=0), "SVC Model Results:")

# Gaussian Naive Bayes

nb_model, nb_predictions = fit_and_report(GaussianNB(), "Naive Bayes Results:")

# Decision Tree Classifier

# NOTE(review): max_depth=1 limits the tree to a single split - confirm this is intended.
dt_model, dt_predictions = fit_and_report(
    DecisionTreeClassifier(max_depth=1, random_state=0), "Decision Tree Results:")

# Random Forest Classifier

rf_model, rf_predictions = fit_and_report(
    RandomForestClassifier(n_estimators=100, random_state=0), "Random Forest Results:")

# XGBoost Classifier

xgb_model, xgb_predictions = fit_and_report(XGBClassifier(random_state=0), "XGBoost Results:")

# Accuracy Comparison

# Keep each display name next to its predictions so the two cannot drift apart.
predictions_by_model = {
    'XGBoost': xgb_predictions,
    'Random Forest': rf_predictions,
    'Decision Tree': dt_predictions,
    'Logistic Regression': lr_predictions,
    'SVC': svc_predictions,
    'Naive Bayes': nb_predictions,
}

models = list(predictions_by_model)
accuracies = [accuracy_score(y_test, preds) for preds in predictions_by_model.values()]

results_df = pd.DataFrame({'Model': models, 'Accuracy': accuracies})

# Bar chart of hold-out accuracy per model, y-axis fixed to the full 0..1 range.
plt.figure(figsize=(7, 5))
plt.bar(results_df['Model'], results_df['Accuracy'], color='skyblue')
plt.title('Model Performance Comparison')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
plt.xticks(rotation=45)
plt.grid(axis='x')
plt.tight_layout()
plt.show()

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 81.9/81.9 kB 851.8 kB/s eta 0:00:00
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


MessageError: Error: credential propagation was unsuccessful