# In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Columns the analysis treats as unneeded features
remove_columns = ['zipcodeOri', 'zipMerchant']

# Read the transactions CSV and discard the unneeded columns in one step
df = pd.read_csv("bs140513_032310.csv").drop(columns=remove_columns)

# Feature engineering: two per-group aggregates broadcast back onto every row
# NOTE(review): both aggregates are computed over the whole dataset, i.e.
# before the train/validation/test split further down, so held-out rows
# contribute to feature values seen during training -- confirm this is intended.

# Mean transaction amount of the row's merchant
amount_by_merchant = df['amount'].groupby(df['merchant'])
df['transaction_amount_per_merchant'] = amount_by_merchant.transform('mean')

# Number of non-null 'step' entries recorded for the row's customer
step_by_customer = df['step'].groupby(df['customer'])
df['transaction_frequency'] = step_by_customer.transform('count')

# Replace each categorical column with integer codes; a fresh encoder is
# fitted per column, so codes are independent between columns
categorical_columns = ('customer', 'age', 'gender', 'merchant', 'category')
for column in categorical_columns:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])

# The 'fraud' column is the target; every remaining column is a feature
y = df['fraud']
X = df.drop(columns=['fraud'])

# Displaying the class distribution
# reset_index already returns a DataFrame with columns 'fraud' and 'number'.
class_df = df['fraud'].value_counts().rename_axis('fraud').reset_index(name='number')

# Assign the relabelled column back rather than calling
# replace(..., inplace=True) on the `class_df['fraud']` selection: the in-place
# form acts on an intermediate object, which pandas 2.x warns about and which
# leaves `class_df` unchanged once Copy-on-Write is enabled.
class_df['fraud'] = class_df['fraud'].replace({0: 'Normal', 1: 'Fraud'})

# Draw on an explicitly created figure/axes pair.
fig, ax = plt.subplots()
sns.barplot(x=class_df['fraud'], y=class_df['number'], ax=ax)
ax.bar_label(ax.containers[0], color='black')
ax.set_title('Normal vs Fraud')

# Show the chart here instead of leaving it as the current figure, where a
# later plotting call that targets the current axes would draw on top of it.
plt.show()

# Splitting the data into training (70%), validation (15%), and test (15%) sets.
# The target is imbalanced, so both splits are stratified on the label to keep
# the fraud ratio the same in every subset; an unstratified split lets the
# minority-class share drift between subsets and skews the reported metrics.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Applying SMOTE to the imbalanced data.
# Only the training split is oversampled; validation and test sets keep their
# original class distribution so the evaluation reflects real data.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

# Hyperparameter tuning for each base model
#
# Every search runs on the original (un-resampled) training split, with SMOTE
# as the first step of an imbalanced-learn Pipeline. Cross-validating on data
# that was oversampled beforehand places synthetic points -- interpolated from
# samples that end up in the held-out fold -- into the fitting folds, and scores
# candidates on artificially balanced folds; both inflate the F1 estimate.
# Inside the pipeline SMOTE is fitted on the fitting folds only and each
# held-out fold keeps its real class distribution.


def _tune_with_smote(estimator, param_grid, X, y):
    """Grid-search ``estimator`` with SMOTE applied inside each CV fold.

    Parameters
    ----------
    estimator : classifier instance to tune.
    param_grid : dict mapping the estimator's own parameter names to the
        candidate values to try.
    X, y : un-resampled training features and labels.

    Returns
    -------
    The fitted ``GridSearchCV``. Its ``best_estimator_`` is a two-step
    pipeline (``'smote'``, ``'clf'``) and the keys of ``best_params_`` carry
    a ``'clf__'`` prefix.
    """
    pipeline = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('clf', estimator),
    ])
    # Pipeline parameters are addressed as '<step name>__<parameter name>'.
    step_grid = {f'clf__{name}': values for name, values in param_grid.items()}
    search = GridSearchCV(pipeline, step_grid, cv=3, scoring='f1')
    search.fit(X, y)
    return search


# 1. RandomForestClassifier
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
}
grid_search_rf = _tune_with_smote(
    RandomForestClassifier(random_state=42), param_grid_rf, X_train, y_train
)
# Keep only the tuned classifier; the SMOTE step is not part of the model.
best_rf_model = grid_search_rf.best_estimator_.named_steps['clf']

# 2. XGBClassifier
param_grid_xgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
}
grid_search_xgb = _tune_with_smote(
    XGBClassifier(random_state=42), param_grid_xgb, X_train, y_train
)
best_xgb_model = grid_search_xgb.best_estimator_.named_steps['clf']

# 3. LogisticRegression
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs']
}
grid_search_lr = _tune_with_smote(
    LogisticRegression(max_iter=1000, random_state=42), param_grid_lr, X_train, y_train
)
best_lr_model = grid_search_lr.best_estimator_.named_steps['clf']

# 4. KNeighborsClassifier
param_grid_knn = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance']
}
grid_search_knn = _tune_with_smote(
    KNeighborsClassifier(), param_grid_knn, X_train, y_train
)
best_knn_model = grid_search_knn.best_estimator_.named_steps['clf']

# 5. DecisionTreeClassifier
param_grid_dt = {
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
}
grid_search_dt = _tune_with_smote(
    DecisionTreeClassifier(random_state=42), param_grid_dt, X_train, y_train
)
best_dt_model = grid_search_dt.best_estimator_.named_steps['clf']

# GaussianNB joins the ensemble with its default settings; no grid search is
# run for it in this script.

# Base models keyed by the short name each one carries inside the ensemble
tuned_estimators = {
    'rf': best_rf_model,
    'xgb': best_xgb_model,
    'nb': GaussianNB(),
    'lr': best_lr_model,
    'knn': best_knn_model,
    'dt': best_dt_model,
}
base_models = list(tuned_estimators.items())

# Majority ("hard") voting over the base models' predicted labels, fitted on
# the SMOTE-resampled training data (fit returns the classifier itself)
ensemble_classifier = VotingClassifier(base_models, voting='hard').fit(X_res, y_res)

def _evaluate_predictions(y_true, y_pred, split_label):
    """Print classification metrics and plot the confusion matrix for one split.

    Parameters
    ----------
    y_true : true labels of the split.
    y_pred : labels predicted for the same rows.
    split_label : word inserted into the printed header and the plot title
        (e.g. ``'Validation'``).

    Returns
    -------
    Tuple ``(accuracy, f1, precision, recall, confusion_matrix)``.
    """
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    print(f"\nEnsemble Model Performance on {split_label} Data:")
    print("Accuracy:", accuracy)
    print("F1-score:", f1)
    print("Precision:", precision)
    print("Recall:", recall)

    conf_matrix = confusion_matrix(y_true, y_pred)
    # Give every confusion matrix its own figure: sns.heatmap otherwise draws
    # on whatever axes are current, which can be an earlier, still-open plot.
    _, ax = plt.subplots()
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(f'Confusion Matrix ({split_label} Data)')
    plt.show()

    return accuracy, f1, precision, recall, conf_matrix


# Predicting using the ensemble model on the validation set
y_pred_val_ensemble = ensemble_classifier.predict(X_val)

# Evaluating the ensemble model's performance on validation data
(
    accuracy_val_ensemble,
    f1_val_ensemble,
    precision_val_ensemble,
    recall_val_ensemble,
    conf_matrix_val_ensemble,
) = _evaluate_predictions(y_val, y_pred_val_ensemble, 'Validation')

# Predicting using the ensemble model on testing data
y_pred_test_ensemble = ensemble_classifier.predict(X_test)

# Evaluating the ensemble model's performance on testing data
(
    accuracy_test_ensemble,
    f1_test_ensemble,
    precision_test_ensemble,
    recall_test_ensemble,
    conf_matrix_test_ensemble,
) = _evaluate_predictions(y_test, y_pred_test_ensemble, 'Testing')
