In [None]:
import pandas as pd 
import json
import numpy as np

In [None]:
# Load the DataFrame that every model section below copies from.
# NOTE(review): unpickling can execute arbitrary code, so only load this file from a trusted source.
df =  pd.read_pickle('fraud_data_2.pkl')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# List the available column names; the model sections below select subsets of these.
df.columns

Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, make_scorer

In [None]:
# Columns used by the logistic-regression experiment (the target `is_fraud` is included).
lr_columns = [
    'category', 'trans_day', 'trans_month', 'trans_year',
    'amt_zscore', 'amt_deviation', 'rolling_mean_amt', 'daily_txn_count',
    'amt', 'hourly_txn_count', 'trans_dayofweek', 'trans_hour',
    'is_fraud', 'city_pop', 'gender', 'dob_yr',
    'cc_transaction_count', 'num_credit_cards',
]

# Take an independent copy of just those columns so the shared `df` is left untouched.
lr_df = df.loc[:, lr_columns].copy()


In [None]:
# One-hot encode the two string-valued columns. drop_first=True omits the first level of
# each encoded column, so k categories become k-1 indicator columns.
lr_df = pd.get_dummies(lr_df, columns=['category', 'gender'], drop_first=True)

In [None]:
# Inspect the column dtypes of the encoded frame before it is split and scaled.
lr_df.dtypes

In [None]:

# Separate the predictors from the `is_fraud` target.
XLR = lr_df.drop(columns=['is_fraud'])
yLR = lr_df['is_fraud']

# 80/20 hold-out split, stratified on the target so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(XLR, yLR, test_size=0.2, random_state=42, stratify=yLR)

# scale features for logistic regression
# The scaler is fitted on the training rows only; the test rows are transformed with the
# training statistics.
scaler = StandardScaler()

# Rebuild the DataFrames with the ORIGINAL row index. Without `index=...` the scaled frames
# get a fresh 0..n-1 index while y_train / y_test keep the shuffled index from the split,
# so any later index-aligned operation (concat, boolean masks, joins) would silently
# pair features with the wrong labels.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
# NAIVE MODEL
# Baseline logistic regression on the scaled features, using balanced class weights.
baseline_params = {'max_iter': 1000, 'class_weight': 'balanced', 'random_state': 42}
log_reg = LogisticRegression(**baseline_params).fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = log_reg.predict(X_test)

# performance
baseline_accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))
print("Accuracy:", baseline_accuracy)

In [None]:
# Rank the logistic-regression features by the absolute value of their fitted coefficient.
# The features were standardised before fitting, so magnitudes are on a comparable scale.
# (The previous `.reset_index()` only added a redundant "index" column that duplicated the
# row labels in the printed table, so it has been dropped.)
feature_importance = pd.DataFrame({'Feature': X_train.columns, 'Coefficient': np.abs(log_reg.coef_[0])})
feature_importance = feature_importance.sort_values(by='Coefficient', ascending=False)

print(feature_importance)

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [None]:
# Columns removed before fitting the random forest.
rf_excluded_cols = ['trans_date_trans_time', 'cc_num', 'state', 'trans_num', 'first_last']

# `drop` returns a new frame, so the shared `df` is left untouched.
rf_df = df.drop(columns=rf_excluded_cols)

# Show the remaining dtypes to see which columns still need encoding.
rf_df.dtypes

In [None]:

# apply label encoding to categorical columns
# Each column gets its own encoder, fitted on that column's values.
categorical_cols = ['merchant', 'category', 'gender', 'city', 'job']
rf_df = rf_df.assign(
    **{name: LabelEncoder().fit_transform(rf_df[name]) for name in categorical_cols}
)

# fit model
yRF = rf_df['is_fraud']
XRF = rf_df.drop(columns=['is_fraud'])

# Stratified 80/20 hold-out split with a fixed seed.
X_train_RF, X_test_RF, y_train_RF, y_test_RF = train_test_split(
    XRF, yRF, test_size=0.2, random_state=42, stratify=yRF
)

rf_params = {'n_estimators': 100, 'random_state': 42, 'class_weight': 'balanced'}
rf_model = RandomForestClassifier(**rf_params).fit(X_train_RF, y_train_RF)
y_pred_RF = rf_model.predict(X_test_RF)

# performance
rf_accuracy = accuracy_score(y_test_RF, y_pred_RF)
print("Accuracy:", rf_accuracy)
print(classification_report(y_test_RF, y_pred_RF))

In [None]:
# Pair each training column with the forest's importance score, largest first.
feature_importance = rf_model.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': X_train_RF.columns, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)

# Plot feature importance
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature Name')
ax.set_title('Feature Importance for Random Forest model')
ax.invert_yaxis()  # put the most important feature at the top
plt.show()

print(feature_importance_df.head(10))

In [None]:

# creating another random forest model with most important features
# Take an explicit copy of the column selection. Assigning into a frame that pandas
# still tracks as a slice of another frame raises SettingWithCopyWarning, which is what the
# previous `df.copy()` followed by `[[...]]` indexing produced.
rf_df_2 = df[['rolling_mean_amt', 'amt', 'amt_deviation', 'amt_zscore', 'trans_hour', 'category', 'is_fraud']].copy()

# encode category column
rf_df_2['category'] = LabelEncoder().fit_transform(rf_df_2['category'])

# Predictors / target for the reduced model. X_RF2 and y_RF2 are reused by the
# cross-validation cell further down.
X_RF2 = rf_df_2.drop(columns=['is_fraud'])
y_RF2 = rf_df_2['is_fraud']
X_train_RF2, X_test_RF2, y_train_RF2, y_test_RF2 = train_test_split(X_RF2, y_RF2, test_size=0.2, random_state=42, stratify=y_RF2)

# Same hyper-parameters as the first forest, so only the feature set differs.
rf_model_2 = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_model_2.fit(X_train_RF2, y_train_RF2)

y_pred_RF2 = rf_model_2.predict(X_test_RF2)

# performance
print("Accuracy:", accuracy_score(y_test_RF2, y_pred_RF2))
print(classification_report(y_test_RF2, y_pred_RF2))


In [None]:
# get feature importance
# Pair each column of the reduced feature set with its importance score, largest first.
feature_importance = rf_model_2.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': X_train_RF2.columns, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)

# graph importance
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature Name')
ax.set_title('Feature Importance for 2nd Random Forest model')
ax.invert_yaxis()  # put the most important feature at the top
plt.show()

# print the feature importance
print(feature_importance_df)

In [None]:

# now apply cross validation to this model
# use stratified cross validation due to imbalanced classes
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Per-fold results: plain accuracy, plus the full classification report as a dict.
fold_accuracies = []
fold_reports = []

# NOTE: this loop rebinds X_train, X_test, y_train, y_test and y_pred, which the
# logistic-regression cells also assigned; after it finishes they refer to the last fold.
for fold, (train_idx, test_idx) in enumerate(kf.split(X_RF2, y_RF2), 1):
    print(f"\nTraining Fold {fold}...\n")

    # Split data (kf.split yields positional indices, hence .iloc)
    X_train, X_test = X_RF2.iloc[train_idx], X_RF2.iloc[test_idx]
    y_train, y_test = y_RF2.iloc[train_idx], y_RF2.iloc[test_idx]

    # Train a fresh model for every fold
    rf_model_cv = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
    rf_model_cv.fit(X_train, y_train)

    # Predict
    y_pred = rf_model_cv.predict(X_test)

    # Evaluate performance
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    fold_accuracies.append(acc)
    fold_reports.append(report)

    print(f"Fold {fold} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

# Final mean
print(f"\nAverage Accuracy Across Folds: {np.mean(fold_accuracies):.4f}")

# Accuracy alone says little when the classes are imbalanced, so also summarise the
# macro-averaged precision / recall / F1 that were collected per fold (previously
# `fold_reports` was filled but never read).
for metric in ('precision', 'recall', 'f1-score'):
    per_fold = [fold_report['macro avg'][metric] for fold_report in fold_reports]
    print(f"Average Macro {metric} Across Folds: {np.mean(per_fold):.4f} (+/- {np.std(per_fold):.4f})")

CatBoost

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve

In [None]:
# Columns removed before fitting CatBoost.
# NOTE(review): unlike the random-forest cell, 'trans_date_trans_time' is kept here —
# confirm CatBoost can consume that column's dtype.
catboost_excluded_cols = ['cc_num', 'trans_num', 'state', 'first_last']

# `drop` returns a new frame, so the shared `df` is left untouched.
catboost_df = df.drop(columns=catboost_excluded_cols)

# Show the remaining dtypes.
catboost_df.dtypes

In [None]:
# Columns CatBoost should treat as categorical (passed via `cat_features`).
cat_features = ['merchant', 'category', 'gender', 'city', 'job']
X_C = catboost_df.drop(columns=['is_fraud'])
y_C = catboost_df['is_fraud']

# Seeded, stratified 80/20 split, matching the splits used for the other models, so the
# result is reproducible and the test set keeps the same class ratio.
X_train_C, X_test_C, y_train_C, y_test_C = train_test_split(X_C, y_C, test_size=0.2, random_state=42, stratify=y_C)

# Train
cat_model = CatBoostClassifier(n_estimators=100, cat_features=cat_features, verbose=0)
cat_model.fit(X_train_C, y_train_C)

y_pred_C = cat_model.predict(X_test_C)

# Score THIS model's predictions. The previous code passed `y_test` / `y_pred`, which at this
# point still hold the last fold of the random-forest cross-validation loop, so the printed
# accuracy belonged to a different model.
print("Accuracy:", accuracy_score(y_test_C, y_pred_C))
print(classification_report(y_test_C, y_pred_C))


In [None]:
# get feature importance
# Pair each training column with CatBoost's importance score, largest first.
feature_importance = cat_model.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': X_train_C.columns, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)

# graph importance
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature Name')
ax.set_title('Feature Importance for CatBoost model')
ax.invert_yaxis()  # put the most important feature at the top
plt.show()

# print the feature importance
print(feature_importance_df)

In [None]:
# Reduced column set for the second CatBoost model (the target `is_fraud` is included).
catboost_cols_2 = [
    'amt', 'category', 'trans_hour', 'dob_yr', 'daily_txn_count',
    'amt_zscore', 'rolling_mean_amt', 'hourly_txn_count', 'city_pop',
    'city', 'is_fraud', 'amt_deviation', 'cc_transaction_count',
]
catboost_df_2 = df.loc[:, catboost_cols_2].copy()

# Only these two of the selected columns are passed to CatBoost as categorical.
cat_features = ['category', 'city']

# Target and predictors consumed by the cross-validation cell below.
y_C2 = catboost_df_2['is_fraud']
X_C2 = catboost_df_2.drop(columns=['is_fraud'])


In [None]:
# now apply cross validation to this model


# use stratified cross validation due to imbalanced classes
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold results: plain accuracy, plus the full classification report as a dict.
fold_accuracies = []
fold_reports = []

# NOTE: X_test, y_test, y_pred and cat_model_cv are deliberately left bound after the loop;
# the importance / confusion-matrix / PR / ROC cells below read them, so those cells describe
# the LAST fold only.
for fold, (train_idx, test_idx) in enumerate(kf.split(X_C2, y_C2), 1):
    print(f"\nTraining Fold {fold}...\n")

    # Split data (kf.split yields positional indices, hence .iloc)
    X_train, X_test = X_C2.iloc[train_idx], X_C2.iloc[test_idx]
    y_train, y_test = y_C2.iloc[train_idx], y_C2.iloc[test_idx]

    # Train a fresh model for every fold
    cat_model_cv = CatBoostClassifier(n_estimators=200, cat_features=cat_features, verbose=0)
    cat_model_cv.fit(X_train, y_train)

    # Predict
    y_pred = cat_model_cv.predict(X_test)

    # Evaluate performance
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    fold_accuracies.append(acc)
    fold_reports.append(report)

    print(f"Fold {fold} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

# Final mean
print(f"\nAverage Accuracy Across Folds: {np.mean(fold_accuracies):.4f}")

# Accuracy alone says little when the classes are imbalanced, so also summarise the
# macro-averaged precision / recall / F1 that were collected per fold (previously
# `fold_reports` was filled but never read).
for metric in ('precision', 'recall', 'f1-score'):
    per_fold = [fold_report['macro avg'][metric] for fold_report in fold_reports]
    print(f"Average Macro {metric} Across Folds: {np.mean(per_fold):.4f} (+/- {np.std(per_fold):.4f})")

In [None]:

# Feature Importance
# `cat_model_cv` is reassigned on every cross-validation fold, so these scores come from
# the model trained on the last fold only.
feature_importance = cat_model_cv.get_feature_importance()
feature_importance_df = pd.DataFrame({'Feature': X_C2.columns, 'Importance': feature_importance})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot
plt.figure(figsize=(12, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.xlabel('Importance Score')
plt.ylabel('Feature Name')
plt.title('Feature Importance - CatBoost')
plt.gca().invert_yaxis()  # Highest importance at the top
plt.show()

# The label promises the top 10, so print only the first 10 rows of the sorted table
# (previously the whole table was printed under that label). The chart above still
# shows every feature.
print("\nTop 10 Features:")
print(feature_importance_df.head(10))


In [None]:

# Confusion matrix for whatever `y_test` / `y_pred` currently hold; after the CatBoost
# cross-validation cell these are the labels and predictions of its last fold.
cm = confusion_matrix(y_test, y_pred)

# plot matrix
class_labels = ["Not Fraud", "Fraud"]
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()

In [None]:


# Get probabilities
# Column 1 of predict_proba is taken as the score for the positive (second) class.
y_probs = cat_model_cv.predict_proba(X_test)[:, 1]

# Precision / recall pairs over all score thresholds (the thresholds themselves are unused).
precision, recall, _ = precision_recall_curve(y_test, y_probs)

# plot
fig, ax = plt.subplots()
ax.plot(recall, precision, marker='.')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
plt.show()

In [None]:
# get roc auc score
# Score the held-out rows with the second-column (positive-class) probabilities.
class_probabilities = cat_model_cv.predict_proba(X_test)
y_pred_probs = class_probabilities[:, 1]

# `roc_auc` and `y_pred_probs` are reused by the ROC-curve cell that follows.
roc_auc = roc_auc_score(y_test, y_pred_probs)

print(f"ROC AUC Score: {roc_auc:.4f}")


In [None]:

# Plot roc curve
# False/true positive rates over all score thresholds (the thresholds themselves are unused).
fpr, tpr, _ = roc_curve(y_test, y_pred_probs)

fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(fpr, tpr, color='blue', label=f'ROC Curve (AUC = {roc_auc:.5f})')
# Diagonal reference line for a classifier with no skill.
ax.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Classifier (AUC = 0.5)')

ax.set_xlabel("False Positive Rate (FPR)")
ax.set_ylabel("True Positive Rate (TPR)")
ax.set_title("ROC Curve")
ax.legend()
plt.show()
