# Training Credit Card Fraud dataset on multiple models

**Import modules**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, roc_auc_score, precision_score,
                             recall_score, f1_score, matthews_corrcoef,
                             confusion_matrix, classification_report)
import joblib
import warnings
warnings.filterwarnings('ignore')

**Load and Prepare Data**

In [10]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'creditcardfraud' dataset.
Path to dataset files: /kaggle/input/creditcardfraud


In [13]:
print("Loading dataset...")
df = pd.read_csv(path+'/creditcard.csv')
print(df)
print("Dataset shape: ", df.shape)
print("Class Distribution: ", df['Class'].value_counts())

Loading dataset...
            Time         V1         V2        V3        V4        V5  \
0            0.0  -1.359807  -0.072781  2.536347  1.378155 -0.338321   
1            0.0   1.191857   0.266151  0.166480  0.448154  0.060018   
2            1.0  -1.358354  -1.340163  1.773209  0.379780 -0.503198   
3            1.0  -0.966272  -0.185226  1.792993 -0.863291 -0.010309   
4            2.0  -1.158233   0.877737  1.548718  0.403034 -0.407193   
...          ...        ...        ...       ...       ...       ...   
284802  172786.0 -11.881118  10.071785 -9.834783 -2.066656 -5.364473   
284803  172787.0  -0.732789  -0.055080  2.035030 -0.738589  0.868229   
284804  172788.0   1.919565  -0.301254 -3.249640 -0.557828  2.630515   
284805  172788.0  -0.240440   0.530483  0.702510  0.689799 -0.377961   
284806  172792.0  -0.533413  -0.189733  0.703337 -0.506271 -0.012546   

              V6        V7        V8        V9  ...       V21       V22  \
0       0.462388  0.239599  0.098698  0.3

In [14]:
X = df.drop('Class', axis=1)
y = df['Class']

In [15]:
#Split data - 70/30
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print("Train set: ", X_train.shape)
print("Train set: ", X_test.shape)

Train set:  (199364, 30)
Train set:  (85443, 30)


In [16]:
#Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
#Save 5000 samples for streamlit app
test_sample = pd.DataFrame(X_test_scaled[:5000], columns=X.columns)
test_sample['Class'] = y_test.iloc[:5000].values
test_sample.to_csv('test_data.csv', index=False)
print('Test sample saved for Streamlit app.')

Test sample saved for Streamlit app.


**Define Models**

In [19]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42, max_depth=10),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10),
    'Naive Bayes': GaussianNB(),
    'XGBoost': XGBClassifier(n_estimators=100, random_state=42, max_depth=5,
                             eval_metric='logloss')
}

In [20]:
# 3. TRAIN MODELS AND EVALUATE
results = []

for name, model in models.items():
    print(f"\n{'='*50}")
    print(f"Training {name}... ")
    print('='*50)

    #Train model
    model.fit(X_train_scaled, y_train)

    #Predict
    y_pred = model.predict(X_test_scaled)
    y_pred_prob = model.predict_proba(X_test_scaled)[:,1] if hasattr(model, 'predict_proba') else y_pred

    #Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_prob)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    mcc = matthews_corrcoef(y_test, y_pred)

    #Store results
    results.append({
        'Model': name,
        'Accuracy': round(accuracy,4),
        'AUC': round(auc, 4),
        'Precision': round(precision, 4),
        'Recall': round(recall, 4),
        'F1': round(f1, 4),
        'MCC': round(mcc, 4)
    })

    # Print results
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"AUC Score: {auc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print(f"MCC Score: {mcc:.4f}")

    # Save model
    joblib.dump(model, f'{name.replace(" ", "_").lower()}_model.pkl')
    print(f"Model saved.")


Training Logistic Regression... 
Accuracy:  0.9991
AUC Score: 0.9567
Precision: 0.8505
Recall:    0.6149
F1 Score:  0.7137
MCC Score: 0.7227
Model saved.

Training Decision Tree... 
Accuracy:  0.9994
AUC Score: 0.8423
Precision: 0.8790
Recall:    0.7365
F1 Score:  0.8015
MCC Score: 0.8043
Model saved.

Training KNN... 
Accuracy:  0.9994
AUC Score: 0.9188
Precision: 0.9153
Recall:    0.7297
F1 Score:  0.8120
MCC Score: 0.8170
Model saved.

Training Random Forest... 
Accuracy:  0.9995
AUC Score: 0.9682
Precision: 0.9569
Recall:    0.7500
F1 Score:  0.8409
MCC Score: 0.8469
Model saved.

Training Naive Bayes... 
Accuracy:  0.9780
AUC Score: 0.9552
Precision: 0.0604
Recall:    0.8041
F1 Score:  0.1124
MCC Score: 0.2168
Model saved.

Training XGBoost... 
Accuracy:  0.9968
AUC Score: 0.5255
Precision: 0.0429
Recall:    0.0405
F1 Score:  0.0417
MCC Score: 0.0401
Model saved.


**Save Scaler**

In [21]:
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

**Save results to CSV**

In [22]:
results_df = pd.DataFrame(results)
results_df.to_csv('model_results.csv', index=False)
print("\n" + "="*50)
print("ALL MODELS TRAINED SUCCESSFULLY.")
print("="*50)
print("\nComparison Table:")
print(results_df.to_string(index=False))
print("\nResults saved to 'model_results.csv'")


ALL MODELS TRAINED SUCCESSFULLY.

Comparison Table:
              Model  Accuracy    AUC  Precision  Recall     F1    MCC
Logistic Regression    0.9991 0.9567     0.8505  0.6149 0.7137 0.7227
      Decision Tree    0.9994 0.8423     0.8790  0.7365 0.8015 0.8043
                KNN    0.9994 0.9188     0.9153  0.7297 0.8120 0.8170
      Random Forest    0.9995 0.9682     0.9569  0.7500 0.8409 0.8469
        Naive Bayes    0.9780 0.9552     0.0604  0.8041 0.1124 0.2168
            XGBoost    0.9968 0.5255     0.0429  0.0405 0.0417 0.0401

Results saved to 'model_results.csv'
