# Credit Card Fraud Detection Project

## Student: John Doe
## Date: December 2024

This notebook contains my machine learning project for detecting fraudulent credit card transactions.

## 1. Import Libraries

In [None]:
# importing necessary libraries
# standard library
import os
import pickle  # for saving model
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# for preprocessing
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# for model training
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# for evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score

# ignore warnings
warnings.filterwarnings('ignore')

## 2. Load and Explore Dataset

In [None]:
# Read the raw transactions into a DataFrame.
# Note: Download creditcard.csv from Kaggle and place in data folder
csv_path = 'data/creditcard.csv'
df = pd.read_csv(csv_path)

# Preview the first rows (the bare last expression renders as a rich table)
print("First 5 rows of dataset:")
df.head(5)

In [None]:
# Report the overall size of the dataset, then the per-column dtypes and non-null counts
row_count, column_count = df.shape
print("Dataset shape:", (row_count, column_count))
print("\nDataset info:")
df.info()

In [None]:
# Count the missing entries in every column
missing_per_column = df.isna().sum()
print("Missing values in dataset:")
print(missing_per_column)

In [None]:
# Count the transactions per class once and reuse the counts for both the
# printed summary and the bar chart
class_counts = df['Class'].value_counts()
fraud_percentage = class_counts[1] / len(df) * 100

print("Class distribution:")
print(class_counts)
print("\nPercentage of fraud transactions:", fraud_percentage, "%")

# Bar chart of the class counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 5))
class_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Normal vs Fraud Transactions')
ax.set_xlabel('Class (0=Normal, 1=Fraud)')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=0)  # keep the class labels horizontal
plt.show()

In [None]:
# Descriptive statistics for the transaction amount
print("Statistical summary of Amount column:")
amount_summary = df['Amount'].describe()
amount_summary

## 3. Data Preprocessing

In [None]:
# The 'Class' column is the target; every other column is a feature
y = df['Class']
X = df.drop(columns=['Class'])

print("Features shape:", X.shape)
print("Target shape:", y.shape)

In [None]:
# Scale the Amount and Time columns
# V1-V28 are already scaled from PCA
#
# One scaler is fitted on both columns in a single call. Re-fitting the same
# scaler column by column would leave it holding only the statistics of the
# last column fitted, so the scaler pickled at the end of the notebook could
# not be used to transform new Amount values.
#
# The fit reads the raw values from `df` (not from `X`), so re-running this
# cell always produces the same scaler and the same scaled columns.
#
# NOTE(review): the scaler is still fitted before the train/test split, so
# test-set statistics leak into the scaling. Consider fitting on the training
# rows only.
SCALED_COLUMNS = ['Amount', 'Time']

scaler = StandardScaler()
X[SCALED_COLUMNS] = scaler.fit_transform(df[SCALED_COLUMNS])

print("Scaling completed for Amount and Time columns")

In [None]:
# Hold out 30% of the rows for testing; stratify on the target so both
# partitions keep the same fraud ratio
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)

train_class_counts = y_train.value_counts()
print("\nClass distribution in training set:")
print(train_class_counts)

## 4. Handle Class Imbalance

Since the dataset is highly imbalanced, I'll use SMOTE to oversample the minority class.

In [None]:
# Oversample the minority class with SMOTE.
# Only the training partition is resampled; the test partition is left as is.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

balanced_class_counts = pd.Series(y_train_balanced).value_counts()

print("Original training set shape:", X_train.shape)
print("Balanced training set shape:", X_train_balanced.shape)
print("\nClass distribution after SMOTE:")
print(balanced_class_counts)

## 5. Model Training

I will train 3 different models and compare their performance.

In [None]:
# Candidate classifiers, keyed by the display name used in reports and plots.
# All of them share one seed, and both tree-based models share one depth cap.
model_seed = 42
max_tree_depth = 10

models = dict([
    ('Logistic Regression', LogisticRegression(random_state=model_seed, max_iter=1000)),
    ('Decision Tree', DecisionTreeClassifier(random_state=model_seed, max_depth=max_tree_depth)),
    ('Random Forest', RandomForestClassifier(random_state=model_seed, n_estimators=100,
                                             max_depth=max_tree_depth)),
])

# Filled in by the training cell: one entry per model name
results = dict()

In [None]:
# Fit every candidate on the SMOTE-balanced training data and score it on the
# held-out test set. The metric table maps each key stored in `results` to
# its scoring function and the label used when printing.
metric_table = [
    ('accuracy', accuracy_score, 'Accuracy'),
    ('precision', precision_score, 'Precision'),
    ('recall', recall_score, 'Recall'),
    ('f1_score', f1_score, 'F1-Score'),
]

for model_name, model in models.items():
    print(f"\nTraining {model_name}...")

    model.fit(X_train_balanced, y_train_balanced)
    y_pred = model.predict(X_test)

    # Score the hard predictions with every metric
    scores = {key: score_fn(y_test, y_pred) for key, score_fn, _ in metric_table}

    # Keep the fitted model and its predictions next to the scores
    results[model_name] = {'model': model, 'predictions': y_pred, **scores}

    for key, _, label in metric_table:
        print(f"{label}: {scores[key]:.4f}")

## 6. Model Evaluation and Comparison

In [None]:
# Build one row per model, with one column per metric, from the stored results.
# The mapping goes from table column name to the key used in `results`.
metric_columns = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1_score',
}

comparison_rows = [
    {'Model': name, **{column: entry[key] for column, key in metric_columns.items()}}
    for name, entry in results.items()
]
comparison_df = pd.DataFrame(comparison_rows, columns=['Model', *metric_columns])

print("\nModel Performance Comparison:")
print(comparison_df.to_string(index=False))

In [None]:
# Visualize model comparison
# Two side-by-side bar charts: every metric per model (left) and the
# F1-score alone (right).
metrics_by_model = comparison_df.set_index('Model')

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Bar plot for all metrics
metrics_by_model.plot(kind='bar', ax=axes[0])
axes[0].set_title('Model Performance Comparison')
axes[0].set_ylabel('Score')
axes[0].set_xlabel('Model')
axes[0].legend(loc='best')
axes[0].set_ylim([0, 1])
axes[0].set_xticklabels(axes[0].get_xticklabels(), rotation=45, ha='right')

# Focus on F1-Score
metrics_by_model['F1-Score'].plot(kind='bar', ax=axes[1], color=['blue', 'green', 'orange'])
axes[1].set_title('F1-Score Comparison')
axes[1].set_ylabel('F1-Score')
axes[1].set_xlabel('Model')
axes[1].set_ylim([0, 1])
axes[1].set_xticklabels(axes[1].get_xticklabels(), rotation=45, ha='right')

plt.tight_layout()

# savefig does not create missing directories, so make sure the output
# folder exists before writing (otherwise a fresh checkout fails here)
os.makedirs('figures', exist_ok=True)
plt.savefig('figures/model_comparison.png')
plt.show()

## 7. Detailed Analysis of Best Model

Based on the F1-score, let's analyze the best performing model in detail

In [None]:
# Pick the model with the highest F1-score in the comparison table, then pull
# its fitted estimator and test-set predictions out of `results`
best_row_label = comparison_df['F1-Score'].idxmax()
best_model_name = comparison_df.at[best_row_label, 'Model']

best_entry = results[best_model_name]
best_model = best_entry['model']
best_predictions = best_entry['predictions']

print(f"Best Model: {best_model_name}")
print(f"F1-Score: {best_entry['f1_score']:.4f}")

In [None]:
# Confusion Matrix for best model
# The label order is pinned to [0, 1] so the matrix rows/columns always line
# up with the Normal/Fraud names used on the axes and in the report below.
class_labels = ['Normal', 'Fraud']
cm = confusion_matrix(y_test, best_predictions, labels=[0, 1])

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title(f'Confusion Matrix - {best_model_name}')
plt.xlabel('Predicted')
plt.ylabel('Actual')

# savefig does not create missing directories, so make sure the output
# folder exists before writing
os.makedirs('figures', exist_ok=True)
plt.savefig('figures/confusion_matrix.png')
plt.show()

print("\nClassification Report:")
print(classification_report(y_test, best_predictions, target_names=class_labels))

In [None]:
# ROC Curve for all models
# One curve per trained model, computed from the predicted fraud probability
# on the test set, plus the diagonal of a random classifier for reference.
plt.figure(figsize=(10, 8))

for model_name, model_results in results.items():
    model = model_results['model']

    # Probability of the positive (fraud) class
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # ROC points and the area under the curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.3f})')

# Plot diagonal line
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves - Model Comparison')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)

# savefig does not create missing directories, so make sure the output
# folder exists before writing
os.makedirs('figures', exist_ok=True)
plt.savefig('figures/roc_curves.png')
plt.show()

## 8. Feature Importance (for tree-based models)

In [None]:
# Check if best model has feature importance
# Only estimators exposing `feature_importances_` can be ranked this way;
# for any other model a short message is printed instead.
if hasattr(best_model, 'feature_importances_'):
    # Pair every feature name with its importance, most important first
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)

    # Plot top 10 features
    plt.figure(figsize=(10, 6))
    top_features = feature_importance.head(10)
    plt.barh(range(len(top_features)), top_features['importance'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Feature Importance')
    plt.title(f'Top 10 Important Features - {best_model_name}')
    plt.gca().invert_yaxis()  # largest importance at the top of the chart

    # savefig does not create missing directories, so make sure the output
    # folder exists before writing
    os.makedirs('figures', exist_ok=True)
    plt.savefig('figures/feature_importance.png')
    plt.show()

    print("Top 10 Most Important Features:")
    print(top_features.to_string(index=False))
else:
    print(f"{best_model_name} doesn't have feature importance attribute")

## 9. Save the Best Model

In [None]:
# Save the best model
# open(..., 'wb') does not create missing directories, so make sure the
# output folder exists before writing (otherwise a fresh checkout fails here)
os.makedirs('models', exist_ok=True)

# File name is derived from the model's display name, e.g. "Random Forest"
# becomes models/random_forest_model.pkl
model_filename = f'models/{best_model_name.lower().replace(" ", "_")}_model.pkl'

with open(model_filename, 'wb') as file:
    pickle.dump(best_model, file)

print(f"Model saved as: {model_filename}")

# Also save the scaler for future use
# NOTE(review): confirm the scaler object covers every column that was scaled
# before relying on it to preprocess new data.
with open('models/scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

print("Scaler saved as: models/scaler.pkl")

## 10. Test Model Loading and Prediction

In [None]:
# Load the saved model
# The file was written by this notebook; only unpickle files you trust,
# because pickle.load can execute arbitrary code.
with open(model_filename, 'rb') as file:
    loaded_model = pickle.load(file)

# Test prediction on a sample
# The sample size is defined once; the header and the loop below follow the
# number of rows actually selected instead of a separate hard-coded count.
N_SAMPLES = 5
sample = X_test.iloc[:N_SAMPLES]
actual = y_test.iloc[:N_SAMPLES]
predictions = loaded_model.predict(sample)

print(f"Test predictions on {len(sample)} samples:")
for sample_number, (predicted_class, actual_class) in enumerate(zip(predictions, actual), start=1):
    pred_label = "Fraud" if predicted_class == 1 else "Normal"
    actual_label = "Fraud" if actual_class == 1 else "Normal"
    print(f"Sample {sample_number}: Predicted = {pred_label}, Actual = {actual_label}")

## 11. Conclusions

### Summary of Results:
- The dataset was highly imbalanced with only 0.17% fraud transactions
- Used SMOTE to balance the training data
- Trained 3 models: Logistic Regression, Decision Tree, and Random Forest
- Random Forest performed best with highest F1-score
- The model can successfully detect fraudulent transactions

### Future Improvements:
- Try more advanced models like XGBoost or Neural Networks
- Experiment with different sampling techniques
- Perform hyperparameter tuning
- Use cross-validation for more robust evaluation