In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib

# Read the raw transactions from the CSV export.
# (To reuse a DataFrame built in an earlier step instead, replace the
# read_csv call below with: df = <your DataFrame from previous code>)
df = pd.read_csv('synthetic_fraud_dataset_india.csv')

# --- Preprocessing ---
# Parse the timestamp once, derive the calendar features from it, and discard
# the columns the model should not see: transaction_id is just an identifier
# and the raw timestamp is replaced by hour / day_of_week.
timestamps = pd.to_datetime(df['transaction_time'])
df = (
    df.assign(
        hour=timestamps.dt.hour,
        day_of_week=timestamps.dt.dayofweek,
    )
    .drop(columns=['transaction_id', 'transaction_time'])
)

# Replace every categorical column with integer codes.
# One LabelEncoder is created per column and kept in `label_encoders`
# so the same mapping can be re-applied or persisted later.
categorical_cols = ['transaction_type', 'location', 'device_id', 'ip_address']
label_encoders = {col: LabelEncoder() for col in categorical_cols}

for col, encoder in label_encoders.items():
    df[col] = encoder.fit_transform(df[col])

# Define features (X) and target (y)
X = df.drop(columns=['is_fraud'])
y = df['is_fraud']

# Split the data into training and testing sets (80% train, 20% test).
# stratify=y keeps the class proportions of `is_fraud` the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# train_test_split returns row subsets of X. Take explicit copies so the
# column assignments below write into independent frames instead of
# triggering pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardize numerical features.
# The scaler is fitted on the training rows only and then re-used, unchanged,
# on the test rows so no test-set statistics influence the transformation.
# NOTE(review): 'user_id' looks like an identifier rather than a measurement —
# confirm it really belongs among the scaled numeric features.
scaler = StandardScaler()
numerical_cols = ['transaction_amount', 'user_id', 'hour', 'day_of_week']  # Adjust based on your needs
# Whole-column assignment (not .loc) so the integer columns can be replaced by
# the float output of the scaler.
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# --- Model Training ---
# Build a 100-tree random forest (fixed seed, all CPU cores) and fit it on
# the training partition; fit() returns the estimator itself.
forest_params = {'n_estimators': 100, 'random_state': 42, 'n_jobs': -1}
rf_model = RandomForestClassifier(**forest_params).fit(X_train, y_train)

# --- Evaluation ---
# Score the held-out rows: second predict_proba column (used for ROC-AUC)
# and hard class labels.
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]
y_pred = rf_model.predict(X_test)

# Report test-set metrics: each heading is followed by its value on the next line.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

print("\nROC-AUC Score:", roc_auc_score(y_test, y_pred_proba), sep="\n")

# --- Feature Importance ---
# Pair every feature column with the forest's importance score and list
# them from most to least important.
importance_table = pd.DataFrame(
    dict(feature=X.columns, importance=rf_model.feature_importances_)
)
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("\nFeature Importance:", feature_importance, sep="\n")

# --- Save the Model ---
# Persist the fitted estimator and the scaler so they can be reloaded later.
for artifact, filename in (
    (rf_model, 'fraud_detection_rf_model.pkl'),
    (scaler, 'scaler.pkl'),
):
    joblib.dump(artifact, filename)
print("\nModel and scaler saved as 'fraud_detection_rf_model.pkl' and 'scaler.pkl'")

# Optional: also persist each per-column label encoder for deployment,
# one file per categorical column.
for col in label_encoders:
    joblib.dump(label_encoders[col], f'label_encoder_{col}.pkl')

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      1000
           1       0.87      0.88      0.87      1000

    accuracy                           0.87      2000
   macro avg       0.87      0.87      0.87      2000
weighted avg       0.87      0.87      0.87      2000


Confusion Matrix:
[[873 127]
 [125 875]]

ROC-AUC Score:
0.9427795000000001

Feature Importance:
              feature  importance
1  transaction_amount    0.418414
7                hour    0.114659
4           device_id    0.111175
5          ip_address    0.109755
0             user_id    0.103060
3            location    0.050907
2    transaction_type    0.037421
8         day_of_week    0.031113
6           is_mobile    0.023496

Model and scaler saved as 'fraud_detection_rf_model.pkl' and 'scaler.pkl'
