<a href="https://colab.research.google.com/github/matidesalegn/Improved-detection-of-fraud-cases-in-e-commerce-and-bank-transactions/blob/task-2/models/model_building_and_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preparation

### Load Processed Data
### First, let's load the processed data for both datasets:

### Processed Fraud Data with Country: ../data/processed/processed_fraud_data_with_country.csv
### Processed Credit Card Data: ../data/processed/processed_credit_card_data.csv

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
import mlflow
import mlflow.sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np
from scipy import sparse
import joblib
import os
import logging

# Configure logger
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

# Create directories if not exist
os.makedirs('models/fraud', exist_ok=True)
os.makedirs('models/credit', exist_ok=True)

logger.info("Directories created for saving models")

# Load processed data
fraud_data = pd.read_csv('../data/processed/processed_fraud_data_with_country.csv')
credit_card_data = pd.read_csv('../data/processed/processed_credit_card_data.csv')

# Ensure datetime columns are correctly parsed
fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time'])
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'])

# Feature Engineering for fraud_data
fraud_data['signup_hour'] = fraud_data['signup_time'].dt.hour
fraud_data['signup_day'] = fraud_data['signup_time'].dt.day
fraud_data['signup_month'] = fraud_data['signup_time'].dt.month
fraud_data['signup_year'] = fraud_data['signup_time'].dt.year

fraud_data['purchase_hour'] = fraud_data['purchase_time'].dt.hour
fraud_data['purchase_day'] = fraud_data['purchase_time'].dt.day
fraud_data['purchase_month'] = fraud_data['purchase_time'].dt.month
fraud_data['purchase_year'] = fraud_data['purchase_time'].dt.year

# Drop original datetime columns
fraud_data = fraud_data.drop(columns=['signup_time', 'purchase_time'])

# Identify categorical columns
categorical_columns = fraud_data.select_dtypes(include=['object']).columns.tolist()

# Frequency Encoding of categorical features
for col in categorical_columns:
    freq_encoding = fraud_data[col].value_counts().to_dict()
    fraud_data[col] = fraud_data[col].map(freq_encoding)

# Convert data types to more memory efficient types
for col in fraud_data.select_dtypes(include=['int64']).columns:
    fraud_data[col] = fraud_data[col].astype('int32')
for col in fraud_data.select_dtypes(include=['float64']).columns:
    fraud_data[col] = fraud_data[col].astype('float32')

# Feature and target separation for fraud detection data
X_fraud = fraud_data.drop(columns=['class'])
y_fraud = fraud_data['class']

# Feature and target separation for credit card data
X_credit = credit_card_data.drop(columns=['Class'])
y_credit = credit_card_data['Class']

# Handle missing values in features
imputer = SimpleImputer(strategy='mean')
X_fraud = imputer.fit_transform(X_fraud)
X_credit = imputer.fit_transform(X_credit)

# Convert X_fraud to a sparse matrix
X_fraud_sparse = sparse.csr_matrix(X_fraud)

# Handle missing values in target variables
y_fraud = y_fraud.fillna(y_fraud.mode()[0])
y_credit = y_credit.fillna(y_credit.mode()[0])

# Train-test split for fraud data
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(X_fraud_sparse, y_fraud, test_size=0.2, random_state=42)

# Train-test split for credit card data
X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(X_credit, y_credit, test_size=0.2, random_state=42)

2024-06-24 22:27:55,754 - INFO - Directories created for saving models


# Model Selection, Training, and Evaluation

### We'll use several models to compare their performance:

### Logistic Regression
### Decision Tree
### Random Forest
### Gradient Boosting
### Multi-Layer Perceptron (MLP)
### Convolutional Neural Network (CNN) (for credit card data)
### Recurrent Neural Network (RNN) or LSTM (for fraud detection data)

### Model Training and Evaluation
### Train and Evaluate Models
### Here's how you might train and evaluate models for the fraud detection dataset (X_train_fraud, y_train_fraud, X_test_fraud, y_test_fraud). Repeat a similar process for the credit card dataset.

In [12]:
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'MLP': MLPClassifier(max_iter=1000)
}

# Function to train and evaluate models
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, dataset_name, model_name):
    with mlflow.start_run(run_name=f'{model_name} on {dataset_name}'):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Log parameters and metrics
        mlflow.log_params({
            'model': model_name,
            'dataset': dataset_name,
            'test_size': 0.2,
            'random_state': 42
        })
        mlflow.log_metric('accuracy', accuracy)

        # Save the model with mlflow
        mlflow.sklearn.log_model(model, f'{model_name}_model')

        # Form path without spaces
        dataset_path = dataset_name.lower().replace(' ', '_')
        model_path = model_name.lower().replace(' ', '_').replace(' ', '_')
        local_model_path = f"models/{dataset_path}/{model_path}_model.joblib"

        # Ensure directory exists
        os.makedirs(f"models/{dataset_path}", exist_ok=True)
        joblib.dump(model, local_model_path)
        logger.info(f"Model {model_name} for {dataset_name} saved to {local_model_path}")

        # Print and log classification report
        report = classification_report(y_test, y_pred)
        logger.info(f"Model: {model_name} on {dataset_name}")
        logger.info(f"Accuracy: {accuracy}")
        logger.info(report)
        print(f"Model: {model_name}")
        print(f"Accuracy: {accuracy}")
        print(report)
        print("="*60)

# Train and evaluate models for fraud detection data
for name, model in models.items():
    train_and_evaluate_model(model, X_train_fraud, y_train_fraud, X_test_fraud, y_test_fraud, 'Fraud Data', name)

# Train and evaluate models for credit card data
for name, model in models.items():
    train_and_evaluate_model(model, X_train_credit, y_train_credit, X_test_credit, y_test_credit, 'Credit Card Data', name)

2024-06-24 22:28:28,516 - INFO - Model Logistic Regression for Fraud Data saved to models/fraud_data/logistic_regression_model.joblib
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
2024-06-24 22:28:28,542 - INFO - Model: Logistic Regression on Fraud Data
2024-06-24 22:28:28,542 - INFO - Accuracy: 0.9057009562253913
2024-06-24 22:28:28,542 - INFO -               precision    recall  f1-score   support

           0       0.91      1.00      0.95     27373
           1       0.00      0.00      0.00      2850

    accuracy                           0.91     30223
   macro avg       0.45      0.50      0.48     30223
weighted avg       0.82      0.91      0.86     30223



Model: Logistic Regression
Accuracy: 0.9057009562253913
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     27373
           1       0.00      0.00      0.00      2850

    accuracy                           0.91     30223
   macro avg       0.45      0.50      0.48     30223
weighted avg       0.82      0.91      0.86     30223



2024-06-24 22:29:01,772 - INFO - Model Decision Tree for Fraud Data saved to models/fraud_data/decision_tree_model.joblib
2024-06-24 22:29:01,807 - INFO - Model: Decision Tree on Fraud Data
2024-06-24 22:29:01,807 - INFO - Accuracy: 0.9151970353704133
2024-06-24 22:29:01,807 - INFO -               precision    recall  f1-score   support

           0       0.96      0.95      0.95     27373
           1       0.55      0.60      0.57      2850

    accuracy                           0.92     30223
   macro avg       0.75      0.77      0.76     30223
weighted avg       0.92      0.92      0.92     30223



Model: Decision Tree
Accuracy: 0.9151970353704133
              precision    recall  f1-score   support

           0       0.96      0.95      0.95     27373
           1       0.55      0.60      0.57      2850

    accuracy                           0.92     30223
   macro avg       0.75      0.77      0.76     30223
weighted avg       0.92      0.92      0.92     30223



2024-06-24 22:33:24,603 - INFO - Model Random Forest for Fraud Data saved to models/fraud_data/random_forest_model.joblib
2024-06-24 22:33:24,679 - INFO - Model: Random Forest on Fraud Data
2024-06-24 22:33:24,681 - INFO - Accuracy: 0.9564239155609966
2024-06-24 22:33:24,681 - INFO -               precision    recall  f1-score   support

           0       0.95      1.00      0.98     27373
           1       1.00      0.54      0.70      2850

    accuracy                           0.96     30223
   macro avg       0.98      0.77      0.84     30223
weighted avg       0.96      0.96      0.95     30223



Model: Random Forest
Accuracy: 0.9564239155609966
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27373
           1       1.00      0.54      0.70      2850

    accuracy                           0.96     30223
   macro avg       0.98      0.77      0.84     30223
weighted avg       0.96      0.96      0.95     30223



2024-06-24 22:34:36,584 - INFO - Model Gradient Boosting for Fraud Data saved to models/fraud_data/gradient_boosting_model.joblib
2024-06-24 22:34:36,628 - INFO - Model: Gradient Boosting on Fraud Data
2024-06-24 22:34:36,628 - INFO - Accuracy: 0.9564570029447772
2024-06-24 22:34:36,628 - INFO -               precision    recall  f1-score   support

           0       0.95      1.00      0.98     27373
           1       1.00      0.54      0.70      2850

    accuracy                           0.96     30223
   macro avg       0.98      0.77      0.84     30223
weighted avg       0.96      0.96      0.95     30223



Model: Gradient Boosting
Accuracy: 0.9564570029447772
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27373
           1       1.00      0.54      0.70      2850

    accuracy                           0.96     30223
   macro avg       0.98      0.77      0.84     30223
weighted avg       0.96      0.96      0.95     30223



2024-06-24 22:34:58,237 - INFO - Model MLP for Fraud Data saved to models/fraud_data/mlp_model.joblib
2024-06-24 22:34:58,261 - INFO - Model: MLP on Fraud Data
2024-06-24 22:34:58,261 - INFO - Accuracy: 0.7443337855275783
2024-06-24 22:34:58,266 - INFO -               precision    recall  f1-score   support

           0       0.94      0.77      0.84     27373
           1       0.19      0.52      0.28      2850

    accuracy                           0.74     30223
   macro avg       0.56      0.64      0.56     30223
weighted avg       0.87      0.74      0.79     30223



Model: MLP
Accuracy: 0.7443337855275783
              precision    recall  f1-score   support

           0       0.94      0.77      0.84     27373
           1       0.19      0.52      0.28      2850

    accuracy                           0.74     30223
   macro avg       0.56      0.64      0.56     30223
weighted avg       0.87      0.74      0.79     30223



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2024-06-24 22:35:26,992 - INFO - Model Logistic Regression for Credit Card Data saved to models/credit_card_data/logistic_regression_model.joblib
2024-06-24 22:35:27,047 - INFO - Model: Logistic Regression on Credit Card Data
2024-06-24 22:35:27,047 - INFO - Accuracy: 0.9991893701758714
2024-06-24 22:35:27,047 - INFO -               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.85      0.59      0.70        90

    accuracy                           1.00     56746
   macro avg       0.93      0.79      0.85     56746
weighted avg       1.00      1.0

Model: Logistic Regression
Accuracy: 0.9991893701758714
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.85      0.59      0.70        90

    accuracy                           1.00     56746
   macro avg       0.93      0.79      0.85     56746
weighted avg       1.00      1.00      1.00     56746



2024-06-24 22:35:57,562 - INFO - Model Decision Tree for Credit Card Data saved to models/credit_card_data/decision_tree_model.joblib
2024-06-24 22:35:57,628 - INFO - Model: Decision Tree on Credit Card Data
2024-06-24 22:35:57,628 - INFO - Accuracy: 0.9989955239135798
2024-06-24 22:35:57,628 - INFO -               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.67      0.73      0.70        90

    accuracy                           1.00     56746
   macro avg       0.83      0.87      0.85     56746
weighted avg       1.00      1.00      1.00     56746



Model: Decision Tree
Accuracy: 0.9989955239135798
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.67      0.73      0.70        90

    accuracy                           1.00     56746
   macro avg       0.83      0.87      0.85     56746
weighted avg       1.00      1.00      1.00     56746



2024-06-24 22:40:50,335 - INFO - Model Random Forest for Credit Card Data saved to models/credit_card_data/random_forest_model.joblib
2024-06-24 22:40:50,390 - INFO - Model: Random Forest on Credit Card Data
2024-06-24 22:40:50,391 - INFO - Accuracy: 0.9995241955380115
2024-06-24 22:40:50,391 - INFO -               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.98      0.71      0.83        90

    accuracy                           1.00     56746
   macro avg       0.99      0.86      0.91     56746
weighted avg       1.00      1.00      1.00     56746



Model: Random Forest
Accuracy: 0.9995241955380115
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.98      0.71      0.83        90

    accuracy                           1.00     56746
   macro avg       0.99      0.86      0.91     56746
weighted avg       1.00      1.00      1.00     56746



2024-06-24 22:46:34,957 - INFO - Model Gradient Boosting for Credit Card Data saved to models/credit_card_data/gradient_boosting_model.joblib
2024-06-24 22:46:35,006 - INFO - Model: Gradient Boosting on Credit Card Data
2024-06-24 22:46:35,006 - INFO - Accuracy: 0.9992951045007578
2024-06-24 22:46:35,006 - INFO -               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.89      0.63      0.74        90

    accuracy                           1.00     56746
   macro avg       0.95      0.82      0.87     56746
weighted avg       1.00      1.00      1.00     56746



Model: Gradient Boosting
Accuracy: 0.9992951045007578
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.89      0.63      0.74        90

    accuracy                           1.00     56746
   macro avg       0.95      0.82      0.87     56746
weighted avg       1.00      1.00      1.00     56746



2024-06-24 22:47:22,855 - INFO - Model MLP for Credit Card Data saved to models/credit_card_data/mlp_model.joblib
2024-06-24 22:47:22,901 - INFO - Model: MLP on Credit Card Data
2024-06-24 22:47:22,902 - INFO - Accuracy: 0.9982906284143376
2024-06-24 22:47:22,902 - INFO -               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.35      0.09      0.14        90

    accuracy                           1.00     56746
   macro avg       0.67      0.54      0.57     56746
weighted avg       1.00      1.00      1.00     56746



Model: MLP
Accuracy: 0.9982906284143376
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.35      0.09      0.14        90

    accuracy                           1.00     56746
   macro avg       0.67      0.54      0.57     56746
weighted avg       1.00      1.00      1.00     56746



# MLOps Steps
### Versioning and Experiment Tracking with MLflow
### For versioning and experiment tracking, we'll integrate MLflow. Ensure you have MLflow installed (pip install mlflow) and set up in your environment.

In [3]:
# Train and evaluate models for credit card data
for name, model in models.items():
    with mlflow.start_run(run_name=f'{name} on Credit Card Data'):
        model.fit(X_train_credit, y_train_credit)
        y_pred = model.predict(X_test_credit)
        accuracy = accuracy_score(y_test_credit, y_pred)

        # Log parameters and metrics
        mlflow.log_params({
            'model': name,
            'dataset': 'Credit Card Data',
            'test_size': 0.2,
            'random_state': 42
        })
        mlflow.log_metric('accuracy', accuracy)

        # Save the model
        mlflow.sklearn.log_model(model, f'{name}_model')

        # Print and log classification report
        print(f"Model: {name}")
        print(f"Accuracy: {accuracy}")
        print(classification_report(y_test_credit, y_pred))
        print("="*60)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: Logistic Regression
Accuracy: 0.9972533760585947
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      4349
         1.0       0.79      0.55      0.65        20

    accuracy                           1.00      4369
   macro avg       0.89      0.77      0.82      4369
weighted avg       1.00      1.00      1.00      4369

Model: Decision Tree
Accuracy: 0.9988555733577478
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      4349
         1.0       0.89      0.85      0.87        20

    accuracy                           1.00      4369
   macro avg       0.95      0.92      0.94      4369
weighted avg       1.00      1.00      1.00      4369

Model: Random Forest
Accuracy: 0.9990844586861982
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      4349
         1.0       1.00      0.80      0.89        20

    accuracy              