<a href="https://colab.research.google.com/github/matidesalegn/Improved-detection-of-fraud-cases-in-e-commerce-and-bank-transactions/blob/task-2/models/model_building_and_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preparation

### Load Processed Data
### First, let's load the processed data for both datasets:

### Processed Fraud Data with Country: ../data/processed/processed_fraud_data_with_country.csv
### Processed Credit Card Data: ../data/processed/processed_credit_card_data.csv

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
!pip install mlflow
import mlflow
import mlflow.sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer  # Import SimpleImputer
import numpy as np

# Load processed data
fraud_data = pd.read_csv('processed_fraud_data_with_country.csv')
credit_card_data = pd.read_csv('processed_credit_card_data.csv')

# Ensure datetime columns are correctly parsed
fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time'])
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'])

# Feature Engineering for fraud_data
# Extract numerical features from datetime columns
fraud_data['signup_hour'] = fraud_data['signup_time'].dt.hour
fraud_data['signup_day'] = fraud_data['signup_time'].dt.day
fraud_data['signup_month'] = fraud_data['signup_time'].dt.month
fraud_data['signup_year'] = fraud_data['signup_time'].dt.year

fraud_data['purchase_hour'] = fraud_data['purchase_time'].dt.hour
fraud_data['purchase_day'] = fraud_data['purchase_time'].dt.day
fraud_data['purchase_month'] = fraud_data['purchase_time'].dt.month
fraud_data['purchase_year'] = fraud_data['purchase_time'].dt.year

# Drop original datetime columns
fraud_data = fraud_data.drop(columns=['signup_time', 'purchase_time'])

# Identify categorical columns
categorical_columns = fraud_data.select_dtypes(include=['object']).columns.tolist()

# Encode categorical features
fraud_data = pd.get_dummies(fraud_data, columns=categorical_columns, drop_first=True)

# Check the structure and columns of the datasets
print("Fraud Data Columns:")
print(fraud_data.columns)
print("\nCredit Card Data Columns:")
print(credit_card_data.columns)

# Feature and target separation for fraud detection data
X_fraud = fraud_data.drop(columns=['class'])  # Features
y_fraud = fraud_data['class']                # Target

# Feature and target separation for credit card data
X_credit = credit_card_data.drop(columns=['Class'])  # Features
y_credit = credit_card_data['Class']                # Target

# Handle missing values in features
imputer = SimpleImputer(strategy='mean')
X_fraud = imputer.fit_transform(X_fraud)
X_credit = imputer.fit_transform(X_credit)

# Handle missing values in target variables
y_fraud = y_fraud.fillna(y_fraud.mode()[0])
y_credit = y_credit.fillna(y_credit.mode()[0])

# Train-test split for fraud data
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(X_fraud, y_fraud, test_size=0.2, random_state=42)

# Train-test split for credit card data
X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(X_credit, y_credit, test_size=0.2, random_state=42)


Collecting mlflow
  Downloading mlflow-2.14.1-py3-none-any.whl (25.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.8/25.8 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl (147 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython<4,>=3.1.9 (from mlflow)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting graphene<4 (from mlflow)
  Downloading graphene-3.3-py2.py3-none-any.whl (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

  fraud_data = pd.read_csv('processed_fraud_data_with_country.csv')


Fraud Data Columns:
Index(['user_id', 'purchase_value', 'age', 'ip_address', 'class',
       'ip_address_int', 'transaction_count', 'time_since_signup',
       'transaction_velocity', 'transaction_time_diff',
       ...
       'country_Turkmenistan_True', 'country_Ukraine_True',
       'country_United Arab Emirates_True', 'country_United Kingdom_True',
       'country_United States_True', 'country_Unknown_True',
       'country_Uruguay_True', 'country_Uzbekistan_True',
       'country_Venezuela_True', 'country_Viet Nam_True'],
      dtype='object', length=8571)

Credit Card Data Columns:
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')


# Model Selection, Training, and Evaluation

### We'll use several models to compare their performance:

### Logistic Regression
### Decision Tree
### Random Forest
### Gradient Boosting
### Multi-Layer Perceptron (MLP)
### Convolutional Neural Network (CNN) (for credit card data)
### Recurrent Neural Network (RNN) or LSTM (for fraud detection data)

### Model Training and Evaluation
### Train and Evaluate Models
### Here's how you might train and evaluate models for the fraud detection dataset (X_train_fraud, y_train_fraud, X_test_fraud, y_test_fraud). Repeat a similar process for the credit card dataset.

In [2]:
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'MLP': MLPClassifier(max_iter=1000)
}

# Train and evaluate models for fraud detection data
for name, model in models.items():
    with mlflow.start_run(run_name=f'{name} on Fraud Data'):
        model.fit(X_train_fraud, y_train_fraud)
        y_pred = model.predict(X_test_fraud)
        accuracy = accuracy_score(y_test_fraud, y_pred)

        # Log parameters and metrics
        mlflow.log_params({
            'model': name,
            'dataset': 'Fraud Data',
            'test_size': 0.2,
            'random_state': 42
        })
        mlflow.log_metric('accuracy', accuracy)

        # Save the model
        mlflow.sklearn.log_model(model, f'{name}_model')

        # Print and log classification report
        print(f"Model: {name}")
        print(f"Accuracy: {accuracy}")
        print(classification_report(y_test_fraud, y_pred))
        print("="*60)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: Logistic Regression
Accuracy: 0.9105263157894737
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      1557
           1       0.00      0.00      0.00       153

    accuracy                           0.91      1710
   macro avg       0.46      0.50      0.48      1710
weighted avg       0.83      0.91      0.87      1710

Model: Decision Tree
Accuracy: 0.9514619883040936
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1557
           1       0.99      0.46      0.63       153

    accuracy                           0.95      1710
   macro avg       0.97      0.73      0.80      1710
weighted avg       0.95      0.95      0.94      1710

Model: Random Forest
Accuracy: 0.952046783625731
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1557
           1       1.00      0.46      0.63       153

    accuracy               

# MLOps Steps
### Versioning and Experiment Tracking with MLflow
### For versioning and experiment tracking, we'll integrate MLflow. Ensure you have MLflow installed (pip install mlflow) and set up in your environment.

In [3]:
# Train and evaluate models for credit card data
for name, model in models.items():
    with mlflow.start_run(run_name=f'{name} on Credit Card Data'):
        model.fit(X_train_credit, y_train_credit)
        y_pred = model.predict(X_test_credit)
        accuracy = accuracy_score(y_test_credit, y_pred)

        # Log parameters and metrics
        mlflow.log_params({
            'model': name,
            'dataset': 'Credit Card Data',
            'test_size': 0.2,
            'random_state': 42
        })
        mlflow.log_metric('accuracy', accuracy)

        # Save the model
        mlflow.sklearn.log_model(model, f'{name}_model')

        # Print and log classification report
        print(f"Model: {name}")
        print(f"Accuracy: {accuracy}")
        print(classification_report(y_test_credit, y_pred))
        print("="*60)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model: Logistic Regression
Accuracy: 0.9972533760585947
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      4349
         1.0       0.79      0.55      0.65        20

    accuracy                           1.00      4369
   macro avg       0.89      0.77      0.82      4369
weighted avg       1.00      1.00      1.00      4369

Model: Decision Tree
Accuracy: 0.9988555733577478
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      4349
         1.0       0.89      0.85      0.87        20

    accuracy                           1.00      4369
   macro avg       0.95      0.92      0.94      4369
weighted avg       1.00      1.00      1.00      4369

Model: Random Forest
Accuracy: 0.9990844586861982
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      4349
         1.0       1.00      0.80      0.89        20

    accuracy              