### Importing Libraries & Dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import plotly.graph_objs as go
from sklearn import linear_model, ensemble, metrics, model_selection, feature_selection
from imblearn.over_sampling import SMOTE, SVMSMOTE
from sklearn.naive_bayes import GaussianNB
import mlflow

In [2]:
# Read processed data in csv
processed_data = pd.read_csv('transactions_processed.csv').\
                                    iloc[:, 1:].drop('customer_id', axis = 1)
processed_data.head()

Unnamed: 0,amt,city,Hour,category,city_pop,zip,state,Day,is_hour_23,DayOfWeek,Month,gender,is_hour_1,is_business_day,is_hour_3,is_hour_2,is_day_12,is_day_25,is_payment_day_20,is_fraud
0,793.81,0.648196,23,1.121869,19408.0,82501.0,0.066274,29,1,2,5,0.067809,0,1,0,0,0,0,0,1.0
1,224.44,0.0,17,-0.76249,1966.0,3774.0,0.568999,14,0,6,4,0.067809,0,0,0,0,0,0,0,0.0
2,139.71,0.0,13,-0.910227,47772.0,44223.0,-0.373098,5,0,6,5,0.067809,0,0,0,0,0,0,0,0.0
3,74.98,0.0,20,-0.76249,1304.0,12092.0,0.248304,12,0,1,2,-0.060481,0,1,0,0,1,0,0,0.0
4,777.94,0.155719,20,0.918497,493806.0,35229.0,0.376881,27,0,5,4,0.067809,0,0,0,0,0,0,0,1.0


### Define MVP

In [7]:
with mlflow.start_run():
    # Log the dataset to the MLflow Run. Specify the data context to indicate that the
    # dataset is used for model training and testing
    mlflow.log_artifact(artifact_path = "data", local_path = 'transactions_processed.csv')

    # Select data for training and testing the models
    x = processed_data.drop(['is_fraud'], axis = 1)
    y = processed_data.is_fraud

    # Split the data into training data and testing data
    x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=0.2, random_state = 2)

    # SMOTE (Synthetic Minority Over-sampling Technique) for balancing classes
    smote = SVMSMOTE()
    x_train, y_train = smote.fit_resample(x_train, y_train)

    #Create dictionary of selected models for experimentation
    selected_models = {
        'Logistic Regression': linear_model.LogisticRegression(),
        'Ridge Classifier': linear_model.RidgeClassifier(),
        'Naive Bayes': GaussianNB(),
    }

    results = []

    for name, model in selected_models.items():
        # Perform 5-fold cross-validation for accuracy, precision, and recall
        # Define the metrics to be computed
        scoring = ['accuracy', 'precision', 'recall']
        results_cv = model_selection.cross_validate(model, x_train, y_train, cv = 5, scoring = scoring)

        # Log the model on MLFlow
        mlflow.sklearn.log_model(model, artifact_path = f'models/{name.replace(" ", "_").lower()}')

        # Calculate the mean scores across the folds
        mean_accuracy = np.mean(results_cv['test_accuracy'])
        mean_precision = np.mean(results_cv['test_precision'])
        mean_recall = np.mean(results_cv['test_recall'])

        # Log metrics on MLFlow
        mlflow.log_metric(f"{name}_accuracy", mean_accuracy)
        mlflow.log_metric(f"{name}_precision", mean_precision)
        mlflow.log_metric(f"{name}_recall", mean_recall)

        # Append the results to the list for comparison
        results.append({
            'Model': name,
            'Accuracy': mean_accuracy,
            'Precision': mean_precision,
            'Recall': mean_recall
        })

    # Convert the list of results to a pandas DataFrame
    results_df = pd.DataFrame(results)
mlflow.end_run()


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to th

### Comparison Model's Results

In [6]:
fig = go.Figure()

# Add bar for each metric
fig.add_trace(go.Bar(
    x=results_df['Model'],
    y=results_df['Accuracy'],
    name='Accuracy',
    marker_color='#34495e',
    text=['Accuracy']*len(results_df),
    hovertemplate = '<b>%{x}</b><br>%{text}: %{y:.2f}<extra></extra>'
))

fig.add_trace(go.Bar(
    x=results_df['Model'],
    y=results_df['Precision'],
    name='Precision',
    marker_color='#95a5a6',
    text=['Precision']*len(results_df),
    hovertemplate = '<b>%{x}</b><br>%{text}: %{y:.2f}<extra></extra>'
))

fig.add_trace(go.Bar(
    x=results_df['Model'],
    y=results_df['Recall'],
    name='Recall',
    marker_color='#e74c3c',
    text=['Recall']*len(results_df),
    hovertemplate = '<b>%{x}</b><br>%{text}: %{y:.2f}<extra></extra>'
))

fig.update_layout(
    title='Model Comparison',
    xaxis_title="Model",
    yaxis_title="Score",
    barmode='group',
    bargap=0.15, # gap between bars of adjacent location coordinates
    bargroupgap=0.1 # gap between bars of the same location coordinate
)

fig.show()