## Machine Learning Model for Fraud Detection

In [1]:
import pandas as pd
import numpy as np
import pickle

# Preprocessing
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Model Selection and Cross-Validation
from sklearn.model_selection import (
    train_test_split,
    KFold,
    cross_val_score,
    GridSearchCV,
)

# Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Metrics
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)

# Imbalanced Data Handling
from imblearn.over_sampling import SMOTE

In [2]:
fraud = pd.read_csv("complete_dataset.csv")
fraud.columns

Index(['TransactionID', 'Timestamp', 'MerchantID', 'Amount', 'CustomerID',
       'TransactionAmount', 'AnomalyScore', 'FraudIndicator', 'Category',
       'MerchantName', 'MerchantLocation', 'CustomerName', 'CustomerAge',
       'CustomerAddress', 'AccountBalance', 'LastLogin', 'SuspiciousFlag'],
      dtype='object')

In [3]:
columns_to_be_dropped = [
    "TransactionID",
    "MerchantID",
    "CustomerID",
    "CustomerName",
    "MerchantName",
    "MerchantLocation",
    "CustomerAddress",
]

In [4]:
fraud1 = fraud.drop(columns_to_be_dropped, axis=1)
fraud1.columns

Index(['Timestamp', 'Amount', 'TransactionAmount', 'AnomalyScore',
       'FraudIndicator', 'Category', 'CustomerAge', 'AccountBalance',
       'LastLogin', 'SuspiciousFlag'],
      dtype='object')

In [5]:
fraud1["FraudIndicator"].value_counts()

FraudIndicator
0    955
1     45
Name: count, dtype: int64

This dataset is very imbalanced as the number of cases which are fraudulent are very few. Thus, the models would not be able to predict these cases very accurately.

#### Feature Engineering

In [7]:
# converting the TimeStamp to a datetime format
fraud1["Timestamp"] = pd.to_datetime(fraud1["Timestamp"])
fraud1["LastLogin"] = pd.to_datetime(fraud1["LastLogin"])
print(fraud1.dtypes)

Timestamp            datetime64[ns]
Amount                      float64
TransactionAmount           float64
AnomalyScore                float64
FraudIndicator                int64
Category                     object
CustomerAge                   int64
AccountBalance              float64
LastLogin            datetime64[ns]
SuspiciousFlag                int64
dtype: object


In [None]:
fraud1["gap"] = (fraud1["Timestamp"] - fraud1["LastLogin"]).dt.days.abs()

In [10]:
# Extract useful time-based features
fraud1["Hour"] = fraud1["Timestamp"].dt.hour
fraud1["Day"] = fraud1["Timestamp"].dt.day
fraud1["Month"] = fraud1["Timestamp"].dt.month
fraud1["Weekday"] = fraud1["Timestamp"].dt.weekday
fraud1["Year"] = fraud1["Timestamp"].dt.year

In [11]:
X = fraud1.drop(["FraudIndicator", "LastLogin", "Timestamp"], axis=1)
y = fraud1["FraudIndicator"]

In [12]:
# initializing LabelEncoder
label_encoder = LabelEncoder()

# fit and transform the Category column
X["Category"] = label_encoder.fit_transform(X["Category"])
X.head(10)

Unnamed: 0,Amount,TransactionAmount,AnomalyScore,Category,CustomerAge,AccountBalance,SuspiciousFlag,gap,Hour,Day,Month,Weekday,Year
0,55.530334,79.413607,0.686699,2,50,2869.689912,0,951,0,1,1,5,2022
1,12.88118,12.053087,0.081749,1,46,9527.947107,0,26,1,1,1,5,2022
2,50.176322,33.310357,0.023857,4,34,9288.355525,0,954,2,1,1,5,2022
3,41.634001,46.121117,0.876994,4,33,5588.049942,0,795,3,1,1,5,2022
4,78.122853,54.051618,0.034059,2,18,7324.785332,0,945,4,1,1,5,2022
5,86.947084,34.545138,0.121173,0,45,3152.247787,0,203,5,1,1,5,2022
6,51.147096,55.383113,0.109892,0,25,9253.478917,0,310,6,1,1,5,2022
7,56.163984,17.855878,0.780534,4,27,8765.035861,0,692,7,1,1,5,2022
8,37.182412,75.659944,0.010471,3,20,8935.007146,0,346,8,1,1,5,2022
9,17.245409,67.931879,0.029376,2,55,5541.197921,0,573,9,1,1,5,2022


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [14]:
# checking the sizes
X_train.shape, y_test.shape

((800, 13), (200,))

#### Logistic Regression Model

In [15]:
log_mod = LogisticRegression()

log_mod.fit(X_train, y_train)

y_pred = log_mod.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

Accuracy: 0.96
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       192
           1       0.00      0.00      0.00         8

    accuracy                           0.96       200
   macro avg       0.48      0.50      0.49       200
weighted avg       0.92      0.96      0.94       200



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale the features to [0, 1] range
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE for oversampling
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)


# Define FROST function
def generate_frost_samples(X_minority, initial_feature_index, k=5, m=1.5):
    initial_feature_values = X_minority[:, initial_feature_index]
    similarity_matrix = 1 / (
        1 + np.abs(initial_feature_values[:, np.newaxis] - initial_feature_values)
    )
    k_nearest_indices = np.argsort(similarity_matrix, axis=1)[:, -k:]
    synthetic_samples_initial = []
    for i in range(len(initial_feature_values)):
        for j in k_nearest_indices[i]:
            synthetic_value = initial_feature_values[i] + m * (
                initial_feature_values[j] - initial_feature_values[i]
            )
            synthetic_sample = np.copy(X_minority[i])
            synthetic_sample[initial_feature_index] = synthetic_value
            synthetic_samples_initial.append(synthetic_sample)
    return np.array(synthetic_samples_initial)


# Apply FROST for oversampling
initial_feature_index = 0  # Choose the index of the initial feature to oversample
X_train_frost = generate_frost_samples(
    X_train_scaled[y_train == 1], initial_feature_index, k=5, m=1.5
)

# Combine original and synthetic samples
X_train_combined = np.vstack((X_train_scaled, X_train_frost))
y_train_combined = np.concatenate((y_train, np.ones(len(X_train_frost))))

# Initialize the Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)

# Define the number of folds for k-fold cross-validation
k_folds = KFold(n_splits=5)

# Perform cross-validation and calculate the scores for SMOTE
scores_smote = cross_val_score(clf, X_train_smote, y_train_smote, cv=k_folds)

# Perform cross-validation and calculate the scores for FROST
scores_frost = cross_val_score(clf, X_train_combined, y_train_combined, cv=k_folds)

# Print the cross-validation scores for each fold
print("SMOTE Cross Validation Scores: ", scores_smote)
print("FROST Cross Validation Scores: ", scores_frost)

# Print the average cross-validation score
print("Average SMOTE CV Score: ", scores_smote.mean())
print("Average FROST CV Score: ", scores_frost.mean())

SMOTE Cross Validation Scores:  [0.85901639 0.87868852 0.9442623  0.93114754 0.86842105]
FROST Cross Validation Scores:  [0.94444444 0.92929293 0.94949495 0.95454545 0.83333333]
Average SMOTE CV Score:  0.896307161345988
Average FROST CV Score:  0.9222222222222222


Using FROST, we have a higher score

#### Logistic Regression

In [17]:
log_mod = LogisticRegression()

log_mod.fit(X_train_smote, y_train_smote)
log_mod.fit(X_train_combined, y_train_combined)

y_predSMOTE = log_mod.predict(X_test)
y_predFROST = log_mod.predict(X_test)

print("Model Evaluation Metrics: SMOTE")
print(classification_report(y_test, y_predSMOTE))
print(confusion_matrix(y_test, y_predSMOTE))

print("\nModel Evaluation Metrics: FROST")
print(classification_report(y_test, y_predFROST))
print(confusion_matrix(y_test, y_predFROST))

Model Evaluation Metrics: SMOTE
              precision    recall  f1-score   support

           0       0.99      0.45      0.61       193
           1       0.05      0.86      0.10         7

    accuracy                           0.46       200
   macro avg       0.52      0.65      0.36       200
weighted avg       0.96      0.46      0.60       200

[[ 86 107]
 [  1   6]]

Model Evaluation Metrics: FROST
              precision    recall  f1-score   support

           0       0.99      0.45      0.61       193
           1       0.05      0.86      0.10         7

    accuracy                           0.46       200
   macro avg       0.52      0.65      0.36       200
weighted avg       0.96      0.46      0.60       200

[[ 86 107]
 [  1   6]]




### Hyperparameter Tuning with SMOTE

In [18]:
# Define a range of hyperparameters to search
param_grid = {
    "penalty": ["l1", "l2"],  # Regularization type
    "C": np.logspace(
        -3, 3, 7
    ),  # Inverse of regularization strength (smaller values for stronger regularization)
    "solver": ["liblinear"],  # Solver for l1 regularization
}

# Create a grid search with cross-validation
grid_search = GridSearchCV(log_mod, param_grid, cv=5, scoring="f1", n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train_smote, y_train_smote)

# Get the best hyperparameters and corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

# Evaluate the best model on the resampled data
y_pred = best_model.predict(X_train_smote)

print("Model Evaluation Metrics on Resampled Data- SMOTE:")
print(classification_report(y_train_smote, y_pred))
print(confusion_matrix(y_train_smote, y_pred))

Best Hyperparameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Model Evaluation Metrics on Resampled Data- SMOTE:
              precision    recall  f1-score   support

           0       0.65      0.59      0.62       762
           1       0.62      0.68      0.65       762

    accuracy                           0.63      1524
   macro avg       0.64      0.63      0.63      1524
weighted avg       0.64      0.63      0.63      1524

[[447 315]
 [242 520]]


### Hyperparameter Tuning with FROST

In [19]:
# Define a range of hyperparameters to search
param_grid = {
    "penalty": ["l1", "l2"],  # Regularization type
    "C": np.logspace(
        -3, 3, 7
    ),  # Inverse of regularization strength (smaller values for stronger regularization)
    "solver": ["liblinear"],  # Solver for l1 regularization
}

# Create a grid search with cross-validation
grid_search = GridSearchCV(log_mod, param_grid, cv=5, scoring="f1", n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train_combined, y_train_combined)

# Get the best hyperparameters and corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

# Evaluate the best model on the resampled data
y_pred = best_model.predict(X_train_combined)

print("Model Evaluation Metrics on Resampled Data- FROST:")
print(classification_report(y_train_combined, y_pred))
print(confusion_matrix(y_train_combined, y_pred))

Best Hyperparameters: {'C': 10.0, 'penalty': 'l1', 'solver': 'liblinear'}
Model Evaluation Metrics on Resampled Data- FROST:
              precision    recall  f1-score   support

         0.0       0.78      0.99      0.87       762
         1.0       0.57      0.05      0.10       228

    accuracy                           0.77       990
   macro avg       0.67      0.52      0.48       990
weighted avg       0.73      0.77      0.69       990

[[753   9]
 [216  12]]


### Evaluating with SMOTE for different classifiers

In [20]:
def evaluate_classification_models(X_train_smote, y_train_smote):
    # Split the resampled data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_train_smote, y_train_smote, test_size=0.2, random_state=42
    )

    # Define a dictionary of classification models
    models = {
        "Decision Tree Classifier": DecisionTreeClassifier(),
        "Random Forest Classifier": RandomForestClassifier(),
        "Support Vector Machine (SVM)": SVC(),
        "K-Nearest Neighbors (KNN)": KNeighborsClassifier(),
        "Gradient Boosting Classifier": GradientBoostingClassifier(),
    }

    results = {}

    for model_name, model in models.items():
        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Calculate and store various metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        confusion = confusion_matrix(y_test, y_pred)

        results[model_name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Confusion Matrix": confusion,
        }

    return results


results = evaluate_classification_models(X_train_smote, y_train_smote)
for model_name, model_result in results.items():
    print(f"Results for {model_name}:")
    for metric, value in model_result.items():
        print(f"{metric}: {value}")
    print()

Results for Decision Tree Classifier:
Accuracy: 0.9311475409836065
Precision: 0.9161676646706587
Recall: 0.95625
F1 Score: 0.9357798165137615
Confusion Matrix: [[131  14]
 [  7 153]]

Results for Random Forest Classifier:
Accuracy: 0.9704918032786886
Precision: 0.9575757575757575
Recall: 0.9875
F1 Score: 0.9723076923076923
Confusion Matrix: [[138   7]
 [  2 158]]

Results for Support Vector Machine (SVM):
Accuracy: 0.9245901639344263
Precision: 0.8743169398907104
Recall: 1.0
F1 Score: 0.9329446064139941
Confusion Matrix: [[122  23]
 [  0 160]]

Results for K-Nearest Neighbors (KNN):
Accuracy: 0.839344262295082
Precision: 0.7655502392344498
Recall: 1.0
F1 Score: 0.8672086720867209
Confusion Matrix: [[ 96  49]
 [  0 160]]

Results for Gradient Boosting Classifier:
Accuracy: 0.9311475409836065
Precision: 0.9483870967741935
Recall: 0.91875
F1 Score: 0.9333333333333333
Confusion Matrix: [[137   8]
 [ 13 147]]



### Evaluating using FROST for different classifiers

In [21]:
def evaluate_classification_models(X_train_combined, y_train_combined):
    # Split the resampled data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_train_combined, y_train_combined, test_size=0.2, random_state=42
    )

    # Define a dictionary of classification models
    models = {
        "Decision Tree Classifier": DecisionTreeClassifier(),
        "Random Forest Classifier": RandomForestClassifier(),
        "Support Vector Machine (SVM)": SVC(),
        "K-Nearest Neighbors (KNN)": KNeighborsClassifier(),
        "Gradient Boosting Classifier": GradientBoostingClassifier(),
    }

    results = {}

    for model_name, model in models.items():
        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Calculate and store various metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        confusion = confusion_matrix(y_test, y_pred)

        results[model_name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Confusion Matrix": confusion,
        }

    return results


results = evaluate_classification_models(X_train_combined, y_train_combined)
for model_name, model_result in results.items():
    print(f"Results for {model_name}:")
    for metric, value in model_result.items():
        print(f"{metric}: {value}")
    print()

Results for Decision Tree Classifier:
Accuracy: 0.9343434343434344
Precision: 0.803921568627451
Recall: 0.9318181818181818
F1 Score: 0.8631578947368421
Confusion Matrix: [[144  10]
 [  3  41]]

Results for Random Forest Classifier:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix: [[154   0]
 [  0  44]]

Results for Support Vector Machine (SVM):
Accuracy: 0.8484848484848485
Precision: 0.85
Recall: 0.38636363636363635
F1 Score: 0.53125
Confusion Matrix: [[151   3]
 [ 27  17]]

Results for K-Nearest Neighbors (KNN):
Accuracy: 0.8737373737373737
Precision: 0.6376811594202898
Recall: 1.0
F1 Score: 0.7787610619469026
Confusion Matrix: [[129  25]
 [  0  44]]

Results for Gradient Boosting Classifier:
Accuracy: 0.9696969696969697
Precision: 0.9318181818181818
Recall: 0.9318181818181818
F1 Score: 0.9318181818181818
Confusion Matrix: [[151   3]
 [  3  41]]



1. Random Forest Classifier
2. Gradient Boosting Algorithm
3. Decision Tree Classifier
4. K-Nearest Neighbors
5. Support Vector Machine
6. Logistic Regression

### Hyperparameter tuning for Random Forest

In [22]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_train_combined, y_train_combined, test_size=0.2, random_state=42
)

# Define the Random Forest Classifier model
rf_model = RandomForestClassifier(random_state=42)

# Define a range of hyperparameters to search
param_grid = {
    "n_estimators": [50, 100, 150],  # Number of trees in the forest
    "max_depth": [None, 10, 20, 30],  # Maximum depth of the trees
    "min_samples_split": [
        2,
        5,
        10,
    ],  # Minimum number of samples required to split an internal node
    "min_samples_leaf": [
        1,
        2,
        4,
    ],  # Minimum number of samples required to be at a leaf node
}

# Create a grid search with cross-validation
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring="f1", n_jobs=-1)

# Fit the grid search to the resampled data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and corresponding model
best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

# Train the best model on the training data
best_rf_model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = best_rf_model.predict(X_test)

# Calculate and print various metrics to evaluate the best model's performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print("Best Model Evaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(confusion)

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best Model Evaluation Metrics:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix:
[[154   0]
 [  0  44]]


In [23]:
# saving it as a .pkl file

with open("ml_model.pkl", "wb") as file:
    pickle.dump(best_rf_model, file)

print("Model saved as 'ml_model.pkl'")

Model saved as 'ml_model.pkl'
