## Machine Learning Model for Fraud Detection

In [None]:
import pandas as pd
import numpy as np
import pickle

# Preprocessing
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Model Selection and Cross-Validation
from sklearn.model_selection import (
    train_test_split,
    KFold,
    cross_val_score,
    GridSearchCV,
)

# Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Metrics
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)

# Imbalanced Data Handling
from imblearn.over_sampling import SMOTE

In [2]:
fraud = pd.read_csv("complete_dataset.csv")
fraud.columns

Index(['TransactionID', 'Timestamp', 'MerchantID', 'Amount', 'CustomerID',
       'TransactionAmount', 'AnomalyScore', 'FraudIndicator', 'Category',
       'MerchantName', 'MerchantLocation', 'CustomerName', 'CustomerAge',
       'CustomerAddress'],
      dtype='object')

In [None]:
columns_to_be_dropped = [
    "TransactionID",
    "MerchantID",
    "CustomerID",
    "CustomerName",
    "MerchantName",
    "MerchantLocation",
    "CustomerAddress",
]

In [None]:
fraud1 = fraud.drop(columns_to_be_dropped, axis=1)
fraud1.columns

Index(['Timestamp', 'Amount', 'TransactionAmount', 'AnomalyScore',
       'FraudIndicator', 'Category', 'CustomerAge'],
      dtype='object')

In [5]:
fraud1["FraudIndicator"].value_counts()

FraudIndicator
0    955
1     45
Name: count, dtype: int64

This dataset is very imbalanced as the number of cases which are fraudulent are very few. Thus, the models would not be able to predict these cases very accurately.

#### Feature Engineering

In [6]:
# converting the TimeStamp to a datetime format
fraud1["Timestamp"] = pd.to_datetime(fraud1["Timestamp"])
print(fraud1.dtypes)

Timestamp            datetime64[ns]
Amount                      float64
TransactionAmount           float64
AnomalyScore                float64
FraudIndicator                int64
Category                     object
CustomerAge                   int64
dtype: object


In [7]:
# Extract useful time-based features
fraud1["Hour"] = fraud1["Timestamp"].dt.hour
fraud1["Day"] = fraud1["Timestamp"].dt.day
fraud1["Month"] = fraud1["Timestamp"].dt.month
fraud1["Weekday"] = fraud1["Timestamp"].dt.weekday
fraud1["Year"] = fraud1["Timestamp"].dt.year

In [None]:
X = fraud1.drop(["FraudIndicator", "Timestamp"], axis=1)
y = fraud1["FraudIndicator"]

In [9]:
# initializing LabelEncoder
label_encoder = LabelEncoder()

# fit and transform the Category column
X["Category"] = label_encoder.fit_transform(X["Category"])
X.head(10)

Unnamed: 0,Amount,TransactionAmount,AnomalyScore,Category,CustomerAge,Hour,Day,Month,Weekday,Year
0,55.530334,79.413607,0.686699,2,50,0,1,1,5,2022
1,12.88118,12.053087,0.081749,1,46,1,1,1,5,2022
2,50.176322,33.310357,0.023857,4,34,2,1,1,5,2022
3,41.634001,46.121117,0.876994,4,33,3,1,1,5,2022
4,78.122853,54.051618,0.034059,2,18,4,1,1,5,2022
5,86.947084,34.545138,0.121173,0,45,5,1,1,5,2022
6,51.147096,55.383113,0.109892,0,25,6,1,1,5,2022
7,56.163984,17.855878,0.780534,4,27,7,1,1,5,2022
8,37.182412,75.659944,0.010471,3,20,8,1,1,5,2022
9,17.245409,67.931879,0.029376,2,55,9,1,1,5,2022


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
# checking the sizes
X_train.shape, y_test.shape

((800, 10), (200,))

#### Logistic Regression Model

In [12]:
log_mod = LogisticRegression()

log_mod.fit(X_train, y_train)

y_pred = log_mod.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

Accuracy: 0.935
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       187
           1       0.00      0.00      0.00        13

    accuracy                           0.94       200
   macro avg       0.47      0.50      0.48       200
weighted avg       0.87      0.94      0.90       200



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale the features to [0, 1] range
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE for oversampling
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)


# Define FROST function
def generate_frost_samples(X_minority, initial_feature_index, k=5, m=1.5):
    initial_feature_values = X_minority[:, initial_feature_index]
    similarity_matrix = 1 / (
        1 + np.abs(initial_feature_values[:, np.newaxis] - initial_feature_values)
    )
    k_nearest_indices = np.argsort(similarity_matrix, axis=1)[:, -k:]
    synthetic_samples_initial = []
    for i in range(len(initial_feature_values)):
        for j in k_nearest_indices[i]:
            synthetic_value = initial_feature_values[i] + m * (
                initial_feature_values[j] - initial_feature_values[i]
            )
            synthetic_sample = np.copy(X_minority[i])
            synthetic_sample[initial_feature_index] = synthetic_value
            synthetic_samples_initial.append(synthetic_sample)
    return np.array(synthetic_samples_initial)


# Apply FROST for oversampling
initial_feature_index = 0  # Choose the index of the initial feature to oversample
X_train_frost = generate_frost_samples(
    X_train_scaled[y_train == 1], initial_feature_index, k=5, m=1.5
)

# Combine original and synthetic samples
X_train_combined = np.vstack((X_train_scaled, X_train_frost))
y_train_combined = np.concatenate((y_train, np.ones(len(X_train_frost))))

# Initialize the Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)

# Define the number of folds for k-fold cross-validation
k_folds = KFold(n_splits=5)

# Perform cross-validation and calculate the scores for SMOTE
scores_smote = cross_val_score(clf, X_train_smote, y_train_smote, cv=k_folds)

# Perform cross-validation and calculate the scores for FROST
scores_frost = cross_val_score(clf, X_train_combined, y_train_combined, cv=k_folds)

# Print the cross-validation scores for each fold
print("SMOTE Cross Validation Scores: ", scores_smote)
print("FROST Cross Validation Scores: ", scores_frost)

# Print the average cross-validation score
print("Average SMOTE CV Score: ", scores_smote.mean())
print("Average FROST CV Score: ", scores_frost.mean())

SMOTE Cross Validation Scores:  [0.84262295 0.90491803 0.93442623 0.96393443 0.94078947]
FROST Cross Validation Scores:  [0.90909091 0.90909091 0.93434343 0.93434343 0.78787879]
Average SMOTE CV Score:  0.9173382226056945
Average FROST CV Score:  0.8949494949494949


Using FROST, we have a higher score

#### Logistic Regression

In [15]:
log_mod = LogisticRegression()

log_mod.fit(X_train_smote, y_train_smote)
log_mod.fit(X_train_combined, y_train_combined)

y_predSMOTE = log_mod.predict(X_test)
y_predFROST = log_mod.predict(X_test)

print("Model Evaluation Metrics: SMOTE")
print(classification_report(y_test, y_predSMOTE))
print(confusion_matrix(y_test, y_predSMOTE))

print("\nModel Evaluation Metrics: FROST")
print(classification_report(y_test, y_predFROST))
print(confusion_matrix(y_test, y_predFROST))

Model Evaluation Metrics: SMOTE
              precision    recall  f1-score   support

           0       0.96      0.87      0.91       193
           1       0.00      0.00      0.00         7

    accuracy                           0.83       200
   macro avg       0.48      0.43      0.46       200
weighted avg       0.93      0.83      0.88       200

[[167  26]
 [  7   0]]

Model Evaluation Metrics: FROST
              precision    recall  f1-score   support

           0       0.96      0.87      0.91       193
           1       0.00      0.00      0.00         7

    accuracy                           0.83       200
   macro avg       0.48      0.43      0.46       200
weighted avg       0.93      0.83      0.88       200

[[167  26]
 [  7   0]]




### Hyperparameter Tuning with SMOTE

In [None]:
# Define a range of hyperparameters to search
param_grid = {
    "penalty": ["l1", "l2"],  # Regularization type
    "C": np.logspace(
        -3, 3, 7
    ),  # Inverse of regularization strength (smaller values for stronger regularization)
    "solver": ["liblinear"],  # Solver for l1 regularization
}

# Create a grid search with cross-validation
grid_search = GridSearchCV(log_mod, param_grid, cv=5, scoring="f1", n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train_smote, y_train_smote)

# Get the best hyperparameters and corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

# Evaluate the best model on the resampled data
y_pred = best_model.predict(X_train_smote)

print("Model Evaluation Metrics on Resampled Data- SMOTE:")
print(classification_report(y_train_smote, y_pred))
print(confusion_matrix(y_train_smote, y_pred))

Best Hyperparameters: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
Model Evaluation Metrics on Resampled Data- SMOTE:
              precision    recall  f1-score   support

           0       0.61      0.60      0.61       762
           1       0.61      0.62      0.62       762

    accuracy                           0.61      1524
   macro avg       0.61      0.61      0.61      1524
weighted avg       0.61      0.61      0.61      1524

[[460 302]
 [289 473]]


### Hyperparameter Tuning with FROST

In [None]:
# Define a range of hyperparameters to search
param_grid = {
    "penalty": ["l1", "l2"],  # Regularization type
    "C": np.logspace(
        -3, 3, 7
    ),  # Inverse of regularization strength (smaller values for stronger regularization)
    "solver": ["liblinear"],  # Solver for l1 regularization
}

# Create a grid search with cross-validation
grid_search = GridSearchCV(log_mod, param_grid, cv=5, scoring="f1", n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train_combined, y_train_combined)

# Get the best hyperparameters and corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

# Evaluate the best model on the resampled data
y_pred = best_model.predict(X_train_combined)

print("Model Evaluation Metrics on Resampled Data- FROST:")
print(classification_report(y_train_combined, y_pred))
print(confusion_matrix(y_train_combined, y_pred))

Best Hyperparameters: {'C': 0.001, 'penalty': 'l1', 'solver': 'liblinear'}
Model Evaluation Metrics on Resampled Data- FROST:
              precision    recall  f1-score   support

         0.0       0.77      1.00      0.87       762
         1.0       0.00      0.00      0.00       228

    accuracy                           0.77       990
   macro avg       0.38      0.50      0.43       990
weighted avg       0.59      0.77      0.67       990

[[762   0]
 [228   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Evaluating with SMOTE for different classifiers

In [None]:
def evaluate_classification_models(X_train_smote, y_train_smote):
    # Split the resampled data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_train_smote, y_train_smote, test_size=0.2, random_state=42
    )

    # Define a dictionary of classification models
    models = {
        "Decision Tree Classifier": DecisionTreeClassifier(),
        "Random Forest Classifier": RandomForestClassifier(),
        "Support Vector Machine (SVM)": SVC(),
        "K-Nearest Neighbors (KNN)": KNeighborsClassifier(),
        "Gradient Boosting Classifier": GradientBoostingClassifier(),
    }

    results = {}

    for model_name, model in models.items():
        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Calculate and store various metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        confusion = confusion_matrix(y_test, y_pred)

        results[model_name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Confusion Matrix": confusion,
        }

    return results


results = evaluate_classification_models(X_train_smote, y_train_smote)
for model_name, model_result in results.items():
    print(f"Results for {model_name}:")
    for metric, value in model_result.items():
        print(f"{metric}: {value}")
    print()

Results for Decision Tree Classifier:
Accuracy: 0.9475409836065574
Precision: 0.9444444444444444
Recall: 0.95625
F1 Score: 0.9503105590062112
Confusion Matrix: [[136   9]
 [  7 153]]

Results for Random Forest Classifier:
Accuracy: 0.9606557377049181
Precision: 0.9567901234567902
Recall: 0.96875
F1 Score: 0.9627329192546584
Confusion Matrix: [[138   7]
 [  5 155]]

Results for Support Vector Machine (SVM):
Accuracy: 0.9147540983606557
Precision: 0.8722222222222222
Recall: 0.98125
F1 Score: 0.9235294117647059
Confusion Matrix: [[122  23]
 [  3 157]]

Results for K-Nearest Neighbors (KNN):
Accuracy: 0.8852459016393442
Precision: 0.8205128205128205
Recall: 1.0
F1 Score: 0.9014084507042254
Confusion Matrix: [[110  35]
 [  0 160]]

Results for Gradient Boosting Classifier:
Accuracy: 0.9540983606557377
Precision: 0.9506172839506173
Recall: 0.9625
F1 Score: 0.9565217391304348
Confusion Matrix: [[137   8]
 [  6 154]]



### Evaluating using FROST for different classifiers

In [None]:
def evaluate_classification_models(X_train_combined, y_train_combined):
    # Split the resampled data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_train_combined, y_train_combined, test_size=0.2, random_state=42
    )

    # Define a dictionary of classification models
    models = {
        "Decision Tree Classifier": DecisionTreeClassifier(),
        "Random Forest Classifier": RandomForestClassifier(),
        "Support Vector Machine (SVM)": SVC(),
        "K-Nearest Neighbors (KNN)": KNeighborsClassifier(),
        "Gradient Boosting Classifier": GradientBoostingClassifier(),
    }

    results = {}

    for model_name, model in models.items():
        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Calculate and store various metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        confusion = confusion_matrix(y_test, y_pred)

        results[model_name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1,
            "Confusion Matrix": confusion,
        }

    return results


results = evaluate_classification_models(X_train_combined, y_train_combined)
for model_name, model_result in results.items():
    print(f"Results for {model_name}:")
    for metric, value in model_result.items():
        print(f"{metric}: {value}")
    print()

Results for Decision Tree Classifier:
Accuracy: 0.9242424242424242
Precision: 0.7735849056603774
Recall: 0.9318181818181818
F1 Score: 0.845360824742268
Confusion Matrix: [[142  12]
 [  3  41]]

Results for Random Forest Classifier:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix: [[154   0]
 [  0  44]]

Results for Support Vector Machine (SVM):
Accuracy: 0.8131313131313131
Precision: 0.7333333333333333
Recall: 0.25
F1 Score: 0.3728813559322034
Confusion Matrix: [[150   4]
 [ 33  11]]

Results for K-Nearest Neighbors (KNN):
Accuracy: 0.8787878787878788
Precision: 0.6470588235294118
Recall: 1.0
F1 Score: 0.7857142857142857
Confusion Matrix: [[130  24]
 [  0  44]]

Results for Gradient Boosting Classifier:
Accuracy: 0.9595959595959596
Precision: 0.95
Recall: 0.8636363636363636
F1 Score: 0.9047619047619048
Confusion Matrix: [[152   2]
 [  6  38]]



1. Random Forest Classifier
2. Gradient Boosting Algorithm
3. Decision Tree Classifier
4. K-Nearest Neighbors
5. Support Vector Machine
6. Logistic Regression

### Hyperparameter tuning for Random Forest

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_train_combined, y_train_combined, test_size=0.2, random_state=42
)

# Define the Random Forest Classifier model
rf_model = RandomForestClassifier(random_state=42)

# Define a range of hyperparameters to search
param_grid = {
    "n_estimators": [50, 100, 150],  # Number of trees in the forest
    "max_depth": [None, 10, 20, 30],  # Maximum depth of the trees
    "min_samples_split": [
        2,
        5,
        10,
    ],  # Minimum number of samples required to split an internal node
    "min_samples_leaf": [
        1,
        2,
        4,
    ],  # Minimum number of samples required to be at a leaf node
}

# Create a grid search with cross-validation
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring="f1", n_jobs=-1)

# Fit the grid search to the resampled data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and corresponding model
best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

# Train the best model on the training data
best_rf_model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = best_rf_model.predict(X_test)

# Calculate and print various metrics to evaluate the best model's performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print("Best Model Evaluation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(confusion)

Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best Model Evaluation Metrics:
Accuracy: 0.9949494949494949
Precision: 0.9777777777777777
Recall: 1.0
F1 Score: 0.9887640449438202
Confusion Matrix:
[[153   1]
 [  0  44]]


In [21]:
# saving it as a .pkl file

with open("ml_model.pkl", "wb") as file:
    pickle.dump(best_rf_model, file)

print("Model saved as 'ml_model.pkl'")

Model saved as 'ml_model.pkl'
