In [93]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm

# Load the dataset
data = pd.read_csv('../data/processed/processed_data.csv')

# Display the first few rows
print(data.head())

   TransactionId  BatchId  AccountId  SubscriptionId  CustomerId CurrencyCode  \
0          78150    46980       2490            3535        2584          UGX   
1          75821    31755       3219            2366        2584          UGX   
2          39888    60272       2713             996        2806          UGX   
3          48738     1797       3351             974        3733          UGX   
4          41364    48941       3219            2366        3733          UGX   

   CountryCode    ProviderId     ProductId     ProductCategory  ...  \
0          0.0  ProviderId_6  ProductId_10             airtime  ...   
1          0.0  ProviderId_4   ProductId_6  financial_services  ...   
2          0.0  ProviderId_6   ProductId_1             airtime  ...   
3          0.0  ProviderId_1  ProductId_21        utility_bill  ...   
4          0.0  ProviderId_4   ProductId_6  financial_services  ...   

  PricingStrategy  FraudResult  TotalTransactionAmount  \
0       -0.349252    -0.0449

In [94]:
# Dropping unique identifier columns and other non-numeric columns
data.drop(columns=['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId'], inplace=True)

# Encoding categorical variables
data = pd.get_dummies(data, columns=['CurrencyCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId'])

# Scaling numeric features
scaler = StandardScaler()
numeric_features = ['TotalTransactionAmount', 'AverageTransactionAmount', 'TransactionCount', 'TransactionStdDev']
data[numeric_features] = scaler.fit_transform(data[numeric_features])

# Check the columns and their types to ensure all are numeric
print(data.dtypes)

CustomerId                              int64
CountryCode                           float64
Amount                                float64
Value                                 float64
TransactionStartTime                   object
PricingStrategy                       float64
FraudResult                           float64
TotalTransactionAmount                float64
AverageTransactionAmount              float64
TransactionCount                      float64
TransactionStdDev                     float64
TransactionHour                         int64
TransactionDay                          int64
TransactionMonth                        int64
TransactionYear                         int64
CurrencyCode_UGX                         bool
ProviderId_ProviderId_1                  bool
ProviderId_ProviderId_2                  bool
ProviderId_ProviderId_3                  bool
ProviderId_ProviderId_4                  bool
ProviderId_ProviderId_5                  bool
ProviderId_ProviderId_6           

In [96]:
# Convert boolean columns to integers
bool_columns = data.select_dtypes(include=['bool']).columns
data[bool_columns] = data[bool_columns].astype(int)

# Drop the 'TransactionStartTime' column since it is non-numeric
data.drop(columns=['TransactionStartTime'], inplace=True)

In [98]:
# Convert boolean columns to integers
bool_columns = data.select_dtypes(include=['bool']).columns
data[bool_columns] = data[bool_columns].astype(int)

# Check the columns and their types to ensure all are numeric
print(data.dtypes)

CustomerId                              int64
CountryCode                           float64
Amount                                float64
Value                                 float64
PricingStrategy                       float64
FraudResult                           float64
TotalTransactionAmount                float64
AverageTransactionAmount              float64
TransactionCount                      float64
TransactionStdDev                     float64
TransactionHour                         int64
TransactionDay                          int64
TransactionMonth                        int64
TransactionYear                         int64
CurrencyCode_UGX                        int32
ProviderId_ProviderId_1                 int32
ProviderId_ProviderId_2                 int32
ProviderId_ProviderId_3                 int32
ProviderId_ProviderId_4                 int32
ProviderId_ProviderId_5                 int32
ProviderId_ProviderId_6                 int32
ProductId_ProductId_1             

In [99]:
# Calculate the correlation with the target variable
correlation = data.corr()['FraudResult'].sort_values(ascending=False)
print(correlation)

FraudResult                           1.000000
Value                                 0.566739
Amount                                0.557370
TransactionStdDev                     0.355529
AverageTransactionAmount              0.339021
ProductId_ProductId_15                0.093570
ProviderId_ProviderId_3               0.076205
TotalTransactionAmount                0.063469
ProductId_ProductId_9                 0.051832
ProviderId_ProviderId_1               0.045104
ChannelId_ChannelId_3                 0.032811
ProductCategory_financial_services    0.032375
ProductId_ProductId_5                 0.028099
ProductCategory_transport             0.028099
ProductId_ProductId_13                0.022662
ProviderId_ProviderId_5               0.022490
ProductCategory_utility_bill          0.013499
ProductId_ProductId_22                0.012086
CustomerId                            0.010927
TransactionYear                       0.009811
ChannelId_ChannelId_1                 0.009080
TransactionHo

In [100]:
# Ensure the target variable is categorical
# If `FraudResult` is continuous, we need to binarize it.
# Assuming `FraudResult` should be 0 or 1, you might need to adjust this based on your data.
data['FraudResult'] = data['FraudResult'].astype(int)

In [101]:
# Define features and target
X = data.drop('FraudResult', axis=1)
y = data['FraudResult']

In [102]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [104]:
# Scale numeric features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [105]:
# Model selection and initial training
log_reg = LogisticRegression(max_iter=1000)
dec_tree = DecisionTreeClassifier()
rand_forest = RandomForestClassifier()
gbm = GradientBoostingClassifier()

In [106]:
# Initialize the models
# Hyperparameter tuning for Logistic Regression
param_grid_log_reg = {'C': [0.01, 0.1, 1, 10, 100]}
grid_search_log_reg = GridSearchCV(log_reg, param_grid_log_reg, cv=5, scoring='roc_auc')
grid_search_log_reg.fit(X_train, y_train)

In [107]:
# Hyperparameter tuning for Decision Tree
param_grid_dec_tree = {'max_depth': [None, 10, 20, 30, 40, 50], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 5, 10]}
grid_search_dec_tree = GridSearchCV(dec_tree, param_grid_dec_tree, cv=5, scoring='roc_auc')
grid_search_dec_tree.fit(X_train, y_train)

In [108]:
# Hyperparameter tuning for Random Forest
param_grid_rand_forest = {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 5]}
random_search_rand_forest = RandomizedSearchCV(rand_forest, param_grid_rand_forest, n_iter=10, cv=5, scoring='roc_auc', random_state=42)
random_search_rand_forest.fit(X_train, y_train)

In [109]:
# Hyperparameter tuning for GBM
param_grid_gbm = {'n_estimators': [100, 200, 300], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 4, 5], 'subsample': [0.8, 0.9, 1.0]}
random_search_gbm = RandomizedSearchCV(gbm, param_grid_gbm, n_iter=10, cv=5, scoring='roc_auc', random_state=42)
random_search_gbm.fit(X_train, y_train)

In [115]:
# Function to extract GridSearchCV results into a DataFrame
def extract_grid_search_results(grid_search):
    results = pd.DataFrame(grid_search.cv_results_)
    return results

In [111]:
# Extract results
results_log_reg = extract_grid_search_results(grid_search_log_reg)
results_dec_tree = extract_grid_search_results(grid_search_dec_tree)

In [112]:
# Display the results DataFrame
print("Logistic Regression GridSearchCV Results:\n", results_log_reg.head())
print("Decision Tree GridSearchCV Results:\n", results_dec_tree.head())

Logistic Regression GridSearchCV Results:
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  param_C  \
0       0.186811      0.045093         0.008554        0.005109     0.01   
1       0.316358      0.083080         0.006401        0.003482     0.10   
2       0.308531      0.064676         0.005148        0.002816     1.00   
3       0.283319      0.058158         0.006888        0.002697    10.00   
4       0.271007      0.044258         0.005339        0.002089   100.00   

        params  split0_test_score  split1_test_score  split2_test_score  \
0  {'C': 0.01}           0.994910           0.997648           0.990620   
1   {'C': 0.1}           0.989027           0.998533           0.979475   
2     {'C': 1}           0.981421           0.998250           0.966814   
3    {'C': 10}           0.983219           0.998250           0.964924   
4   {'C': 100}           0.983136           0.998258           0.963521   

   split3_test_score  split4_test_score  mean_tes

In [116]:
# Generate contour and surface plots for GridSearchCV results
def plot_grid_search_results(results, param1, param2, model_name):
    # Filter out params and scores
    params = results[['param_' + param1, 'param_' + param2, 'mean_test_score']]
    params = params.dropna()

    # Create meshgrid for plotting
    param1_values = np.array(params['param_' + param1].astype(float))
    param2_values = np.array(params['param_' + param2].astype(float))
    scores = np.array(params['mean_test_score'])

    param1_values, param2_values = np.meshgrid(np.unique(param1_values), np.unique(param2_values))
    scores = scores.reshape(param1_values.shape)

    fig = plt.figure(figsize=(12, 6))

        # Contour plot
    ax1 = fig.add_subplot(121)
    contour = ax1.contourf(param1_values, param2_values, scores, cmap=cm.coolwarm)
    fig.colorbar(contour, ax=ax1)
    ax1.set_xlabel(param1)
    ax1.set_ylabel(param2)
    ax1.set_title(f'{model_name} - Contour Plot')

    # Surface plot
    ax2 = fig.add_subplot(122, projection='3d')
    surface = ax2.plot_surface(param1_values, param2_values, scores, cmap=cm.coolwarm)
    fig.colorbar(surface, ax=ax2, shrink=0.5, aspect=5)
    ax2.set_xlabel(param1)
    ax2.set_ylabel(param2)
    ax2.set_zlabel('Mean Test Score')
    ax2.set_title(f'{model_name} - Surface Plot')
    
    plt.show()

In [117]:
# Plot results for Logistic Regression
plot_grid_search_results(results_log_reg, 'C', 'mean_test_score', 'Logistic Regression')

KeyError: "['param_mean_test_score'] not in index"

In [118]:
# Plot results for Decision Tree
plot_grid_search_results(results_dec_tree, 'max_depth', 'min_samples_split', 'Decision Tree')

ValueError: cannot reshape array of size 60 into shape (3,5)

In [119]:
# Model evaluation
models = {
    'Logistic Regression': grid_search_log_reg.best_estimator_,
    'Decision Tree': grid_search_dec_tree.best_estimator_,
    'Random Forest': random_search_rand_forest.best_estimator_,
    'Gradient Boosting': random_search_gbm.best_estimator_
}

for model_name, model in models.items():
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Precision: {precision_score(y_test, y_pred)}")
    print(f"Recall: {recall_score(y_test, y_pred)}")
    print(f"F1 Score: {f1_score(y_test, y_pred)}")
    print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_prob)}")
    print("-" * 30)

    # Plot ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc_score(y_test, y_pred_prob):.2f})')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

Model: Logistic Regression
Accuracy: 0.9982577790166904


ValueError: pos_label=1 is not a valid label. It should be one of [0, 22]