In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str or path-like or file-like
        Anything accepted by ``pandas.read_csv`` as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV, using ``read_csv`` defaults.
    """
    frame = pd.read_csv(file_path)
    return frame

# Location of the raw transactions CSV. The default is a machine-specific
# absolute path; set the FRAUD_DATA_PATH environment variable to point the
# notebook at the file on another machine without editing this cell.
file_path = os.environ.get(
    'FRAUD_DATA_PATH',
    'C:/Users/nejat/AIM Projects/week6 data/data.csv',
)
df = load_data(file_path)
print(df.head())


         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId CurrencyCode  CountryCode    ProviderId     ProductId  \
0  CustomerId_4406          UGX          256  ProviderId_6  ProductId_10   
1  CustomerId_4406          UGX          256  ProviderId_4   ProductId_6   
2  CustomerId_4683          UGX          256  ProviderId_6   ProductId_1   
3   CustomerId_988          UGX          256  ProviderId_1  ProductId_21   
4   CustomerId_988          UGX          256  ProviderId_4   ProductId_6   

      ProductCategory    ChannelId   Amount  Value  TransactionStart

In [None]:
def split_data(df, target_column, test_size=0.2, random_state=42, stratify=False):
    """Split a DataFrame into train/test feature frames and target series.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the target column.
    target_column : str
        Name of the column to predict; it is removed from the features.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so the split is reproducible.
    stratify : bool, default False
        When True, the split is stratified on the target so both partitions
        keep the same class proportions (useful for imbalanced targets).
        The default keeps the original, non-stratified behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``df``.
    """
    if target_column not in df.columns:
        raise KeyError(f"Target column {target_column!r} not found in DataFrame")

    X = df.drop(columns=[target_column])
    y = df[target_column]

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    return X_train, X_test, y_train, y_test

# Hold out a test partition, using 'FraudResult' as the target column.
X_train, X_test, y_train, y_test = split_data(df, target_column='FraudResult')

In [5]:
# List the column names of the loaded DataFrame.
print(df.columns)



Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Amount', 'Value',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult'],
      dtype='object')


In [18]:
def choose_models(random_state=None):
    """Instantiate the two candidate classifiers.

    Parameters
    ----------
    random_state : int or None, default None
        Seed passed to both estimators. Supplying an int makes the random
        forest (and any stochastic logistic-regression solver) reproducible
        across runs; ``None`` keeps the estimators' default behaviour.

    Returns
    -------
    tuple
        ``(log_reg, random_forest)``: an unfitted ``LogisticRegression`` and
        an unfitted ``RandomForestClassifier``.
    """
    log_reg = LogisticRegression(random_state=random_state)
    random_forest = RandomForestClassifier(random_state=random_state)

    return log_reg, random_forest

# Create the unfitted logistic-regression and random-forest estimators.
log_reg, random_forest = choose_models()

In [25]:
from sklearn.preprocessing import LabelEncoder

# Columns stored as Python objects (strings) cannot be fed to the models
# directly, so they are replaced by integer codes.
non_numeric_cols = X_train.select_dtypes(include=['object']).columns

# Work on copies so the frames produced by the split are not modified through
# a view (avoids pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# One fitted encoder per column, kept so the same mapping can be reused later.
label_encoders = {}

for col in non_numeric_cols:
    label_encoder = LabelEncoder()
    # Learn the mapping from the training partition only.
    X_train[col] = label_encoder.fit_transform(X_train[col])
    # Apply the training mapping to the test partition; values that never
    # appeared in training receive the code -1 instead of raising.
    X_test[col] = pd.Categorical(
        X_test[col], categories=label_encoder.classes_
    ).codes.astype('int64')
    label_encoders[col] = label_encoder

print("Categorical data converted to numeric values using Label Encoding.")


Categorical data converted to numeric values using Label Encoding.


In [22]:
def train_models(models, X_train, y_train):
    """Fit every estimator in ``models`` on the same training data.

    Parameters
    ----------
    models : dict
        Maps a display name to an object exposing ``fit(X, y)``.
    X_train, y_train
        Training features and targets, passed unchanged to each ``fit`` call.

    Returns
    -------
    dict
        A new dict with the same keys, whose values are the very same
        estimator objects (now fitted in place).
    """
    fitted = dict()

    for label in models:
        estimator = models[label]
        estimator.fit(X_train, y_train)
        fitted[label] = estimator

    return fitted

# Rebuild the logistic regression with a higher iteration cap (max_iter=1000);
# this replaces the default-configured instance bound to `log_reg` earlier.
log_reg = LogisticRegression(max_iter=1000)

# Display name -> estimator, in the order they will be fitted.
models = dict()
models['Logistic Regression'] = log_reg
models['Random Forest'] = random_forest

trained_models = train_models(models, X_train, y_train)

In [23]:
def hyperparameter_tuning(model, X_train, y_train, search_type='grid',
                          param_grid=None, scoring='accuracy', cv=5, n_iter=10):
    """Tune an estimator with cross-validated grid or randomized search.

    Parameters
    ----------
    model : estimator
        The scikit-learn estimator to tune.
    X_train, y_train
        Training features and targets used for the cross-validated search.
    search_type : {'grid', 'random'}, default 'grid'
        ``'grid'`` uses ``GridSearchCV``; ``'random'`` uses
        ``RandomizedSearchCV``.
    param_grid : dict or None, default None
        Search space. When None, a built-in grid is chosen: the
        logistic-regression grid if ``model`` is a ``LogisticRegression``,
        otherwise the random-forest grid.
    scoring : str or callable, default 'accuracy'
        Metric used to rank candidates. For heavily imbalanced targets a
        metric such as ``'f1'`` or ``'roc_auc'`` is usually more informative.
    cv : int or CV splitter, default 5
        Cross-validation strategy forwarded to the search object.
    n_iter : int, default 10
        Number of sampled candidates for ``search_type='random'``. It is
        capped at the size of the grid when every entry is a finite list.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search (refit on all of
        ``X_train``).

    Raises
    ------
    ValueError
        If ``search_type`` is neither ``'grid'`` nor ``'random'``.
    """
    # Fail fast with a clear message instead of hitting an unbound `search`.
    if search_type not in ('grid', 'random'):
        raise ValueError(
            f"search_type must be 'grid' or 'random', got {search_type!r}"
        )

    if param_grid is None:
        if isinstance(model, LogisticRegression):
            param_grid = {
                'C': [0.1, 1, 10],
                'solver': ['liblinear']
            }
        else:
            param_grid = {
                'n_estimators': [50, 100],
                'max_depth': [10, None]
            }

    if search_type == 'grid':
        search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=-1)
    else:
        effective_n_iter = n_iter
        # With a purely list-based grid the number of distinct candidates is
        # finite; asking for more iterations than that is pointless and makes
        # scikit-learn emit a warning, so cap it.
        if isinstance(param_grid, dict) and all(
            hasattr(values, '__len__') for values in param_grid.values()
        ):
            n_candidates = 1
            for values in param_grid.values():
                n_candidates *= len(values)
            effective_n_iter = max(1, min(n_iter, n_candidates))
        search = RandomizedSearchCV(
            model, param_grid, cv=cv, scoring=scoring, n_jobs=-1,
            n_iter=effective_n_iter,
        )

    search.fit(X_train, y_train)

    return search.best_estimator_

# Grid-search the random forest's hyperparameters and keep the best estimator.
best_rf = hyperparameter_tuning(trained_models['Random Forest'], X_train, y_train, search_type='grid')


In [6]:
def evaluate_model(model, X_test, y_test, pos_label=None):
    """Print classification metrics and plot the ROC curve for a binary model.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``predict``, ``predict_proba`` and ``classes_``.
    X_test, y_test
        Held-out features and true labels.
    pos_label : label or None, default None
        Label treated as the positive class. When None, ``model.classes_[1]``
        is used, i.e. the class whose probability sits in column 1 of
        ``predict_proba``. This works for integer 0/1 targets as well as for
        string labels, instead of assuming the label is literally ``'Good'``.

    Returns
    -------
    None
        Metrics are printed and the ROC curve is shown with matplotlib.

    Raises
    ------
    ValueError
        If ``pos_label`` is not one of ``model.classes_``.
    """
    classes = list(model.classes_)
    if pos_label is None:
        pos_label = classes[1]
    if pos_label not in classes:
        raise ValueError(
            f"pos_label={pos_label!r} is not one of the model classes {classes}"
        )

    y_pred = model.predict(X_test)
    # Compute the probabilities once and pick the column that belongs to the
    # positive class, so the ROC metrics agree with precision/recall/F1.
    y_score = model.predict_proba(X_test)[:, classes.index(pos_label)]
    # Binary indicator of the positive class; avoids dtype-specific handling
    # of string labels inside the ROC helpers.
    y_true_binary = (np.asarray(y_test) == pos_label).astype(int)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label=pos_label)
    recall = recall_score(y_test, y_pred, pos_label=pos_label)
    f1 = f1_score(y_test, y_pred, pos_label=pos_label)
    roc_auc = roc_auc_score(y_true_binary, y_score)

    print(f"Model: {model.__class__.__name__}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")

    fpr, tpr, _ = roc_curve(y_true_binary, y_score)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
    # Diagonal reference line: performance of a random classifier.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve for {model.__class__.__name__}')
    plt.legend()
    plt.show()



In [27]:
# Show the dtype of every column in the loaded DataFrame.
print(df.dtypes)



TransactionId            object
BatchId                  object
AccountId                object
SubscriptionId           object
CustomerId               object
CurrencyCode             object
CountryCode               int64
ProviderId               object
ProductId                object
ProductCategory          object
ChannelId                object
Amount                  float64
Value                     int64
TransactionStartTime     object
PricingStrategy           int64
FraudResult               int64
dtype: object
