In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

# Load the processed transaction data.
# The path is relative to the notebook's working directory and matches the
# location used by the later load cell; the bare file name
# 'processed_data.csv' is the path reported in the recorded FileNotFoundError.
data = pd.read_csv('../data/processed/processed_data.csv')

# Display the first few rows
print(data.head())

# Drop the unique identifier columns.
# Reassigning the result (instead of inplace=True) keeps every step an
# explicit "old frame -> new frame" transformation.
data = data.drop(columns=['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId'])

# One-hot encode the categorical variables
data = pd.get_dummies(
    data,
    columns=['CurrencyCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId'],
)

# Convert every boolean column (e.g. indicator columns that come out as bool)
# to 0/1 integers so all features are numeric
bool_columns = data.select_dtypes(include=['bool']).columns
data[bool_columns] = data[bool_columns].astype(int)

# Drop the 'TransactionStartTime' column since it is non-numeric
data = data.drop(columns=['TransactionStartTime'])

# Check the columns and their types to ensure all are numeric
print("Data Types After Encoding and Dropping Columns:\n", data.dtypes)

# Investigate unique values in 'FraudResult' and their counts
print("Unique values in FraudResult before correction:", data['FraudResult'].unique())
print("Counts of unique values in FraudResult before correction:\n", data['FraudResult'].value_counts())

# Check the minimum and maximum values to understand the range
print("Min value in FraudResult:", data['FraudResult'].min())
print("Max value in FraudResult:", data['FraudResult'].max())

# Ensure the target variable is binary:
# strictly positive values become 1 (fraudulent); everything else
# (zero, negative values, NaN) becomes 0.
# The vectorized comparison yields the same 0/1 labels as a per-row lambda
# without calling Python code once per row.
data['FraudResult'] = (data['FraudResult'] > 0).astype(int)

# Verify unique values in target variable after correction
print("Unique values in FraudResult after correction:", data['FraudResult'].unique())
print("Counts of unique values in FraudResult after correction:\n", data['FraudResult'].value_counts())

# Define features (every column except the target) and the target
X = data.drop('FraudResult', axis=1)
y = data['FraudResult']

# Verify unique values in target variable before split
print("Unique values in target variable before split:", y.unique())

# Split the data into training (70%) and testing (30%) sets.
# stratify=y keeps the class proportions of 'FraudResult' the same in both
# partitions, so a rare positive class cannot end up missing from (or
# over-represented in) the test set by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Verify unique values in y_test
print("Unique values in y_test after split:", y_test.unique())

# Check the balance of the classes in y_train and y_test
print("Counts of unique values in y_train after split:\n", y_train.value_counts())
print("Counts of unique values in y_test after split:\n", y_test.value_counts())

# Standardize the features: the scaler is fitted on the training split only,
# then the same fitted transformation is applied to both splits
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Model selection: base estimators that the hyperparameter searches tune.
# The tree-based models draw random numbers while fitting, so they are seeded
# with the same value (42) used for the train/test split and the randomized
# searches; without a seed their results change from run to run.
log_reg = LogisticRegression(max_iter=1000)
dec_tree = DecisionTreeClassifier(random_state=42)
rand_forest = RandomForestClassifier(random_state=42)
gbm = GradientBoostingClassifier(random_state=42)

# Hyperparameter tuning. Every search uses 5-fold cross-validation and ranks
# candidates by ROC-AUC on the training split.

# Logistic Regression: exhaustive grid over C
param_grid_log_reg = {'C': [0.01, 0.1, 1, 10, 100]}
grid_search_log_reg = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid_log_reg,
    scoring='roc_auc',
    cv=5,
)
grid_search_log_reg.fit(X_train, y_train)

# Decision Tree: exhaustive grid over depth and split/leaf size limits
param_grid_dec_tree = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5, 10],
}
grid_search_dec_tree = GridSearchCV(
    estimator=dec_tree,
    param_grid=param_grid_dec_tree,
    scoring='roc_auc',
    cv=5,
)
grid_search_dec_tree.fit(X_train, y_train)

# Random Forest: 10 randomly sampled candidates from the grid below
param_grid_rand_forest = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
}
random_search_rand_forest = RandomizedSearchCV(
    estimator=rand_forest,
    param_distributions=param_grid_rand_forest,
    n_iter=10,
    scoring='roc_auc',
    cv=5,
    random_state=42,
)
random_search_rand_forest.fit(X_train, y_train)

# Gradient Boosting: 10 randomly sampled candidates from the grid below
param_grid_gbm = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0],
}
random_search_gbm = RandomizedSearchCV(
    estimator=gbm,
    param_distributions=param_grid_gbm,
    n_iter=10,
    scoring='roc_auc',
    cv=5,
    random_state=42,
)
random_search_gbm.fit(X_train, y_train)

# Helper that exposes a fitted search object's CV results as a DataFrame
def extract_grid_search_results(grid_search):
    """Return the ``cv_results_`` of a search object as a pandas DataFrame.

    Parameters
    ----------
    grid_search : object
        Any object exposing a ``cv_results_`` attribute that
        ``pd.DataFrame`` can consume (the notebook passes fitted
        ``GridSearchCV`` instances).

    Returns
    -------
    pandas.DataFrame
        A DataFrame built directly from ``grid_search.cv_results_``.
    """
    return pd.DataFrame(grid_search.cv_results_)

# Turn the CV results of both exhaustive grid searches into DataFrames
results_log_reg, results_dec_tree = (
    extract_grid_search_results(search)
    for search in (grid_search_log_reg, grid_search_dec_tree)
)

# Show the first rows of each results table
for heading, frame in (
    ("Logistic Regression GridSearchCV Results:\n", results_log_reg),
    ("Decision Tree GridSearchCV Results:\n", results_dec_tree),
):
    print(heading, frame.head())

# Model evaluation: score the best estimator from each search on the held-out
# test split and draw all ROC curves on one figure.
models = {
    'Logistic Regression': grid_search_log_reg.best_estimator_,
    'Decision Tree': grid_search_dec_tree.best_estimator_,
    'Random Forest': random_search_rand_forest.best_estimator_,
    'Gradient Boosting': random_search_gbm.best_estimator_
}

for model_name, model in models.items():
    y_pred = model.predict(X_test)
    # Probability of the positive class (column 1) drives ROC-AUC and the curve
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    # Compute the AUC once and reuse it for the report and the plot legend
    roc_auc = roc_auc_score(y_test, y_pred_prob)

    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    # zero_division=0 reports 0 (the same value as the default) without an
    # UndefinedMetricWarning when a metric's denominator is zero, e.g. when a
    # model predicts no positive samples at all.
    print(f"Precision: {precision_score(y_test, y_pred, pos_label=1, zero_division=0)}")
    print(f"Recall: {recall_score(y_test, y_pred, pos_label=1, zero_division=0)}")
    print(f"F1 Score: {f1_score(y_test, y_pred, pos_label=1, zero_division=0)}")
    print(f"ROC-AUC: {roc_auc}")
    print("-" * 30)

    # Plot ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.2f})')

# Finalize the ROC curve plot
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.grid()
plt.show()

In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Read the processed dataset (path is relative to the notebook's working directory)
data = pd.read_csv(filepath_or_buffer='../data/processed/processed_data.csv')
# Preview the first five rows
print(data.head(n=5))

FileNotFoundError: [Errno 2] No such file or directory: 'processed_data.csv'

In [None]:
# Drop the unique identifier columns.
# Reassigning the result (instead of inplace=True) makes the
# "old frame -> new frame" step explicit.
data = data.drop(columns=['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId'])