# Data Preparation

In [10]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
import mlflow
import mlflow.sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Read the two pre-processed datasets from disk (paths are relative to the
# notebook's working directory).
fraud_data = pd.read_csv('../data/processed/processed_fraud_data_with_country.csv')
credit_card_data = pd.read_csv('../data/processed/processed_credit_card_data.csv')

# read_csv yields the timestamp columns as plain text; convert them to
# datetime so the `.dt` accessor can be used below.
fraud_data = fraud_data.assign(**{
    column: pd.to_datetime(fraud_data[column])
    for column in ('signup_time', 'purchase_time')
})

# Feature engineering for fraud_data: break each timestamp into numeric
# hour / day / month / year columns named `<event>_<part>`, appended in the
# order signup_* first, then purchase_*.
fraud_data = fraud_data.assign(**{
    f'{event}_{part}': getattr(fraud_data[f'{event}_time'].dt, part)
    for event in ('signup', 'purchase')
    for part in ('hour', 'day', 'month', 'year')
})

# The raw timestamps are not numeric features, so drop them once their
# components have been extracted.
fraud_data = fraud_data.drop(['signup_time', 'purchase_time'], axis=1)

# Every remaining object-dtype column is treated as categorical.
# NOTE(review): this one-hot encodes *all* object columns; if any of them is
# high-cardinality (e.g. an identifier) the frame will grow very wide --
# confirm against the processed CSV.
categorical_columns = list(fraud_data.select_dtypes(include=['object']).columns)

# One-hot encode, dropping the first level of each category to avoid a
# redundant (perfectly collinear) indicator column.
fraud_data = pd.get_dummies(fraud_data, columns=categorical_columns, drop_first=True)

# Show the final column layout of both datasets as a quick sanity check
# before modelling.
print("Fraud Data Columns:")
print(fraud_data.columns)
print("\nCredit Card Data Columns:")
print(credit_card_data.columns)

# Separate features and target for the fraud detection data
# (target column is lowercase 'class').
X_fraud = fraud_data.drop(columns=['class'])  # Features
y_fraud = fraud_data['class']                # Target

# Separate features and target for the credit card data
# (target column is capitalised 'Class').
X_credit = credit_card_data.drop(columns=['Class'])  # Features
y_credit = credit_card_data['Class']                # Target

# 80/20 train-test split for fraud data.
# The split is stratified on the target so the train and test sets keep the
# same class proportions as the full dataset; without this, the share of
# positive cases in the held-out set is left to chance.
# NOTE(review): presumably the positive (fraud) class is rare -- confirm.
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(
    X_fraud, y_fraud, test_size=0.2, random_state=42, stratify=y_fraud
)

# 80/20 train-test split for credit card data, stratified for the same reason.
X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(
    X_credit, y_credit, test_size=0.2, random_state=42, stratify=y_credit
)

AttributeError: partially initialized module 'mlflow' has no attribute 'version' (most likely due to a circular import)

# Model Selection, Training, and Evaluation

In [None]:
# Initialize the candidate models, keyed by the display name used for the
# MLflow run names and printed reports.
# Every estimator is seeded with the same value (42) that the runs log as
# their `random_state` parameter, so that repeated executions of the
# notebook train identical models and produce comparable metrics.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'MLP': MLPClassifier(max_iter=1000, random_state=42),
}

# Train and evaluate models for fraud detection data.
# Each model is fitted inside its own MLflow run; the run records the
# experiment parameters, the evaluation metrics, the full classification
# report and the fitted model itself.
for name, model in models.items():
    with mlflow.start_run(run_name=f'{name} on Fraud Data'):
        model.fit(X_train_fraud, y_train_fraud)
        y_pred = model.predict(X_test_fraud)

        # Evaluate on the held-out set. Accuracy alone says little when one
        # class dominates, so the macro-averaged precision / recall / F1
        # (each class weighted equally) are tracked as well.
        accuracy = accuracy_score(y_test_fraud, y_pred)
        report = classification_report(y_test_fraud, y_pred)
        macro_avg = classification_report(
            y_test_fraud, y_pred, output_dict=True
        )['macro avg']

        # Log parameters and metrics
        mlflow.log_params({
            'model': name,
            'dataset': 'Fraud Data',
            'test_size': 0.2,
            'random_state': 42
        })
        mlflow.log_metric('accuracy', accuracy)
        mlflow.log_metrics({
            'macro_precision': macro_avg['precision'],
            'macro_recall': macro_avg['recall'],
            'macro_f1': macro_avg['f1-score'],
        })

        # Store the human-readable report with the run so it can be
        # inspected later from the MLflow UI, not only in notebook output.
        mlflow.log_text(report, 'classification_report.txt')

        # Save the model
        mlflow.sklearn.log_model(model, f'{name}_model')

        # Print the results for interactive inspection
        print(f"Model: {name}")
        print(f"Accuracy: {accuracy}")
        print(report)
        print("="*60)

In [None]:
# Train and evaluate models for credit card data.
# The estimators in `models` are re-fitted here on the credit card training
# set. Each model is fitted inside its own MLflow run; the run records the
# experiment parameters, the evaluation metrics, the full classification
# report and the fitted model itself.
for name, model in models.items():
    with mlflow.start_run(run_name=f'{name} on Credit Card Data'):
        model.fit(X_train_credit, y_train_credit)
        y_pred = model.predict(X_test_credit)

        # Evaluate on the held-out set. Accuracy alone says little when one
        # class dominates, so the macro-averaged precision / recall / F1
        # (each class weighted equally) are tracked as well.
        accuracy = accuracy_score(y_test_credit, y_pred)
        report = classification_report(y_test_credit, y_pred)
        macro_avg = classification_report(
            y_test_credit, y_pred, output_dict=True
        )['macro avg']

        # Log parameters and metrics
        mlflow.log_params({
            'model': name,
            'dataset': 'Credit Card Data',
            'test_size': 0.2,
            'random_state': 42
        })
        mlflow.log_metric('accuracy', accuracy)
        mlflow.log_metrics({
            'macro_precision': macro_avg['precision'],
            'macro_recall': macro_avg['recall'],
            'macro_f1': macro_avg['f1-score'],
        })

        # Store the human-readable report with the run so it can be
        # inspected later from the MLflow UI, not only in notebook output.
        mlflow.log_text(report, 'classification_report.txt')

        # Save the model
        mlflow.sklearn.log_model(model, f'{name}_model')

        # Print the results for interactive inspection
        print(f"Model: {name}")
        print(f"Accuracy: {accuracy}")
        print(report)
        print("="*60)