# Data Preparation

In [9]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
import mlflow
import mlflow.sklearn

# Load processed data (paths are relative to the notebook's working directory)
fraud_data = pd.read_csv('../data/processed/processed_fraud_data_with_country.csv')
credit_card_data = pd.read_csv('../data/processed/processed_credit_card_data.csv')

# Check the structure and columns of the datasets
print("Fraud Data Columns:")
print(fraud_data.columns)
print("\nCredit Card Data Columns:")
print(credit_card_data.columns)

# Split settings shared by both datasets
TEST_SIZE = 0.2
RANDOM_STATE = 42


def select_model_features(frame, target_column):
    """Return the feature columns of ``frame`` that the estimators can consume.

    Parameters
    ----------
    frame : pandas.DataFrame
        Processed dataset that still contains the target column.
    target_column : str
        Name of the label column to exclude from the features.

    Returns
    -------
    pandas.DataFrame
        ``frame`` without ``target_column`` and without any column whose dtype
        is neither numeric nor boolean.

    Notes
    -----
    The fraud CSV still carries raw columns such as ``signup_time``,
    ``purchase_time`` and ``device_id``. ``read_csv`` loads text columns as
    ``object`` dtype, and the scikit-learn estimators used in this notebook
    cannot fit on them, so they are filtered out here.
    """
    features = frame.drop(columns=[target_column])
    return features.select_dtypes(include=['number', 'bool'])


# Feature and target separation for fraud detection data
# NOTE(review): numeric identifiers (e.g. `user_id`) pass the dtype filter and are
# still used as features - confirm whether they should be excluded as well.
X_fraud = select_model_features(fraud_data, 'class')  # Features
y_fraud = fraud_data['class']                         # Target

# Feature and target separation for credit card data
X_credit = select_model_features(credit_card_data, 'Class')  # Features
y_credit = credit_card_data['Class']                         # Target

# Train-test split for fraud data.
# `stratify` keeps the class proportions identical in the train and test parts,
# so the evaluation set always contains a representative share of each class.
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(
    X_fraud,
    y_fraud,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y_fraud,
)

# Train-test split for credit card data (same settings, stratified on its own target)
X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(
    X_credit,
    y_credit,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y_credit,
)

Fraud Data Columns:
Index(['user_id', 'signup_time', 'purchase_time', 'purchase_value',
       'device_id', 'age', 'ip_address', 'class', 'ip_address_int',
       'transaction_count',
       ...
       'country_Unknown', 'country_Uruguay', 'country_Uzbekistan',
       'country_Vanuatu', 'country_Venezuela', 'country_Viet Nam',
       'country_Virgin Islands (U.S.)', 'country_Yemen', 'country_Zambia',
       'country_Zimbabwe'],
      dtype='object', length=203)

Credit Card Data Columns:
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')


# Model Selection, Training, and Evaluation

In [None]:
# Seed passed to every estimator so repeated runs give the same models.
# It has the same value as the train/test split seed, so the `random_state`
# parameter logged to MLflow below describes the whole run.
RANDOM_STATE = 42

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'MLP': MLPClassifier(random_state=RANDOM_STATE)
}

# Train and evaluate models for fraud detection data.
# Each estimator gets its own MLflow run holding its parameters, metrics,
# the fitted model and the text classification report.
for name, model in models.items():
    with mlflow.start_run(run_name=f'{name} on Fraud Data'):
        model.fit(X_train_fraud, y_train_fraud)
        y_pred = model.predict(X_test_fraud)
        accuracy = accuracy_score(y_test_fraud, y_pred)

        # Human-readable report (printed and stored as an artifact) plus the same
        # numbers as a dict so the macro averages can be logged as metrics.
        report_text = classification_report(y_test_fraud, y_pred)
        report_dict = classification_report(
            y_test_fraud, y_pred, output_dict=True, zero_division=0
        )
        macro_avg = report_dict['macro avg']

        # Log parameters and metrics
        mlflow.log_params({
            'model': name,
            'dataset': 'Fraud Data',
            'test_size': 0.2,
            'random_state': RANDOM_STATE
        })
        mlflow.log_metric('accuracy', accuracy)
        # Accuracy alone says little when one class dominates, so the
        # class-averaged precision/recall/F1 are tracked as well.
        mlflow.log_metrics({
            'macro_precision': macro_avg['precision'],
            'macro_recall': macro_avg['recall'],
            'macro_f1': macro_avg['f1-score']
        })

        # Save the model
        mlflow.sklearn.log_model(model, f'{name}_model')

        # Print and log classification report
        mlflow.log_text(report_text, 'classification_report.txt')
        print(f"Model: {name}")
        print(f"Accuracy: {accuracy}")
        print(report_text)
        print("="*60)

In [None]:
# Train and evaluate models for credit card data.
# Each estimator in `models` is refit here on the credit card training split,
# and every fit is recorded in its own MLflow run.
for name, model in models.items():
    with mlflow.start_run(run_name=f'{name} on Credit Card Data'):
        model.fit(X_train_credit, y_train_credit)
        y_pred = model.predict(X_test_credit)
        accuracy = accuracy_score(y_test_credit, y_pred)

        # Human-readable report (printed and stored as an artifact) plus the same
        # numbers as a dict so the macro averages can be logged as metrics.
        report_text = classification_report(y_test_credit, y_pred)
        report_dict = classification_report(
            y_test_credit, y_pred, output_dict=True, zero_division=0
        )
        macro_avg = report_dict['macro avg']

        # Log parameters and metrics
        mlflow.log_params({
            'model': name,
            'dataset': 'Credit Card Data',
            'test_size': 0.2,
            'random_state': 42
        })
        mlflow.log_metric('accuracy', accuracy)
        # Accuracy alone says little when one class dominates, so the
        # class-averaged precision/recall/F1 are tracked as well.
        mlflow.log_metrics({
            'macro_precision': macro_avg['precision'],
            'macro_recall': macro_avg['recall'],
            'macro_f1': macro_avg['f1-score']
        })

        # Save the model
        mlflow.sklearn.log_model(model, f'{name}_model')

        # Print and log classification report
        mlflow.log_text(report_text, 'classification_report.txt')
        print(f"Model: {name}")
        print(f"Accuracy: {accuracy}")
        print(report_text)
        print("="*60)