In [39]:
# Import Libraries
import mlflow
import mlflow.sklearn

import numpy as np
from itertools import product

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

In [43]:
# Load Data
data_file = "../data/cleaned_processed_data.csv"
print(f"Loading data from {data_file}...")

merged_data_clean = pd.read_csv(data_file)

# Columns used as model inputs, and the column the models predict
features = [
    "CERQ_Sum",
    "ERQ_Sum",
    "MSPSS_Sum",
    "FSoZu_Sum",
    "BISBAS_Total",
    "NEO_Sum",
    "STAI_Sum",
    "STAXI_Sum",
    "CVLT_Sum",
    "TAP_Sum",
    "BloodPressure_Mean",
    "Age_Numeric",
]
target = "Relationship_Status"

X = merged_data_clean[features]
y = merged_data_clean[target]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Data loaded successfully with {len(X_train)} training samples and {len(X_test)} test samples.")

# Standardize Features: the scaler is fitted on the training split only and
# then applied unchanged to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Loading data from ../data/cleaned_processed_data.csv...
Data loaded successfully with 181 training samples and 46 test samples.


In [44]:
# Select the MLflow experiment that the runs started below are recorded under
experiment_name = "LEMON_Prediction_Relationship"
mlflow.set_experiment(experiment_name)

# To browse the logged runs, start the MLflow UI (run this in a terminal if needed)
#!mlflow ui

<Experiment: artifact_location='file:///Users/samuel/Desktop/EKU%20Tu%CC%88bingen/data_literacy/project/repo/LEMON-Love-Predictor/code/mlruns/519230377073851292', creation_time=1734621565013, experiment_id='519230377073851292', last_update_time=1734621565013, lifecycle_stage='active', name='LEMON_Prediction_Relationship', tags={}>

In [45]:
# Define Model Training Function
def train_model(model_type, max_iter, C, n_estimators, kernel):
    """Fit one classifier, print its test-set evaluation and log it to MLflow.

    The function reads the notebook-level ``X_train_scaled``, ``X_test_scaled``,
    ``y_train`` and ``y_test`` produced by the data-loading cell, so that cell
    must have been executed before this function is called.

    Parameters
    ----------
    model_type : str
        "Logistic Regression", "Random Forest" or "SVM".
    max_iter : int
        Passed as ``max_iter`` to Logistic Regression and SVM; unused otherwise.
    C : float
        Passed as ``C`` to Logistic Regression and SVM; unused otherwise.
    n_estimators : int
        Passed as ``n_estimators`` to Random Forest; unused otherwise.
    kernel : str
        Passed as ``kernel`` to SVM; unused otherwise.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the three supported names.

    Notes
    -----
    All five arguments are logged as MLflow params regardless of whether the
    selected estimator consumes them.
    """
    # Initialize Model (reject unknown types up front instead of failing later
    # with an unbound ``model`` variable)
    if model_type == "Logistic Regression":
        model = LogisticRegression(max_iter=max_iter, C=C, solver='lbfgs')
    elif model_type == "Random Forest":
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=10, random_state=42)
    elif model_type == "SVM":
        model = SVC(C=C, kernel=kernel, max_iter=max_iter, probability=True)
    else:
        raise ValueError(
            f"Unsupported model_type: {model_type!r}. "
            "Expected 'Logistic Regression', 'Random Forest' or 'SVM'."
        )

    # Train the Model on the standardized training features from the
    # data-loading cell (no separate loader exists in this notebook)
    model.fit(X_train_scaled, y_train)

    # Evaluate the Model on the held-out split, scaled with the same scaler
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)

    print(f"Model: {model_type}, Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

    # Log Model and Metrics in MLflow
    with mlflow.start_run(run_name=f"{model_type}_{kernel}_run"):
        mlflow.log_param("model_type", model_type)
        mlflow.log_param("max_iter", max_iter)
        mlflow.log_param("C", C)
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("kernel", kernel)
        mlflow.log_metric("accuracy", acc)

        mlflow.sklearn.log_model(model, artifact_path="model")

In [46]:
# Define Hyperparameter Sweep Config
# Each key maps to the list of candidate values swept for that argument of
# train_model.
sweep_config = dict(
    model_type=["Random Forest", "SVM"],
    max_iter=[*range(100, 1001, 100)],
    C=np.linspace(0.1, 10, 10).tolist(),
    n_estimators=[*range(50, 301, 50)],
    kernel=["linear", "rbf"],
)

# Create All Combinations
# The Cartesian product is taken in train_model's argument order, so each tuple
# unpacks as (model_type, max_iter, C, n_estimators, kernel).
sweep_params = list(product(*(
    sweep_config[name]
    for name in ("model_type", "max_iter", "C", "n_estimators", "kernel")
)))

In [None]:
# Execute Model Training for Every Distinct Configuration
# sweep_params is a full Cartesian product, but train_model passes only a
# subset of the hyperparameters to each estimator. Tuples that differ only in
# an unused hyperparameter would refit and log the same configuration again,
# so each distinct configuration is trained once.
param_names = ("model_type", "max_iter", "C", "n_estimators", "kernel")
used_params = {
    "Logistic Regression": ("max_iter", "C"),
    "Random Forest": ("n_estimators",),
    "SVM": ("max_iter", "C", "kernel"),
}
seen_configs = set()

for params in sweep_params:
    model_type, max_iter, C, n_estimators, kernel = params

    # Skip Irrelevant Combinations
    if model_type == "Logistic Regression" and kernel != "linear":
        continue

    # Key the run on the hyperparameters this model type consumes; an unknown
    # model type falls back to all of them, i.e. no de-duplication.
    named = dict(zip(param_names, params))
    config_key = (model_type,) + tuple(
        named[name] for name in used_params.get(model_type, param_names[1:])
    )
    if config_key in seen_configs:
        continue
    seen_configs.add(config_key)

    train_model(model_type, max_iter, C, n_estimators, kernel)