In [16]:
import sqlite3
import json
import os
from typing import List, Optional, Dict, Any
from dotenv import load_dotenv
import numpy as np
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score

from trained_model import ModelRepository
from dataset_repo import DatasetRepository


def load_model_training_config() -> Dict[str, Any]:
    """
    Load the model-training configuration from environment variables.

    ``load_dotenv()`` is called first so that values from a .env file are
    available through ``os.getenv``. Every variable that is not set falls
    back to the default listed in the body.

    Returns:
        Dictionary with all configuration keys and values, converted to
        int / float / bool / str as appropriate. ``LR_N_JOBS`` is None when
        the variable is set to "none" (any letter case) or is empty.

    Raises:
        ValueError: If a numeric variable holds text that cannot be parsed
            as a number.
    """
    # Kept function-local so the function is usable on its own.
    import os
    from dotenv import load_dotenv

    load_dotenv()

    def env_flag(name: str, default: str) -> bool:
        # Only the literal "true" (case-insensitive) switches a flag on.
        return os.getenv(name, default).lower() == 'true'

    # Accept "None" in any letter case, or an empty value, as "no explicit
    # job count" instead of letting int() raise on it.
    raw_n_jobs = os.getenv('LR_N_JOBS', '-1').strip()
    n_jobs = None if raw_n_jobs.lower() in ('', 'none') else int(raw_n_jobs)

    config = {
        # Database config
        'DATABASE_PATH': os.getenv('DATABASE_PATH', '../../data/database.sqlite'),

        # Training config
        'TRAIN_RUN_ID': int(os.getenv('TRAIN_RUN_ID', '1')),
        'TRAIN_TEST_SIZE': float(os.getenv('TRAIN_TEST_SIZE', '0.2')),
        'TRAIN_RANDOM_STATE': int(os.getenv('TRAIN_RANDOM_STATE', '42')),

        # Logistic Regression hyperparameters
        'LR_MAX_ITER': int(os.getenv('LR_MAX_ITER', '1000')),
        'LR_SOLVER': os.getenv('LR_SOLVER', 'lbfgs'),
        'LR_MULTI_CLASS': os.getenv('LR_MULTI_CLASS', 'auto'),
        'LR_C': float(os.getenv('LR_C', '1.0')),
        'LR_PENALTY': os.getenv('LR_PENALTY', 'l2'),
        'LR_TOL': float(os.getenv('LR_TOL', '0.0001')),
        'LR_FIT_INTERCEPT': env_flag('LR_FIT_INTERCEPT', 'True'),
        'LR_WARM_START': env_flag('LR_WARM_START', 'False'),
        'LR_N_JOBS': n_jobs,

        # Data processing config
        'MIN_SAMPLES_PER_CLASS': int(os.getenv('MIN_SAMPLES_PER_CLASS', '1')),
        'STRATIFY_SPLIT': env_flag('STRATIFY_SPLIT', 'True'),
        'SHUFFLE_SPLIT': env_flag('SHUFFLE_SPLIT', 'True'),

        # Model storage config
        'STORE_COEFFICIENTS_SEPARATELY': env_flag('STORE_COEFFICIENTS_SEPARATELY', 'False'),
        'FLATTEN_WEIGHTS': env_flag('FLATTEN_WEIGHTS', 'True'),
    }

    return config

def train_model_for_run(run_id: Optional[int] = None) -> dict:
    """
    Train a logistic regression model for a specific run_id using configuration.

    All settings other than the run id come from
    load_model_training_config(). The samples of the run are loaded through
    DatasetRepository, split into train/test sets, used to fit a
    LogisticRegression model, and the resulting weights and test accuracy
    are stored through ModelRepository.

    Args:
        run_id: Run whose samples are loaded and under which the model is
                saved. If None, TRAIN_RUN_ID from the configuration is used.

    Returns:
        Dictionary with training results and model info: 'run_id',
        'model_id', 'test_accuracy', 'train_accuracy', 'weights_count',
        'features_shape', 'n_samples', 'n_classes', 'class_distribution',
        'config_used' and 'model_info'.

    Raises:
        ValueError: If the run has fewer than 2 distinct labels, or a class
            has fewer samples than MIN_SAMPLES_PER_CLASS.
    """

    config = load_model_training_config()

    # =========================================================================
    # EXTRACT ALL CONFIGURABLE VALUES
    # =========================================================================

    # Database configuration
    db_path = config['DATABASE_PATH']

    # Training configuration
    # The caller's run_id wins; the configured id is only a fallback.
    if run_id is None:
        run_id = config['TRAIN_RUN_ID']
    test_size = config['TRAIN_TEST_SIZE']
    random_state = config['TRAIN_RANDOM_STATE']

    # Logistic Regression hyperparameters
    lr_max_iter = config['LR_MAX_ITER']
    lr_solver = config['LR_SOLVER']
    lr_multi_class = config['LR_MULTI_CLASS']
    lr_c = config['LR_C']
    lr_penalty = config['LR_PENALTY']
    lr_tol = config['LR_TOL']
    lr_fit_intercept = config['LR_FIT_INTERCEPT']
    lr_warm_start = config['LR_WARM_START']
    lr_n_jobs = config['LR_N_JOBS']

    # Data processing configuration
    min_samples_per_class = config['MIN_SAMPLES_PER_CLASS']
    stratify_split = config['STRATIFY_SPLIT']
    shuffle_split = config['SHUFFLE_SPLIT']

    # Model storage configuration
    # NOTE: STORE_COEFFICIENTS_SEPARATELY is not read here because it does not
    # change what gets stored yet; only FLATTEN_WEIGHTS has an effect.
    flatten_weights = config['FLATTEN_WEIGHTS']

    # =========================================================================
    # LOG CONFIGURATION
    # =========================================================================

    print("=" * 60)
    print("MODEL TRAINING CONFIGURATION")
    print("=" * 60)
    print(f"Database: {db_path}")
    print(f"Run ID: {run_id}")
    print(f"Test Size: {test_size}")
    print(f"Random State: {random_state}")
    print("-" * 60)
    print("Logistic Regression Hyperparameters:")
    print(f"  - max_iter: {lr_max_iter}")
    print(f"  - solver: {lr_solver}")
    print(f"  - multi_class: {lr_multi_class}")
    print(f"  - C: {lr_c}")
    print(f"  - penalty: {lr_penalty}")
    print(f"  - tol: {lr_tol}")
    print(f"  - fit_intercept: {lr_fit_intercept}")
    print(f"  - warm_start: {lr_warm_start}")
    print(f"  - n_jobs: {lr_n_jobs}")
    print("-" * 60)
    print(f"Data Processing: min_samples={min_samples_per_class}, stratify={stratify_split}, shuffle={shuffle_split}")
    print("=" * 60)

    # =========================================================================
    # LOAD SAMPLES
    # =========================================================================

    dataset_repo = DatasetRepository(db_path)

    print(f"\nLoading samples for run_id: {run_id}...")

    samples = dataset_repo.get_run_samples(run_id)

    # Parse features and labels
    X = np.array([sample['features'] for sample in samples])
    y = np.array([sample['label'] for sample in samples])

    # Validate classes (one pass yields both the labels and their counts)
    unique_classes, class_counts = np.unique(y, return_counts=True)
    if len(unique_classes) < 2:
        print(y[:10])
        raise ValueError(f"Need at least 2 classes for classification. Found: {unique_classes}")

    for class_value, class_count in zip(unique_classes, class_counts):
        if class_count < min_samples_per_class:
            raise ValueError(
                f"Class {class_value} has only {class_count} samples, "
                f"minimum required: {min_samples_per_class}"
            )

    print(f"Features shape: {X.shape}")
    print(f"Classes: {unique_classes}")
    print(f"Samples per class: {list(class_counts)}")

    # =====================================================================
    # SPLIT DATA
    # =====================================================================

    # Prepare split arguments
    split_kwargs = {
        'test_size': test_size,
        'random_state': random_state,
        'shuffle': shuffle_split
    }

    # Add stratify if enabled and possible
    if stratify_split:
        # Stratifying needs every class to have at least 2 samples so that
        # both the train and the test part can receive one.
        min_class_samples = class_counts.min()
        if min_class_samples >= 2:
            split_kwargs['stratify'] = y
        else:
            print(f"Warning: Cannot stratify. Class with minimum samples ({min_class_samples}) < 2")

    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

    print(f"Training samples: {X_train.shape[0]}")
    print(f"Testing samples: {X_test.shape[0]}")

    # =====================================================================
    # TRAIN MODEL
    # =====================================================================

    print("\nTraining Logistic Regression model...")

    # LR_MULTI_CLASS is logged and reported but deliberately not passed to
    # the estimator: `multi_class` is deprecated in recent scikit-learn
    # releases.
    model = LogisticRegression(
        max_iter=lr_max_iter,
        random_state=random_state,
        solver=lr_solver,
        C=lr_c,
        penalty=lr_penalty,
        tol=lr_tol,
        fit_intercept=lr_fit_intercept,
        warm_start=lr_warm_start,
        n_jobs=lr_n_jobs
    )

    model.fit(X_train, y_train)

    # =====================================================================
    # EVALUATE MODEL
    # =====================================================================

    train_accuracy = model.score(X_train, y_train)
    test_accuracy = model.score(X_test, y_test)

    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Testing Accuracy: {test_accuracy:.4f}")

    # =====================================================================
    # EXTRACT WEIGHTS
    # =====================================================================

    # Flattened layout: the coefficient rows one after another (row-major),
    # followed by the intercepts. Nothing is stored when flattening is off.
    weights = []
    if flatten_weights:
        weights = np.concatenate(
            [model.coef_.ravel(), model.intercept_.ravel()]
        ).tolist()

    print(f"Number of weights extracted: {len(weights)}")

    # =====================================================================
    # SAVE MODEL
    # =====================================================================

    repo = ModelRepository(db_path)
    model_id = repo.save_model(
        run_id=run_id,
        weights=weights,
        accuracy=float(test_accuracy)
    )

    print(f"Model saved with ID: {model_id}")

    # =====================================================================
    # RETURN RESULTS
    # =====================================================================

    return {
        'run_id': run_id,
        'model_id': model_id,
        'test_accuracy': test_accuracy,
        'train_accuracy': train_accuracy,
        'weights_count': len(weights),
        'features_shape': X.shape,
        'n_samples': len(X),
        'n_classes': len(unique_classes),
        'class_distribution': {
            int(c): int(count) for c, count in zip(unique_classes, class_counts)
        },
        'config_used': {
            'db_path': db_path,
            'test_size': test_size,
            'random_state': random_state,
            'lr_max_iter': lr_max_iter,
            'lr_solver': lr_solver,
            'lr_multi_class': lr_multi_class,
            'lr_c': lr_c,
            'lr_penalty': lr_penalty,
            'stratify_used': 'stratify' in split_kwargs
        },
        'model_info': {
            'coef_shape': model.coef_.shape if hasattr(model, 'coef_') else None,
            'intercept_shape': model.intercept_.shape if hasattr(model, 'intercept_') else None,
            'classes': model.classes_.tolist() if hasattr(model, 'classes_') else None,
            'n_features_in_': getattr(model, 'n_features_in_', None)
        }
    }

In [17]:
# Notebook driver: call the training function with run id 1 and print the
# result dictionary it returns.
run_id = 1
    
print(train_model_for_run(run_id))

MODEL TRAINING CONFIGURATION
Database: ../../data/database.sqlite
Run ID: 1
Test Size: 0.2
Random State: 42
------------------------------------------------------------
Logistic Regression Hyperparameters:
  - max_iter: 1000
  - solver: lbfgs
  - multi_class: auto
  - C: 1.0
  - penalty: l2
  - tol: 0.0001
  - fit_intercept: True
  - warm_start: False
  - n_jobs: -1
------------------------------------------------------------
Data Processing: min_samples=1, stratify=True, shuffle=True

Loading samples for run_id: 1...
Features shape: (10506, 24)
Classes: [1 2 3 4]
Samples per class: [np.int64(3610), np.int64(690), np.int64(4497), np.int64(1709)]
Training samples: 8404
Testing samples: 2102

Training Logistic Regression model...
Training Accuracy: 1.0000
Testing Accuracy: 1.0000
Number of weights extracted: 100
Model saved with ID: 1
{'run_id': 1, 'model_id': 1, 'test_accuracy': 1.0, 'train_accuracy': 1.0, 'weights_count': 100, 'features_shape': (10506, 24), 'n_samples': 10506, 'n_class

