In [5]:
# Read data
import pandas as pd
import os

DATA_PATH = "../data/processed"

df_bic = pd.read_csv(os.path.join(DATA_PATH, "BIC_weekly_clean.csv"))
df_bmi = pd.read_csv(os.path.join(DATA_PATH, "BMI_weekly_clean.csv"))
df_bvh = pd.read_csv(os.path.join(DATA_PATH, "BVH_weekly_clean.csv"))
df_mig = pd.read_csv(os.path.join(DATA_PATH, "MIG_weekly_clean.csv"))
df_pgi = pd.read_csv(os.path.join(DATA_PATH, "PGI_weekly_clean.csv"))

df_mig.head()


Unnamed: 0,time,close
0,2017-05-07,6.27
1,2017-05-14,6.0
2,2017-05-21,5.82
3,2017-05-28,6.09
4,2017-06-04,5.51


In [12]:
import os
import logging
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Configure logging
log_dir = "../logs"
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, f"training_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler(),  # Also print to console
    ],
)

logger = logging.getLogger(__name__)

# MLflow Configuration
MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI", "http://localhost:5000")
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("insurance_weekly_training")
logger.info(f"MLflow tracking URI: {MLFLOW_TRACKING_URI}")
logger.info(f"MLflow experiment: insurance_weekly_training")

# Artifact directory for model/scaler outputs
ARTIFACT_DIR = "../artifacts"
os.makedirs(ARTIFACT_DIR, exist_ok=True)
logger.info(f"Artifact directory: {ARTIFACT_DIR}")

DATA_PATH = "../data/processed"
TICKERS = ["BIC", "BMI", "BVH", "MIG", "PGI"]
RANDOM_STATE = 42

logger.info("=" * 60)
logger.info("Starting training pipeline")
logger.info(f"Tickers to train: {TICKERS}")
logger.info(f"Random state: {RANDOM_STATE}")
logger.info(f"Data path: {DATA_PATH}")
logger.info(f"Log file: {log_file}")
logger.info("=" * 60)

results = []
models = {}
scalers = {}

for ticker in TICKERS:
    try:
        logger.info(f"\n{'='*60}")
        logger.info(f"Processing ticker: {ticker}")
        logger.info(f"{'='*60}")
        
        # Load data
        file_path = os.path.join(DATA_PATH, f"{ticker}_weekly_clean.csv")
        logger.info(f"Loading data from: {file_path}")
        
        if not os.path.exists(file_path):
            logger.error(f"File not found: {file_path}")
            continue
            
        df = pd.read_csv(file_path, parse_dates=["time"])
        logger.info(f"Loaded {len(df)} rows for {ticker}")

        # Feature engineering
        logger.info("Creating features...")
        df = df.sort_values("time").reset_index(drop=True)
        df["close_lag1"] = df["close"].shift(1)
        df["close_lag2"] = df["close"].shift(2)
        df["ma_5"] = df["close"].rolling(window=5).mean()
        df["ma_10"] = df["close"].rolling(window=10).mean()
        df["std_5"] = df["close"].rolling(window=5).std()
        
        initial_count = len(df)
        df = df.dropna().reset_index(drop=True)
        dropped_count = initial_count - len(df)
        logger.info(f"Dropped {dropped_count} rows with NaN, remaining: {len(df)} rows")

        feature_cols = ["close_lag1", "close_lag2", "ma_5", "ma_10", "std_5"]
        X = df[feature_cols].values
        y = df["close"].values
        logger.info(f"Feature matrix shape: {X.shape}, Target shape: {y.shape}")

        # Train/Val/Test split
        logger.info("Splitting data into train/val/test...")
        X_train, X_temp, y_train, y_temp = train_test_split(
            X, y, test_size=0.3, shuffle=False, random_state=RANDOM_STATE,
        )

        X_val, X_test, y_val, y_test = train_test_split(
            X_temp, y_temp, test_size=0.5, shuffle=False, random_state=RANDOM_STATE,
        )
        
        logger.info(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

        # Normalize with StandardScaler (fit on train only)
        logger.info("Fitting StandardScaler on training data...")
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)
        X_test_scaled = scaler.transform(X_test)
        logger.info("Scaling completed")

        # Train model
        logger.info("Training LinearRegression model...")
        model = LinearRegression()
        model.fit(X_train_scaled, y_train)
        logger.info("Model training completed")
        
        # Evaluate on test set
        logger.info("Evaluating on test set...")
        y_pred = model.predict(X_test_scaled)

        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        
        logger.info(f"Metrics for {ticker}:")
        logger.info(f"  MAE:  {mae:.6f}")
        logger.info(f"  RMSE: {rmse:.6f}")
        logger.info(f"  R¬≤:   {r2:.6f}")

        results.append({
            "ticker": ticker,
            "n_samples": len(df),
            "n_train": len(X_train),
            "n_val": len(X_val),
            "n_test": len(X_test),
            "MAE": mae,
            "RMSE": rmse,
            "R2": r2,
        })

        models[ticker] = model
        scalers[ticker] = scaler

        # --- MLflow Tracking ---
        logger.info("Logging to MLflow...")
        try:
            import joblib
            
            run_name = f"linear_{ticker}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
            
            with mlflow.start_run(run_name=run_name):
                # Set tags
                mlflow.set_tag("feature_engineering", "lag+rolling")
                mlflow.set_tag("model_type", "LinearRegression")
                mlflow.set_tag("ticker", ticker)
                
                # Log parameters
                mlflow.log_param("ticker", ticker)
                mlflow.log_param("n_samples", len(df))
                mlflow.log_param("n_train", len(X_train))
                mlflow.log_param("n_val", len(X_val))
                mlflow.log_param("n_test", len(X_test))
                mlflow.log_param("random_state", RANDOM_STATE)
                mlflow.log_param("features", ",".join(feature_cols))
                
                # Log metrics
                mlflow.log_metric("MAE", mae)
                mlflow.log_metric("RMSE", rmse)
                mlflow.log_metric("R2", r2)
                
                # Save and log model artifact
                model_path = os.path.join(ARTIFACT_DIR, f"{ticker}_linear.joblib")
                joblib.dump(model, model_path)
                mlflow.log_artifact(model_path, artifact_path="artifacts")
                mlflow.sklearn.log_model(model, "model")
                logger.info(f"Model saved to: {model_path}")
                
                # Save and log scaler artifact
                scaler_path = os.path.join(ARTIFACT_DIR, f"{ticker}_scaler.joblib")
                joblib.dump(scaler, scaler_path)
                mlflow.log_artifact(scaler_path, artifact_path="artifacts")
                logger.info(f"Scaler saved to: {scaler_path}")
                
            logger.info(f"MLflow logging completed for {ticker} (run: {run_name})")
        except Exception as mlflow_error:
            logger.warning(f"MLflow logging failed for {ticker}: {str(mlflow_error)}")
            logger.warning("Continuing without MLflow logging...")
            
        logger.info(f"‚úì Successfully completed training for {ticker}")
        
    except Exception as e:
        logger.error(f"‚úó Error processing {ticker}: {str(e)}", exc_info=True)
        continue

results_df = pd.DataFrame(results)
print(results_df)

logger.info("\n" + "=" * 60)
logger.info("Training pipeline completed")
logger.info(f"Successfully trained {len(results)} tickers")
logger.info("=" * 60)
logger.info("\nFinal Results Summary:")
for _, row in results_df.iterrows():
    logger.info(f"  {row['ticker']}: MAE={row['MAE']:.4f}, RMSE={row['RMSE']:.4f}, R¬≤={row['R2']:.4f}")
logger.info(f"\nLog file saved to: {log_file}")


2025/11/27 01:32:30 INFO mlflow.tracking.fluent: Experiment with name 'insurance_weekly_training' does not exist. Creating a new experiment.
2025-11-27 01:32:33,399 - __main__ - INFO - MLflow tracking URI: http://localhost:5000
2025-11-27 01:32:33,403 - __main__ - INFO - MLflow experiment: insurance_weekly_training
2025-11-27 01:32:33,407 - __main__ - INFO - Artifact directory: ../artifacts
2025-11-27 01:32:33,415 - __main__ - INFO - Starting training pipeline
2025-11-27 01:32:33,421 - __main__ - INFO - Tickers to train: ['BIC', 'BMI', 'BVH', 'MIG', 'PGI']
2025-11-27 01:32:33,424 - __main__ - INFO - Random state: 42
2025-11-27 01:32:33,428 - __main__ - INFO - Data path: ../data/processed
2025-11-27 01:32:33,429 - __main__ - INFO - Log file: ../logs/training_20251127_013217.log
2025-11-27 01:32:33,434 - __main__ - INFO - 
2025-11-27 01:32:33,436 - __main__ - INFO - Processing ticker: BIC
2025-11-27 01:32:33,446 - __main__ - INFO - Loading data from: ../data/processed/BIC_weekly_clean.cs

üèÉ View run linear_BIC_20251127_013233 at: http://localhost:5000/#/experiments/1/runs/0683e0a252c340d991d6d4504dfa0fb7
üß™ View experiment at: http://localhost:5000/#/experiments/1


2025-11-27 01:33:15,034 - __main__ - INFO - ‚úì Successfully completed training for BIC
2025-11-27 01:33:15,036 - __main__ - INFO - 
2025-11-27 01:33:15,041 - __main__ - INFO - Processing ticker: BMI
2025-11-27 01:33:15,053 - __main__ - INFO - Loading data from: ../data/processed/BMI_weekly_clean.csv
2025-11-27 01:33:15,079 - __main__ - INFO - Loaded 574 rows for BMI
2025-11-27 01:33:15,083 - __main__ - INFO - Creating features...
2025-11-27 01:33:15,097 - __main__ - INFO - Dropped 9 rows with NaN, remaining: 565 rows
2025-11-27 01:33:15,102 - __main__ - INFO - Feature matrix shape: (565, 5), Target shape: (565,)
2025-11-27 01:33:15,104 - __main__ - INFO - Splitting data into train/val/test...
2025-11-27 01:33:15,106 - __main__ - INFO - Train: 395, Val: 85, Test: 85
2025-11-27 01:33:15,109 - __main__ - INFO - Fitting StandardScaler on training data...
2025-11-27 01:33:15,113 - __main__ - INFO - Scaling completed
2025-11-27 01:33:15,116 - __main__ - INFO - Training LinearRegression mode

üèÉ View run linear_BMI_20251127_013315 at: http://localhost:5000/#/experiments/1/runs/d99089b6356240a5a9a5906ac601bcdf
üß™ View experiment at: http://localhost:5000/#/experiments/1


2025-11-27 01:33:56,384 - __main__ - INFO - ‚úì Successfully completed training for BMI
2025-11-27 01:33:56,386 - __main__ - INFO - 
2025-11-27 01:33:56,390 - __main__ - INFO - Processing ticker: BVH
2025-11-27 01:33:56,396 - __main__ - INFO - Loading data from: ../data/processed/BVH_weekly_clean.csv
2025-11-27 01:33:56,415 - __main__ - INFO - Loaded 574 rows for BVH
2025-11-27 01:33:56,419 - __main__ - INFO - Creating features...
2025-11-27 01:33:56,438 - __main__ - INFO - Dropped 9 rows with NaN, remaining: 565 rows
2025-11-27 01:33:56,446 - __main__ - INFO - Feature matrix shape: (565, 5), Target shape: (565,)
2025-11-27 01:33:56,450 - __main__ - INFO - Splitting data into train/val/test...
2025-11-27 01:33:56,460 - __main__ - INFO - Train: 395, Val: 85, Test: 85
2025-11-27 01:33:56,462 - __main__ - INFO - Fitting StandardScaler on training data...
2025-11-27 01:33:56,469 - __main__ - INFO - Scaling completed
2025-11-27 01:33:56,474 - __main__ - INFO - Training LinearRegression mode

üèÉ View run linear_BVH_20251127_013356 at: http://localhost:5000/#/experiments/1/runs/ed7489c444ce4b84ac3a2950f5da9ae1
üß™ View experiment at: http://localhost:5000/#/experiments/1


2025-11-27 01:34:39,116 - __main__ - INFO - ‚úì Successfully completed training for BVH
2025-11-27 01:34:39,118 - __main__ - INFO - 
2025-11-27 01:34:39,120 - __main__ - INFO - Processing ticker: MIG
2025-11-27 01:34:39,124 - __main__ - INFO - Loading data from: ../data/processed/MIG_weekly_clean.csv
2025-11-27 01:34:39,138 - __main__ - INFO - Loaded 447 rows for MIG
2025-11-27 01:34:39,140 - __main__ - INFO - Creating features...
2025-11-27 01:34:39,154 - __main__ - INFO - Dropped 9 rows with NaN, remaining: 438 rows
2025-11-27 01:34:39,157 - __main__ - INFO - Feature matrix shape: (438, 5), Target shape: (438,)
2025-11-27 01:34:39,162 - __main__ - INFO - Splitting data into train/val/test...
2025-11-27 01:34:39,163 - __main__ - INFO - Train: 306, Val: 66, Test: 66
2025-11-27 01:34:39,165 - __main__ - INFO - Fitting StandardScaler on training data...
2025-11-27 01:34:39,170 - __main__ - INFO - Scaling completed
2025-11-27 01:34:39,173 - __main__ - INFO - Training LinearRegression mode

üèÉ View run linear_MIG_20251127_013439 at: http://localhost:5000/#/experiments/1/runs/5a8544b4e2274bffb34e541fca60d963
üß™ View experiment at: http://localhost:5000/#/experiments/1


2025-11-27 01:35:21,360 - __main__ - INFO - ‚úì Successfully completed training for MIG
2025-11-27 01:35:21,361 - __main__ - INFO - 
2025-11-27 01:35:21,362 - __main__ - INFO - Processing ticker: PGI
2025-11-27 01:35:21,367 - __main__ - INFO - Loading data from: ../data/processed/PGI_weekly_clean.csv
2025-11-27 01:35:21,377 - __main__ - INFO - Loaded 574 rows for PGI
2025-11-27 01:35:21,378 - __main__ - INFO - Creating features...
2025-11-27 01:35:21,386 - __main__ - INFO - Dropped 9 rows with NaN, remaining: 565 rows
2025-11-27 01:35:21,390 - __main__ - INFO - Feature matrix shape: (565, 5), Target shape: (565,)
2025-11-27 01:35:21,392 - __main__ - INFO - Splitting data into train/val/test...
2025-11-27 01:35:21,396 - __main__ - INFO - Train: 395, Val: 85, Test: 85
2025-11-27 01:35:21,398 - __main__ - INFO - Fitting StandardScaler on training data...
2025-11-27 01:35:21,402 - __main__ - INFO - Scaling completed
2025-11-27 01:35:21,403 - __main__ - INFO - Training LinearRegression mode

üèÉ View run linear_PGI_20251127_013521 at: http://localhost:5000/#/experiments/1/runs/712a261752c444c8b5f36b3e8ef28f44
üß™ View experiment at: http://localhost:5000/#/experiments/1


2025-11-27 01:36:02,785 - __main__ - INFO - ‚úì Successfully completed training for PGI
2025-11-27 01:36:02,804 - __main__ - INFO - 
2025-11-27 01:36:02,811 - __main__ - INFO - Training pipeline completed
2025-11-27 01:36:02,815 - __main__ - INFO - Successfully trained 5 tickers
2025-11-27 01:36:02,822 - __main__ - INFO - 
Final Results Summary:
2025-11-27 01:36:02,828 - __main__ - INFO -   BIC: MAE=0.5564, RMSE=0.8151, R¬≤=0.9437
2025-11-27 01:36:02,835 - __main__ - INFO -   BMI: MAE=0.3207, RMSE=0.4298, R¬≤=0.7319
2025-11-27 01:36:02,836 - __main__ - INFO -   BVH: MAE=1.3918, RMSE=1.9914, R¬≤=0.8714
2025-11-27 01:36:02,838 - __main__ - INFO -   MIG: MAE=0.3637, RMSE=0.4568, R¬≤=0.7956
2025-11-27 01:36:02,842 - __main__ - INFO -   PGI: MAE=0.4307, RMSE=0.6144, R¬≤=0.7656
2025-11-27 01:36:02,844 - __main__ - INFO - 
Log file saved to: ../logs/training_20251127_013217.log


  ticker  n_samples  n_train  n_val  n_test       MAE      RMSE        R2
0    BIC        565      395     85      85  0.556352  0.815137  0.943703
1    BMI        565      395     85      85  0.320682  0.429775  0.731919
2    BVH        565      395     85      85  1.391783  1.991375  0.871419
3    MIG        438      306     66      66  0.363671  0.456772  0.795621
4    PGI        565      395     85      85  0.430677  0.614443  0.765608
