In [1]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import logging
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_squared_error
from helpers.config import load_config
from helpers.logger import logger


class ModelTrainer:
    """Train a fixed set of regressors and record their results in MLflow.

    The trainer reads four CSV files whose paths are looked up in ``config``
    under the keys ``train_processed``, ``test_processed``,
    ``train_target_raw`` and ``test_target_raw``.
    """

    def __init__(self, config):
        """Store the configuration mapping.

        Args:
            config: Mapping that provides the CSV paths read by
                :meth:`fetch_data`.
        """
        self.config = config

    def fetch_data(self):
        """Load the processed features and the raw targets from CSV.

        Returns:
            tuple: ``(df_train_scaled, df_test_scaled, y_train_df, y_test_df)``
            as pandas DataFrames, in that order.

        Raises:
            Exception: Any error raised while looking up a path or reading a
                file is logged and re-raised unchanged.
        """
        try:
            # feature matrices
            df_train_scaled = pd.read_csv(self.config['train_processed'], delimiter=",")
            df_test_scaled = pd.read_csv(self.config['test_processed'], delimiter=",")
            # target variable for y
            y_train_df = pd.read_csv(self.config['train_target_raw'], delimiter=",")
            y_test_df = pd.read_csv(self.config['test_target_raw'], delimiter=",")
        except Exception as e:
            logger.exception(f'Could not load data: {e}')
            raise
        return df_train_scaled, df_test_scaled, y_train_df, y_test_df

    def load_models(self):
        """Build the regressors to evaluate, all with default hyper-parameters.

        Returns:
            dict: Display name mapped to an unfitted estimator instance.

        Raises:
            Exception: Any error raised while constructing an estimator is
                logged and re-raised unchanged.
        """
        try:
            return {
                "Linear Regression": LinearRegression(),
                "Ridge Regressor": Ridge(),
                "Lasso Regressor": Lasso(),
                "Random Forest Regressor": RandomForestRegressor(),
                "XGB Regressor": XGBRegressor(),
                "SVM Regressor": SVR(),
                "Gradient Boosting Regressor": GradientBoostingRegressor(),
            }
        except Exception:
            logger.exception("error loading models")
            raise

    @staticmethod
    def _as_1d_target(target):
        """Return a single-column target frame as a 1-D Series.

        Anything else (for example a multi-column frame) is returned
        unchanged so multi-output targets keep working.
        """
        if isinstance(target, pd.DataFrame) and target.shape[1] == 1:
            return target.iloc[:, 0]
        return target

    def log_into_mlflow(self):
        """Fit every model and log its metrics and artifact to MLflow.

        One parent run is opened in the ``"utils pipeline"`` experiment and
        each model is logged in its own nested child run, so the metrics of
        different models are not mixed under the same keys. For each model
        the R2 score and mean squared error on the test data are printed and
        logged together with the train/test ``score`` values, and the fitted
        estimator is stored as a model artifact named after the model.
        """
        mlflow.set_experiment("utils pipeline")
        models = self.load_models()

        # load in data (reuses fetch_data so read failures are logged)
        df_train_scaled, df_test_scaled, y_train_df, y_test_df = self.fetch_data()

        # single-column target frames are passed on as 1-D Series
        y_train = self._as_1d_target(y_train_df)
        y_test = self._as_1d_target(y_test_df)

        with mlflow.start_run():
            for model_name, model in models.items():
                # one child run per model keeps metrics and artifacts separate
                with mlflow.start_run(run_name=model_name, nested=True):
                    model.fit(df_train_scaled, y_train)
                    pred = model.predict(df_test_scaled)
                    r2 = r2_score(y_test, pred)
                    mse = mean_squared_error(y_test, pred)
                    print(f'Model Name: {model_name}')
                    print(f'R2 Score: {r2*100:.2f}')
                    print(f'Mean Squared Error: {mse:.4f}')
                    mlflow.log_metric("train score", model.score(df_train_scaled, y_train))
                    mlflow.log_metric("test score", model.score(df_test_scaled, y_test))
                    mlflow.log_metric("r2 score", r2)
                    mlflow.log_metric("mean squared error score", mse)
                    # the fitted estimator comes first, then the artifact name
                    mlflow.sklearn.log_model(model, model_name)
                
                

    
                
                
            
if __name__ == "__main__":
    # log_into_mlflow() loads the data and builds the models itself, so
    # calling fetch_data()/load_models() here and discarding their results
    # would only repeat that work.
    config = load_config()
    trainer = ModelTrainer(config)
    trainer.log_into_mlflow()


[2025-10-10 18:00:03,636: DEBUG: cmd: Popen(['git', 'version'], cwd=/home/nickkats1/sklearn_only/utilization, stdin=None, shell=False, universal_newlines=False)]
[2025-10-10 18:00:03,638: DEBUG: cmd: Popen(['git', 'version'], cwd=/home/nickkats1/sklearn_only/utilization, stdin=None, shell=False, universal_newlines=False)]
[2025-10-10 18:00:03,641: DEBUG: util: sys.platform='linux', git_executable='git']
[2025-10-10 18:00:03,642: DEBUG: cmd: Popen(['git', 'check-ignore', '/home/nickkats1/sklearn_only/utilization/venv/lib/python3.12/site-packages'], cwd=/home/nickkats1/sklearn_only, stdin=None, shell=False, universal_newlines=False)]




Model Name: Linear Regression
R2 Score: 77.77
Mean Squared Error: 0.0565
[2025-10-10 18:00:03,756: DEBUG: connectionpool: https://api.mlflow-telemetry.io:443 "POST /log HTTP/1.1" 200 30]




Model Name: Ridge Regressor
R2 Score: 77.78
Mean Squared Error: 0.0565




Model Name: Lasso Regressor
R2 Score: -6.07
Mean Squared Error: 0.2697


  return fit_method(estimator, *args, **kwargs)


Model Name: Random Forest Regressor
R2 Score: 98.76
Mean Squared Error: 0.0031




Model Name: XGB Regressor
R2 Score: 98.48
Mean Squared Error: 0.0039


  y = column_or_1d(y, warn=True)


Model Name: SVM Regressor
R2 Score: 87.97
Mean Squared Error: 0.0306


  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


Model Name: Gradient Boosting Regressor
R2 Score: 98.70
Mean Squared Error: 0.0033
[2025-10-10 18:00:13,782: DEBUG: connectionpool: Starting new HTTPS connection (1): api.mlflow-telemetry.io:443]




[2025-10-10 18:00:15,527: DEBUG: connectionpool: https://api.mlflow-telemetry.io:443 "POST /log HTTP/1.1" 200 30]
[2025-10-10 18:00:24,586: DEBUG: connectionpool: Starting new HTTPS connection (1): api.mlflow-telemetry.io:443]
[2025-10-10 18:00:26,090: DEBUG: connectionpool: https://api.mlflow-telemetry.io:443 "POST /log HTTP/1.1" 429 31]
[2025-10-10 18:00:27,096: DEBUG: connectionpool: Starting new HTTPS connection (1): api.mlflow-telemetry.io:443]
