In [1]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import numpy as np
import logging
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_squared_error
from utils.config import load_config

logger = logging.getLogger(__name__)

class ModelTrainer:
    def __init__(self,config):
        self.config = config
        
    def fetch_data(self):
        """Training and testing processed data"""
        try:
            # training and testing scaled path
            self.df_train_scaled = pd.read_csv(self.config['train_processed'],delimiter=",")
            self.df_test_scaled = pd.read_csv(self.config['test_processed'],delimiter=",")
            #target variable for y
            self.y_train_df = pd.read_csv(self.config['train_target_raw'],delimiter=",")
            self.y_test_df = pd.read_csv(self.config['test_target_raw'],delimiter=",")
            return self.df_train_scaled,self.df_test_scaled,self.y_train_df,self.y_test_df
        except Exception as e:
            logger.exception(f'Could not load data: {e}')
            raise
        
    def load_models(self):
        """models from training and test data"""
        try:
            self.models = {
                "Linear Regression":LinearRegression(),
                "Ridge Regressor":Ridge(),
                "Lasso Regressor": Lasso(),
                "Random Forest Regressor": RandomForestRegressor(),
                "XGB Regressor": XGBRegressor(),
                "SVM Regressor":SVR(),
                "Gradient Boosting Regressor":GradientBoostingRegressor()
            }
            return self.models
        except Exception as e:
            logger.exception(f"error loading models")
            raise
        

        
    def log_into_mlflow(self):
        """Log results into mlflow"""
        mlflow.set_experiment("utils pipeline")
        with mlflow.start_run():
            for model_name,model in self.models.items():
                model.fit(self.df_train_scaled,self.y_train_df)
                self.pred = model.predict(self.df_test_scaled)
                self.r2 = r2_score(self.y_test_df,self.pred)
                self.mse = mean_squared_error(self.y_test_df,self.pred)
                print(f'Model Name: {model_name}')
                print(f'R2 Score: {self.r2*100:.2f}')
                print(f'Mean Squared Error: {self.mse:.4f}')
                mlflow.log_metric("train score",model.score(self.df_train_scaled,self.y_train_df))
                mlflow.log_metric("test score",model.score(self.df_test_scaled,self.y_test_df))
                mlflow.log_metric("r2 score",self.r2)
                mlflow.log_metric("mean squared error score",self.mse)
                mlflow.sklearn.log_model("model name",model_name)
                
                

    
                
                
            
if __name__ == "__main__":
    config = load_config()
    model_trainer_config = ModelTrainer(config)
    model_trainer_config.fetch_data()
    model_trainer_config.load_models()
    model_trainer_config.log_into_mlflow()




Model Name: Linear Regression
R2 Score: 77.77
Mean Squared Error: 0.0565




Model Name: Ridge Regressor
R2 Score: 77.78
Mean Squared Error: 0.0565




Model Name: Lasso Regressor
R2 Score: -6.07
Mean Squared Error: 0.2697


  return fit_method(estimator, *args, **kwargs)


Model Name: Random Forest Regressor
R2 Score: 98.97
Mean Squared Error: 0.0026




Model Name: XGB Regressor
R2 Score: 98.48
Mean Squared Error: 0.0039


  y = column_or_1d(y, warn=True)


Model Name: SVM Regressor
R2 Score: 87.97
Mean Squared Error: 0.0306


  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


Model Name: Gradient Boosting Regressor
R2 Score: 98.69
Mean Squared Error: 0.0033




In [2]:
df_train_scaled = pd.read_csv("data/processed/train.csv",delimiter=",")
df_test_scaled = pd.read_csv(config['test_processed'],delimiter=",")
y_train_df = pd.read_csv(config['train_target_raw'],delimiter=",")
y_test_df = pd.read_csv(config['test_target_raw'],delimiter=",")

best_model = GradientBoostingRegressor(learning_rate=0.1,max_depth=4,min_samples_split=10,n_estimators=200)
best_model.fit(df_train_scaled,y_train_df)
pred = best_model.predict(df_test_scaled)
r2 = r2_score(y_test_df,pred)
print(f'R2 score of best model: {r2*100:.2f}%')
mse = mean_squared_error(y_test_df,pred)
print(f'MSE Best Model: {mse:.4f}')


  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


R2 score of best model: 99.16%
MSE Best Model: 0.0021
