In [1]:
import os 
%pwd

'd:\\AIprojects\\DataScience_EndToEnd\\research'

In [2]:
os.chdir("D:\AIprojects\DataScience_EndToEnd")
%pwd

'D:\\AIprojects\\DataScience_EndToEnd'

In [28]:
from dataclasses import dataclass
from pathlib import Path

@dataclass

class ModelTrainerConfig:

    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_name: str
    n_estimators: int
    max_depth: int
    min_samples_split: int
    min_samples_leaf: int
    max_features: str
    target_column: str

In [4]:
from src.datascience.constants import *
from src.datascience.utils.common import read_yaml,create_directories

In [29]:
class ConfigurationManager:
    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH,
                 schema_filepath = SCHEMA_FILE_PATH):
        self.config=read_yaml(config_filepath)
        self.params=read_yaml(params_filepath)
        self.schema=read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config=self.config.model_trainer
        params=self.params.random_forest
        schema=self.schema.TARGET_COLUMN

        create_directories([config.root_dir])

        model_trainer_config=ModelTrainerConfig(root_dir=config.root_dir,train_data_path=config.train_data_path,test_data_path=config.test_data_path,model_name=config.model_name,n_estimators=params.n_estimators, max_depth=params.max_depth,min_samples_split=params.min_samples_split,min_samples_leaf=params.min_samples_leaf,max_features=params.max_features,target_column=schema.name)
        
        return model_trainer_config




In [6]:
import pandas as pd
import os
from src.datascience import logger
from sklearn.linear_model import ElasticNet
import joblib

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
import numpy as np
train_data = pd.read_csv("artifacts/data_transformation/train.csv")
test_data = pd.read_csv("artifacts/data_transformation/test.csv")
train_x = train_data.drop(['quality'], axis=1)
test_x = test_data.drop(['quality'], axis=1)
train_y = train_data[['quality']]
test_y = test_data[['quality']]
models = {
    "Linear Regression": LinearRegression(),
    "ElasticNet": ElasticNet(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "XGBoost": xgb.XGBRegressor(),
    "SVR": SVR(),
    "k-NN Regressor": KNeighborsRegressor()
}

results = {}

for name, model in models.items():
    model.fit(train_x, train_y)
    preds = model.predict(test_x)
    r2 = r2_score(test_y, preds)
    rmse = np.sqrt(mean_squared_error(test_y, preds))
    results[name] = {"R²": round(r2, 3), "RMSE": round(rmse, 3)}

# Display results
print("📊 Model Performance:\n")
for model_name, scores in results.items():
    print(f"{model_name}: R² = {scores['R²']}, RMSE = {scores['RMSE']}")
best_model = max(results.items(), key=lambda item: item[1]["R²"])

print(f"\n🏆 Best Model: {best_model[0]}")
print(f"👉 R² = {best_model[1]['R²']}, RMSE = {best_model[1]['RMSE']}")



  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


📊 Model Performance:

Linear Regression: R² = 0.403, RMSE = 0.625
ElasticNet: R² = 0.008, RMSE = 0.805
Random Forest: R² = 0.522, RMSE = 0.559
Gradient Boosting: R² = 0.447, RMSE = 0.601
XGBoost: R² = 0.462, RMSE = 0.593
SVR: R² = 0.185, RMSE = 0.73
k-NN Regressor: R² = 0.186, RMSE = 0.729

🏆 Best Model: Random Forest
👉 R² = 0.522, RMSE = 0.559


  y = column_or_1d(y, warn=True)


In [30]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def train(self):
        train_data = pd.read_csv(self.config.train_data_path)
        test_data = pd.read_csv(self.config.test_data_path)


        train_x = train_data.drop([self.config.target_column], axis=1)
        test_x = test_data.drop([self.config.target_column], axis=1)
        train_y = train_data[[self.config.target_column]]
        test_y = test_data[[self.config.target_column]]


        lr = RandomForestRegressor(n_estimators=self.config.n_estimators, max_depth=self.config.max_depth,min_samples_split=self.config.min_samples_split,min_samples_leaf=self.config.min_samples_leaf,max_features=self.config.max_features, random_state=42)
        lr.fit(train_x, train_y)

        joblib.dump(lr, os.path.join(self.config.root_dir, self.config.model_name))


    

In [31]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer = ModelTrainer(config=model_trainer_config)
    model_trainer.train()
except Exception as e:
    raise e

[2025-07-04 20:55:01,285]: INFO: yaml file: config\config.yaml loaded successfully]
[2025-07-04 20:55:01,288]: INFO: yaml file: params.yaml loaded successfully]
[2025-07-04 20:55:01,294]: INFO: yaml file: schema.yaml loaded successfully]
[2025-07-04 20:55:01,296]: INFO: created directory at: artifacts]
[2025-07-04 20:55:01,298]: INFO: created directory at: artifacts/model_trainer]


  return fit_method(estimator, *args, **kwargs)
