In [75]:
import os
%pwd  # show the current working directory before switching to the project root
os.chdir("c:\\datascience End to End Projects\\steel-plant-Load-Prediction")  # move to the project root so the relative config/artifact paths used later resolve; NOTE(review): absolute, machine-specific path - adjust when running elsewhere
%pwd

'c:\\datascience End to End Projects\\steel-plant-Load-Prediction'

In [76]:
# Entity class holding the model-trainer settings read from config.yaml, plus the target column name taken from schema.yaml.
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings consumed by the model-trainer stage.

    Instances are built by ``ConfigurationManager.get_model_trainer_config``
    from the ``model_trainer`` section of the config file and the
    ``TARGET_COLUMN`` entry of the schema file.
    """
    # Directory where the trainer writes its outputs (saved model, reports).
    root_dir: Path
    # Location of the training CSV.
    train_data_path: Path
    # Location of the test CSV.
    test_data_path: Path
    # Model name as given in the config file.
    model_name: str
    # Name of the label column, taken from the schema's TARGET_COLUMN entry.
    target_column: str

In [77]:
from PROJECTML.constants import *
from PROJECTML.utils.common import read_yaml, create_directories
from PROJECTML import logger

In [78]:
# this template we use for every stage like data_ingestion,data_validation,data_transformation, model trainer .. etc
class ConfigurationManager:
    """Load the project's YAML files and hand out per-stage config objects.

    The same template is reused for every pipeline stage (data ingestion,
    validation, transformation, model training, ...).
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # The files are read in the order config -> params -> schema.
        self.config, self.params, self.schema = (
            read_yaml(yaml_path)
            for yaml_path in (config_filepath, params_filepath, schema_filepath)
        )

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the ``ModelTrainerConfig`` for the model-trainer stage.

        Reads the ``model_trainer`` section of the config and the
        ``TARGET_COLUMN`` entry of the schema, passes the stage's ``root_dir``
        to ``create_directories``, and returns the populated entity.
        Hyper-parameters from the params file are not used by this stage.

        Returns:
            ModelTrainerConfig: paths, model name and target column name.
        """
        trainer_section = self.config.model_trainer
        target_section = self.schema.TARGET_COLUMN

        create_directories([trainer_section.root_dir])

        return ModelTrainerConfig(
            root_dir=trainer_section.root_dir,
            train_data_path=trainer_section.train_data_path,
            test_data_path=trainer_section.test_data_path,
            model_name=trainer_section.model_name,
            # The schema entry's ``name`` becomes the target column.
            target_column=target_section.name,
        )

In [79]:
#!pip install evidently
import pandas as pd
import os
from PROJECTML import logger
import joblib # here iam saving the model because i want to save the data
from sklearn.model_selection import train_test_split
from src.PROJECTML.config.configuration import ConfigurationManager
from src.PROJECTML.components.data_transformation import DataTransformation
from sklearn.metrics import accuracy_score
import pickle
#from PROJECTML.entity.config_entity import ModelTrainerConfig
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.ensemble import  ExtraTreesClassifier
import numpy as np
from IPython.display import display

In [80]:
# imported necessary libraries for model monitoring with evidently ai
from datetime import datetime, time
from sklearn import datasets, ensemble

#!pip install evidently
import evidently
evidently.__version__
from evidently.test_suite import TestSuite
#from evidently.mertics import *
from evidently import ColumnMapping
from evidently.report import Report  # here we are importing reports which helps us to give reports of model performance w.r.t time  and data
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset, ClassificationPreset # evidently has 3 evaluation metric_preset which are DataDriftPreset, TargetDriftPreset, ClassificationPreset  so each thing has its importance , this DataDriftPreset parameter helps us to make specifically understand the DataDriftPreset, TargetDriftPreset this helps us to understand whether the target variable has changed or not,ClassificationPreset is for we are dealing with classification problem statement what performance meterics we have to monitor our model so we have precision recall f1 score confusion metrics accuracy so by importing this ClassificationPreset we can montior our model  based on whetehr regarding different performance metircs are changing w.r.t time and data 
#from evidently.ui.dashboard import Dashboard
#from evidently.tabs import ClassificationPerformanceTab
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
from evidently.metric_preset import DataQualityPreset
from evidently.metric_preset import TargetDriftPreset
from evidently.metric_preset import ClassificationPreset
# One Evidently report object per monitoring concern. They are module-level
# objects that ModelTrainer.model_monitering() later runs and saves as HTML.
data_drift_report = Report(metrics=[DataDriftPreset()])  # built from the DataDriftPreset

classification_performance_report = Report(metrics=[ClassificationPreset()])  # built from the ClassificationPreset

multiclass_cat_target_drift_report = Report(metrics=[TargetDriftPreset()])  # built from the TargetDriftPreset

data_quality_report = Report(metrics=[DataQualityPreset()])  # built from the DataQualityPreset



In [54]:
import logging

class ModelTrainer:
    """Load the data, fit an ExtraTreesClassifier and write Evidently reports.

    ``train()`` must be called before ``model_monitering()`` because the
    latter reads the frames and splits that ``train()`` stores on the instance.

    NOTE(review): this notebook defines ``ModelTrainer`` again in later cells;
    when the cells run top to bottom the last definition is the one in effect.
    """

    def __init__(self, config: ModelTrainerConfig):
        """Keep the stage configuration and create a module-named logger."""
        self.config = config
        self.logger = logging.getLogger(__name__)

    def train(self):
        """Read the train/test CSVs and split each into features and target.

        Despite its name this method does not fit a model; fitting happens in
        ``model_monitering()``.

        Returns:
            The training DataFrame as read from ``config.train_data_path``.
        """
        label_column = 'Load_Type'

        self.train_data = pd.read_csv(self.config.train_data_path)
        self.test_data = pd.read_csv(self.config.test_data_path)

        self.x_train = self.train_data.drop(columns=[label_column])
        self.logger.info(f'Training data loaded: Columns - {self.x_train.columns}')
        self.y_train = self.train_data[label_column]

        self.x_test = self.test_data.drop(columns=[label_column])
        self.y_test = self.test_data[label_column]

        return self.train_data

    def model_monitering(self):
        """Fit the classifier, attach predictions and produce the HTML reports.

        The training frame is used as Evidently's reference data and the test
        frame as its current data. Both frames gain a ``prediction`` column.
        The four reports are written to the current working directory.
        """
        self.logger.info('Starting model monitoring...')
        self.target = 'Load_Type'
        self.prediction = 'prediction'
        # All features are treated as numerical; there are no categorical ones.
        self.numerical_features = [
            'WeekStatus_Weekday', 'WeekStatus_Weekend', 'Usage_kWh',
            'Lagging_Reactive_Power_kVarh', 'Leading_Reactive_Power_kVarh',
            'CO2', 'Lagging_Power_Factor', 'Leading_Power_Factor', 'NSM', 'hour',
        ]

        self.logger.info('Training the model...')
        classifier = ExtraTreesClassifier()
        classifier.fit(self.x_train, self.y_train)

        self.logger.info('Making predictions on test data...')
        # Reference (train) first, then current (test), as in the report calls.
        for frame, features in ((self.train_data, self.x_train),
                                (self.test_data, self.x_test)):
            frame['prediction'] = classifier.predict(features)

        self.logger.info('Evaluating model performance...')
        mapping = ColumnMapping()
        mapping.target = self.target
        mapping.prediction = self.prediction
        mapping.numerical_features = self.numerical_features

        # Only the classification report receives the explicit column mapping.
        classification_performance_report.run(
            reference_data=self.train_data,
            current_data=self.test_data,
            column_mapping=mapping,
        )
        for unmapped_report in (data_drift_report,
                                data_quality_report,
                                multiclass_cat_target_drift_report):
            unmapped_report.run(
                current_data=self.test_data,
                reference_data=self.train_data,
                column_mapping=None,
            )

        html_outputs = (
            (data_drift_report, "data_drift_file.html"),
            (classification_performance_report, 'Classification report.html'),
            (data_quality_report, "Data_quality_report.html"),
            (multiclass_cat_target_drift_report, "target_drift_report.html"),
        )
        for report, html_name in html_outputs:
            report.save_html(html_name)





In [69]:
class ModelTrainer:
    """Train, monitor and evaluate an ExtraTreesClassifier on the ``Load_Type`` target.

    The methods share state through instance attributes and must be called in
    this order: ``train()`` -> ``model_monitering()`` -> ``evaluate_model()``.

    NOTE(review): this notebook defines ``ModelTrainer`` more than once; when
    the cells run top to bottom the last definition is the one in effect.
    """

    # The annotation is quoted so it is not evaluated when the class is defined.
    def __init__(self, config: "ModelTrainerConfig"):
        """Store the stage configuration (paths and ``root_dir``)."""
        self.config = config

    def train(self):
        """Read the train/test CSVs and split each into features and target.

        Despite its name this method does not fit a model; fitting happens in
        ``model_monitering()``.
        """
        self.train_data = pd.read_csv(self.config.train_data_path)
        self.test_data = pd.read_csv(self.config.test_data_path)

        # NOTE(review): the label name is hard-coded although the config also
        # carries ``target_column`` - confirm they agree before switching.
        self.x_train = self.train_data.drop(columns=['Load_Type'])
        print(f'this is the self.x_train dataset {self.x_train.columns}')
        self.y_train = self.train_data['Load_Type']
        self.x_test = self.test_data.drop(columns=['Load_Type'])
        self.y_test = self.test_data['Load_Type']

    def model_monitering(self):
        """Fit the classifier, attach predictions and write the Evidently reports.

        The training frame is used as reference data and the test frame as
        current data. Both frames gain a ``prediction`` column, and the fitted
        model and predictions are kept on the instance for ``evaluate_model()``.
        The four HTML reports are written to the current working directory.
        """
        self.target = 'Load_Type'
        self.prediction = 'prediction'
        # All features are treated as numerical; there are no categorical ones.
        self.numerical_features = ['WeekStatus_Weekday', 'WeekStatus_Weekend', 'Usage_kWh',
                                   'Lagging_Reactive_Power_kVarh', 'Leading_Reactive_Power_kVarh',
                                   'CO2', 'Lagging_Power_Factor', 'Leading_Power_Factor', 'NSM', 'hour']

        # NOTE(review): no random_state is passed, so results vary between runs.
        self.model = ExtraTreesClassifier()
        self.model.fit(self.x_train, self.y_train)

        self.train_data[self.prediction] = self.model.predict(self.x_train)
        self.test_data[self.prediction] = self.model.predict(self.x_test)

        self.train_pred = self.train_data[self.prediction]
        self.test_pred = self.test_data[self.prediction]

        column_mapping = ColumnMapping()
        column_mapping.target = self.target
        column_mapping.prediction = self.prediction
        column_mapping.numerical_features = self.numerical_features

        # Only the classification report receives the explicit column mapping.
        classification_performance_report.run(reference_data=self.train_data, current_data=self.test_data, column_mapping=column_mapping)
        data_drift_report.run(current_data=self.test_data, reference_data=self.train_data, column_mapping=None)
        data_quality_report.run(current_data=self.test_data, reference_data=self.train_data, column_mapping=None)
        multiclass_cat_target_drift_report.run(current_data=self.test_data, reference_data=self.train_data, column_mapping=None)

        data_drift_report.save_html("data_drift_file.html")
        classification_performance_report.save_html('Classification report.html')
        data_quality_report.save_html("Data_quality_report.html")
        multiclass_cat_target_drift_report.save_html("target_drift_report.html")

    def evaluate_model(self):
        """Score the fitted model, persist it and run one sample prediction.

        Requires ``model_monitering()`` to have run, since it reads
        ``self.model``, ``self.train_pred`` and ``self.test_pred``.

        The model is dumped under ``config.root_dir`` and then reloaded from
        that same path, so the round trip does not depend on a hard-coded,
        platform-specific location.

        Returns:
            pandas.DataFrame: a single-row frame with one column per metric.
        """
        model = self.model

        # Precision, recall and F1 are weighted across classes.
        weighted = {'average': 'weighted'}
        metric_specs = (
            ('Accuracy', accuracy_score, {}),
            ('Precision', precision_score, weighted),
            ('Recall', recall_score, weighted),
            ('F1-score', f1_score, weighted),
        )
        scores = {}
        for label, metric_fn, extra_kwargs in metric_specs:
            scores[f'Training {label}'] = metric_fn(self.y_train, self.train_pred, **extra_kwargs)
            scores[f'Testing {label}'] = metric_fn(self.y_test, self.test_pred, **extra_kwargs)

        for metric, value in scores.items():
            print(f"{metric}: {value}")

        model_path = os.path.join(self.config.root_dir, f"{type(model).__name__}_model.joblib")
        joblib.dump(model, model_path)

        # Reload from the file just written to check the saved artifact works.
        model = joblib.load(model_path)

        # One hand-written feature row used as a smoke-test input.
        single_sample = {
            'Usage_kWh': 8.46,
            'Lagging_Reactive_Power_kVarh': 0,
            'Leading_Reactive_Power_kVarh': 25.92,
            'CO2': 0,
            'Lagging_Power_Factor': 100,
            'Leading_Power_Factor': 31.03,
            'NSM': 45000,
            'WeekStatus_Weekday': 1,
            'WeekStatus_Weekend': 0,
            'hour': 20
        }

        input_data = pd.DataFrame([single_sample])
        # Put the columns in the same order the model was fitted with.
        input_data = input_data[self.x_train.columns]

        prediction = model.predict(input_data)
        print("Predicted class label:", prediction)

        # All values are scalars, so an explicit index is needed for one row.
        return pd.DataFrame(scores, index=[0])






        

            


In [86]:
class ModelTrainer:
    """Train, monitor and evaluate an ExtraTreesClassifier on the ``Load_Type`` target.

    The methods share state through instance attributes and must be called in
    this order: ``train()`` -> ``model_monitering()`` -> ``evaluate_model()``.
    All outputs (HTML reports and the saved model) go under ``config.root_dir``.
    """

    # The annotation is quoted so it is not evaluated when the class is defined.
    def __init__(self, config: "ModelTrainerConfig"):
        """Store the stage configuration (paths and ``root_dir``)."""
        self.config = config

    def train(self):
        """Read the train/test CSVs and split each into features and target.

        Despite its name this method does not fit a model; fitting happens in
        ``model_monitering()``.
        """
        self.train_data = pd.read_csv(self.config.train_data_path)
        self.test_data = pd.read_csv(self.config.test_data_path)

        # NOTE(review): the label name is hard-coded although the config also
        # carries ``target_column`` - confirm they agree before switching.
        self.x_train = self.train_data.drop(columns=['Load_Type'])
        print(f'this is the self.x_train dataset {self.x_train.columns}')
        self.y_train = self.train_data['Load_Type']
        self.x_test = self.test_data.drop(columns=['Load_Type'])
        self.y_test = self.test_data['Load_Type']

    def model_monitering(self):
        """Fit the classifier, attach predictions and write the Evidently reports.

        The training frame is used as reference (historical) data and the test
        frame as current (new) data. Both frames gain a ``prediction`` column,
        and the fitted model and predictions are kept on the instance for
        ``evaluate_model()``.
        """
        self.target = 'Load_Type'
        self.prediction = 'prediction'
        # All features are treated as numerical; there are no categorical ones.
        self.numerical_features = ['WeekStatus_Weekday', 'WeekStatus_Weekend', 'Usage_kWh',
                                   'Lagging_Reactive_Power_kVarh', 'Leading_Reactive_Power_kVarh',
                                   'CO2', 'Lagging_Power_Factor', 'Leading_Power_Factor', 'NSM', 'hour']

        # NOTE(review): no random_state is passed, so results vary between runs.
        self.model = ExtraTreesClassifier()
        self.model.fit(self.x_train, self.y_train)

        self.train_data[self.prediction] = self.model.predict(self.x_train)
        self.test_data[self.prediction] = self.model.predict(self.x_test)

        self.train_pred = self.train_data[self.prediction]
        self.test_pred = self.test_data[self.prediction]

        column_mapping = ColumnMapping()
        column_mapping.target = self.target
        column_mapping.prediction = self.prediction
        column_mapping.numerical_features = self.numerical_features

        # Only the classification report receives the explicit column mapping.
        classification_performance_report.run(reference_data=self.train_data, current_data=self.test_data, column_mapping=column_mapping)
        data_drift_report.run(current_data=self.test_data, reference_data=self.train_data, column_mapping=None)
        data_quality_report.run(current_data=self.test_data, reference_data=self.train_data, column_mapping=None)
        multiclass_cat_target_drift_report.run(current_data=self.test_data, reference_data=self.train_data, column_mapping=None)

        html_outputs = (
            (data_drift_report, "data_drift_file.html"),
            (classification_performance_report, "Classification_report.html"),
            (data_quality_report, "Data_quality_report.html"),
            (multiclass_cat_target_drift_report, "target_drift_report.html"),
        )
        for report, html_name in html_outputs:
            report.save_html(os.path.join(self.config.root_dir, html_name))

    def evaluate_model(self):
        """Score the fitted model, persist it and run one sample prediction.

        Requires ``model_monitering()`` to have run, since it reads
        ``self.model``, ``self.train_pred`` and ``self.test_pred``.

        The model is dumped under ``config.root_dir`` and then reloaded from
        that same path, so the round trip does not depend on a hard-coded,
        platform-specific location.

        Returns:
            pandas.DataFrame: a single-row frame with the model name and one
            column per metric.
        """
        model = self.model

        # Precision, recall and F1 are weighted across classes.
        weighted = {'average': 'weighted'}
        metric_specs = (
            ('Accuracy', accuracy_score, {}),
            ('Precision', precision_score, weighted),
            ('Recall', recall_score, weighted),
            ('F1-score', f1_score, weighted),
        )
        scores = {'Model': type(model).__name__}
        for label, metric_fn, extra_kwargs in metric_specs:
            scores[f'Training {label}'] = metric_fn(self.y_train, self.train_pred, **extra_kwargs)
            scores[f'Testing {label}'] = metric_fn(self.y_test, self.test_pred, **extra_kwargs)

        for metric, value in scores.items():
            print(f"{metric}: {value}")

        model_path = os.path.join(self.config.root_dir, f"{type(model).__name__}_model.joblib")
        joblib.dump(model, model_path)

        # Reload from the file just written to check the saved artifact works.
        model = joblib.load(model_path)

        # One hand-written feature row used as a smoke-test input.
        single_sample = {
            'Usage_kWh': 8.46,
            'Lagging_Reactive_Power_kVarh': 0,
            'Leading_Reactive_Power_kVarh': 25.92,
            'CO2': 0,
            'Lagging_Power_Factor': 100,
            'Leading_Power_Factor': 31.03,
            'NSM': 45000,
            'WeekStatus_Weekday': 1,
            'WeekStatus_Weekend': 0,
            'hour': 20
        }

        input_data = pd.DataFrame([single_sample])
        # Put the columns in the same order the model was fitted with.
        input_data = input_data[self.x_train.columns]

        prediction = model.predict(input_data)
        print("Predicted class label:", prediction)

        # All values are scalars, so an explicit index is needed for one row.
        return pd.DataFrame(scores, index=[0])


In [87]:
# Run the model-trainer stage end to end: build the configuration, load the
# data, fit the model and write the monitoring reports, then score the model.
# NOTE(review): an earlier cell imports ConfigurationManager from
# src.PROJECTML.config.configuration after this notebook defines its own class
# of the same name, so the imported one may be what runs here - confirm.
try:
    config = ConfigurationManager() # create the configuration manager
    model_trainer_config = config.get_model_trainer_config() # settings for the model-trainer stage
    model_trainer = ModelTrainer(config=model_trainer_config) # hand the stage settings to the ModelTrainer class
    model_trainer.train() # loads the train/test CSVs and splits features from the target; no model is fitted here
    model_trainer.model_monitering()  # fits the classifier, adds predictions and saves the Evidently HTML reports
    model_trainer.evaluate_model()  # prints the metrics, saves the model and predicts one sample; the returned DataFrame is not used
except Exception as e:
    # Log the traceback through the project logger, then re-raise so the cell still fails visibly.
    logger.exception("Exception occurred")
    raise e

[2024-04-20 00:08:57,424: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-04-20 00:08:57,440: INFO: common: yaml file: params.yaml loaded successfully]
[2024-04-20 00:08:57,446: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-04-20 00:08:57,451: INFO: common: created directory at: artifacts]
[2024-04-20 00:08:57,455: INFO: common: created directory at: artifacts/model_trainer]


this is the self.x_train dataset Index(['WeekStatus_Weekday', 'WeekStatus_Weekend', 'Usage_kWh',
       'Lagging_Reactive_Power_kVarh', 'Leading_Reactive_Power_kVarh', 'CO2',
       'Lagging_Power_Factor', 'Leading_Power_Factor', 'NSM', 'hour'],
      dtype='object')
Model: ExtraTreesClassifier
Training Accuracy: 0.9996802911809551
Testing Accuracy: 0.9550686144311642
Training Precision: 0.9996802803009861
Testing Precision: 0.9552925558070994
Training Recall: 0.9996802911809551
Testing Recall: 0.9550686144311642
Training F1-score: 0.9996802821192954
Testing F1-score: 0.9550417079897668
Predicted class label: [2]
