In [1]:
#Import necessairy libraries
import os
import sys
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from abc import ABC, abstractmethod
from typing import Union, Tuple
from sklearn.model_selection import train_test_split
import pandas as pd
from typing_extensions import Annotated
sys.path.append(os.path.abspath(os.path.join('..'))) 
from log import logger
from config import model_classes , param_grid
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
import joblib
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



## 1.Data_Ingestion

In [2]:
class IngestData:
    """
    Data ingestion class which ingests data from the source and returns a DataFrame.
    """

    def __init__(self, csv_path) -> None:
        """Initialize the data ingestion class with the CSV file path."""
        self.csv_path = csv_path

    def get_data(self) -> pd.DataFrame:
        try:
            df = pd.read_csv(self.csv_path)
            logger.info("Data ingestion successful.")
            return df
        except Exception as e:
            logger.error(f"Error during data ingestion: {e}")
            raise e

def ingest_data(csv_path: str) -> pd.DataFrame:
    """
    Args:
        csv_path (str): Path to the CSV file.
    Returns:
        df: pd.DataFrame
    """
    try:
        ingest_data_obj = IngestData(csv_path)
        df = ingest_data_obj.get_data()
        return df
    except Exception as e:
        logger.error(f"Error during data ingestion process: {e}")
        raise e


In [3]:
# Define the path to the CSV file
csv_path = "../data/raw_data.csv"

# Call the ingest_data function with the CSV file path
data_frame = ingest_data(csv_path)

[2023-09-02 10:35:11,932: INFO: 345487086: Data ingestion successful.]


## 2.Data_Cleaning

In [4]:
class DataStrategy(ABC):
    """
    Abstract Class defining strategy for handling data
    """

    @abstractmethod
    def handle_data(self, data: pd.DataFrame) -> Union[pd.DataFrame, pd.Series]:
        pass

In [5]:
class DataPreprocessStrategy(DataStrategy):
    """
    Data preprocessing strategy which preprocesses the data.
    """
    def handle_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Removes columns which are not required, fills missing values with median average values, and converts the data type to float.
        """
        try:
            # List of columns to drop
            columns_to_drop = ['Employee_Name', 'EmpID', 'ManagerName', 'ManagerID', 'DateofHire', 'DateofTermination','LastPerformanceReview_Date','DOB']
            cleaned_df  = data.drop(columns=columns_to_drop, axis=1)
            logger.info("Colum dropping successful.")
        except Exception as e:
            logger.error(f"Error during column dropping: {e}")
            raise e
        try:
            # Identify categorical columns
            categorical_columns = cleaned_df.select_dtypes(include=['object']).columns

            # Create a LabelEncoder instance
            label_encoder = LabelEncoder()

            # Apply label encoding to each categorical column
            for column in categorical_columns:
                cleaned_df[column] = label_encoder.fit_transform(cleaned_df[column])
            logger.info("Label encoding successful.")
        except Exception as e:
            logger.error(f"Error during label encoding: {e}")
            raise e
        return cleaned_df

In [6]:
class DataDivideStrategy(DataStrategy):
    """
    Data dividing strategy which divides the data into train and test data.
    """

    def handle_data(self, data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
        """
        Divides the data into train and test data.
        """
        try:
            X = data.drop("Termd", axis=1)
            y = data["Termd"]
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42
            )
            return X_train, X_test, y_train, y_test
        except Exception as e:
            logger.error(e)
            raise e


In [7]:
class DataCleaning:
    """
    Data cleaning class which preprocesses the data and divides it into train and test data.
    """

    def __init__(self, data: pd.DataFrame, strategy: DataStrategy) -> None:
        """Initializes the DataCleaning class with a specific strategy."""
        self.df = data
        self.strategy = strategy

    def handle_data(self) -> Union[pd.DataFrame, pd.Series]:
        """Handle data based on the provided strategy"""
        return self.strategy.handle_data(self.df)

In [8]:
def clean_data(
    data: pd.DataFrame,
) -> Tuple[
    Annotated[pd.DataFrame, "x_train"],
    Annotated[pd.DataFrame, "x_test"],
    Annotated[pd.Series, "y_train"],
    Annotated[pd.Series, "y_test"],
]:
    """Data cleaning class which preprocesses the data and divides it into train and test data.

    Args:
        data: pd.DataFrame
    """
    try:
        preprocess_strategy = DataPreprocessStrategy()
        data_cleaning = DataCleaning(data, preprocess_strategy)
        preprocessed_data = data_cleaning.handle_data()
        logger.info("Data Preproessing successful.")

        divide_strategy = DataDivideStrategy()
        data_cleaning = DataCleaning(preprocessed_data, divide_strategy)
        x_train, x_test, y_train, y_test = data_cleaning.handle_data()
        logger.info("Data dividing successful.")
        return x_train, x_test, y_train, y_test
    except Exception as e:
        logger.error(e)
        raise e


In [9]:
x_train, x_test, y_train, y_test  = clean_data(data_frame)

[2023-09-02 10:35:13,611: INFO: 580182248: Colum dropping successful.]
[2023-09-02 10:35:13,630: INFO: 580182248: Label encoding successful.]
[2023-09-02 10:35:13,636: INFO: 4176965535: Data Preproessing successful.]
[2023-09-02 10:35:13,648: INFO: 4176965535: Data dividing successful.]


## 3.Model_Training_&_Tuning

In [10]:
class Model(ABC):
    """
    Abstract base class for all models.
    """

    @abstractmethod
    def train(self, x_train, y_train):
        pass

    @abstractmethod
    def optimize(self, x_train, y_train, x_test, y_test):
        pass

class ModelTuner:
    """
    Class for performing hyperparameter tuning. It uses Model strategy to perform tuning.
    """

    def __init__(self, model_classes, param_grid, x_train, y_train, x_test, y_test):
        self.model_classes = dict(model_classes)  # Make a copy of model_classes as a dictionary
        self.param_grid = param_grid
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test

    def train(self, model_name, **kwargs):
        try:
            model_class = self.model_classes[model_name]
            clf = model_class(**kwargs)
            clf.fit(self.x_train, self.y_train)
            logger.info(f"Model {model_name} training successful.")
            return clf
        except Exception as e:
            logger.error(f"Model {model_name} training failed: {str(e)}")
            raise e

    def optimize(self, model_name):
        try:
            model_class = self.model_classes[model_name]
            clf = model_class()
            grid_search = RandomizedSearchCV(clf, param_distributions=self.param_grid[model_name], n_iter=10, cv=5, n_jobs=-1)
            grid_search.fit(self.x_train, self.y_train)

            best_params = grid_search.best_params_
            best_model = grid_search.best_estimator_

            y_pred = best_model.predict(self.x_test)
            logger.info(f"Hyperparameter tuning for {model_name} successful.")
            return best_model, best_params
        except Exception as e:
            logger.error(f"Hyperparameter tuning for {model_name} failed: {str(e)}")
            raise e


In [11]:
def execute_model_tuning(x_train, y_train, x_test, y_test, model_classes, param_grid):
    best_f1_score = 0
    best_model = None
    best_params = None
    best_model_name = None

    for model_name, model_class in model_classes.items():
        model = model_class()  # Create an instance of the model
        model_tuner = ModelTuner({model_name: model_class}, param_grid, x_train, y_train, x_test, y_test)

        logger.info(f"Training Model {model_name}")
        model_tuner.train(model_name)  # Train the model

        logger.info(f"Optimizing hyperparameters for {model_name}")
        current_best_model, current_best_params = model_tuner.optimize(model_name)

        y_pred = current_best_model.predict(x_test)
        current_f1 = f1_score(y_test, y_pred)  # Calculate F1 score

        if current_f1 > best_f1_score:
            best_f1_score = current_f1
            best_model_name = model_name
            best_params = current_best_params
            best_model = current_best_model

    print(f"Best Model - {best_model_name} - Best F1 Score: {best_f1_score}")
    print(f"Best Parameters: {best_params}")

    saved_model_filename = f"../saved_model/{best_model_name}_best_model.pkl"
    joblib.dump(best_model, saved_model_filename)
    logger.info(f"Best model saved as '{saved_model_filename}'")

    return best_model


In [12]:
best_model = execute_model_tuning(x_train, y_train, x_test, y_test, model_classes, param_grid)


[2023-09-02 10:35:14,446: INFO: 1321146974: Training Model RandomForestClassifier]
[2023-09-02 10:35:14,906: INFO: 3134255834: Model RandomForestClassifier training successful.]
[2023-09-02 10:35:14,909: INFO: 1321146974: Optimizing hyperparameters for RandomForestClassifier]
[2023-09-02 10:35:33,940: INFO: 3134255834: Hyperparameter tuning for RandomForestClassifier successful.]
[2023-09-02 10:35:33,973: INFO: 1321146974: Training Model DecisionTreeClassifier]
[2023-09-02 10:35:33,979: INFO: 3134255834: Model DecisionTreeClassifier training successful.]
[2023-09-02 10:35:33,980: INFO: 1321146974: Optimizing hyperparameters for DecisionTreeClassifier]
[2023-09-02 10:35:34,218: INFO: 3134255834: Hyperparameter tuning for DecisionTreeClassifier successful.]
[2023-09-02 10:35:34,226: INFO: 1321146974: Training Model GradientBoostingClassifier]
[2023-09-02 10:35:34,319: INFO: 3134255834: Model GradientBoostingClassifier training successful.]
[2023-09-02 10:35:34,320: INFO: 1321146974: Opti

Best Model - RandomForestClassifier - Best F1 Score: 1.0
Best Parameters: {'n_estimators': 100, 'min_samples_split': 5, 'max_depth': None}


## 4.Model_Evaluation

In [14]:
class BinaryClassificationEvaluation(ABC):
    """
    Abstract Class defining the strategy for evaluating binary classification model performance
    """
    @abstractmethod
    def calculate_score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        pass


class Accuracy(BinaryClassificationEvaluation):
    """
    Evaluation strategy that uses Accuracy for binary classification
    """
    def calculate_score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """
        Args:
            y_true: np.ndarray
            y_pred: np.ndarray
        Returns:
            accuracy: float
        """
        try:
            logging.info("Entered the calculate_score method of the Accuracy class")
            accuracy = accuracy_score(y_true, y_pred)
            logging.info("The accuracy score value is: " + str(accuracy))
            return accuracy
        except Exception as e:
            logging.error(
                "Exception occurred in calculate_score method of the Accuracy class. Exception message: " + str(e)
            )
            raise e


class Precision(BinaryClassificationEvaluation):
    """
    Evaluation strategy that uses Precision for binary classification
    """
    def calculate_score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """
        Args:
            y_true: np.ndarray
            y_pred: np.ndarray
        Returns:
            precision: float
        """
        try:
            logging.info("Entered the calculate_score method of the Precision class")
            precision = precision_score(y_true, y_pred)
            logging.info("The precision score value is: " + str(precision))
            return precision
        except Exception as e:
            logging.error(
                "Exception occurred in calculate_score method of the Precision class. Exception message: " + str(e)
            )
            raise e


class Recall(BinaryClassificationEvaluation):
    """
    Evaluation strategy that uses Recall for binary classification
    """
    def calculate_score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """
        Args:
            y_true: np.ndarray
            y_pred: np.ndarray
        Returns:
            recall: float
        """
        try:
            logging.info("Entered the calculate_score method of the Recall class")
            recall = recall_score(y_true, y_pred)
            logging.info("The recall score value is: " + str(recall))
            return recall
        except Exception as e:
            logging.error(
                "Exception occurred in calculate_score method of the Recall class. Exception message: " + str(e)
            )
            raise e


class F1Score(BinaryClassificationEvaluation):
    """
    Evaluation strategy that uses F1 Score for binary classification
    """
    def calculate_score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """
        Args:
            y_true: np.ndarray
            y_pred: np.ndarray
        Returns:
            f1_score: float
        """
        try:
            logging.info("Entered the calculate_score method of the F1Score class")
            f1 = f1_score(y_true, y_pred)
            logging.info("The F1 score value is: " + str(f1))
            return f1
        except Exception as e:
            logging.error(
                "Exception occurred in calculate_score method of the F1Score class. Exception message: " + str(e)
            )
            raise e


In [15]:
def binary_classification_evaluation(
    model, x_test: pd.DataFrame, y_test: pd.Series
) -> dict:
    """
    Evaluate a binary classification model using specified evaluation strategies.

    Args:
        model: Binary classification model (e.g., RandomForestClassifier).
        x_test: Test features (pd.DataFrame).
        y_test: True labels (pd.Series).
        
    Returns:
        evaluation_results: A dictionary containing metric names as keys and their scores as values.
    """
    evaluation_results = {}

    # Create instances of evaluation strategies
    accuracy_strategy = Accuracy()
    precision_strategy = Precision()
    recall_strategy = Recall()
    f1_score_strategy = F1Score()

    # Predict using the model
    y_pred = model.predict(x_test)

    # Calculate and store the evaluation metrics
    evaluation_results["Accuracy"] = accuracy_strategy.calculate_score(y_test.values, y_pred)
    evaluation_results["Precision"] = precision_strategy.calculate_score(y_test.values, y_pred)
    evaluation_results["Recall"] = recall_strategy.calculate_score(y_test.values, y_pred)
    evaluation_results["F1 Score"] = f1_score_strategy.calculate_score(y_test.values, y_pred)

    return evaluation_results




In [16]:
# Example usage:
evaluation_results = binary_classification_evaluation(best_model, x_test, y_test)
print(evaluation_results)

[2023-09-02 10:35:38,874: INFO: 3562115558: Entered the calculate_score method of the Accuracy class]
[2023-09-02 10:35:38,876: INFO: 3562115558: The accuracy score value is: 1.0]
[2023-09-02 10:35:38,878: INFO: 3562115558: Entered the calculate_score method of the Precision class]
[2023-09-02 10:35:38,885: INFO: 3562115558: The precision score value is: 1.0]
[2023-09-02 10:35:38,888: INFO: 3562115558: Entered the calculate_score method of the Recall class]
[2023-09-02 10:35:38,893: INFO: 3562115558: The recall score value is: 1.0]
[2023-09-02 10:35:38,895: INFO: 3562115558: Entered the calculate_score method of the F1Score class]
[2023-09-02 10:35:38,900: INFO: 3562115558: The F1 score value is: 1.0]


{'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'F1 Score': 1.0}


In [17]:

class Model(ABC):
    """
    Abstract base class for all models.
    """

    @abstractmethod
    def train(self, x_train, y_train):
        pass

    @abstractmethod
    def optimize(self, x_train, y_train, x_test, y_test):
        pass

class ModelTuner:
    """
    Class for performing hyperparameter tuning. It uses Model strategy to perform tuning.
    """

    def __init__(self, model_class, param_grid, x_train, y_train, x_test, y_test):
        self.model_class = model_class
        self.param_grid = param_grid
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test

    def train(self, **kwargs):
        try:
            clf = self.model_class(**kwargs)
            clf.fit(self.x_train, self.y_train)
            logger.info("Model training successful.")  # Log training success
            return clf
        except Exception as e:
            logger.error(f"Model training failed: {str(e)}")
            raise e

    def optimize(self):
        try:
            clf = self.model_class()
            grid_search = RandomizedSearchCV(clf, param_distributions=self.param_grid, n_iter=10, cv=5, n_jobs=-1)
            grid_search.fit(self.x_train, self.y_train)

            best_params = grid_search.best_params_
            best_model = grid_search.best_estimator_

            y_pred = best_model.predict(self.x_test)
            logger.info("Hyperparameter tuning successful.")  # Log tuning success
            return  best_model, best_params
        except Exception as e:
            logger.error(f"Hyperparameter tuning failed: {str(e)}")
            raise e
def execute_model_tuning(x_train, y_train, x_test, y_test):
    # Define model classes and their respective names and hyperparameter grids
    model_classes = [
        ("RandomForestClassifier", RandomForestClassifier, {
            'n_estimators': [10, 50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
        }),
        ("DecisionTreeClassifier", DecisionTreeClassifier, {
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
        }),
        ("GradientBoostingClassifier", GradientBoostingClassifier, {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 4, 5],
        }),
        ("KNeighborsClassifier", KNeighborsClassifier, {
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance'],
        }),
    ]

    best_f1_score = 0
    best_model = None
    best_params = None
    best_model_name = None  # Initialize best model name variable

    for model_name, model_class, param_grid in model_classes:
        tuner = ModelTuner(model_class, param_grid, x_train, y_train, x_test, y_test)
        logger.info(f"Training Model {model_name}")
        current_best_model = tuner.train()  # Train the model and log the training success
        best_f1, _, current_best_params = tuner.optimize()  # Optimize and log the tuning success

        if best_f1 > best_f1_score:
            best_f1_score = best_f1
            best_model = current_best_model
            best_params = current_best_params
            best_model_name = model_name  # Update the best model name

    # Print the best model and its parameters
    print(f"Best Model - {best_model_name} - Best F1 Score: {best_f1_score}")
    print(f"Best Parameters: {best_params}")

    # Save the best model as a .pkl file with the specified path
    saved_model_filename = f"../saved_model/{best_model_name}_best_model.pkl"
    joblib.dump(best_model, saved_model_filename)
    logger.info(f"Best model saved as '{saved_model_filename}'")  # Log model save success

# Call the function to execute model tuning
execute_model_tuning(x_train, y_train, x_test, y_test)

[2023-09-02 10:35:39,398: INFO: 2467076325: Training Model RandomForestClassifier]
[2023-09-02 10:35:39,645: INFO: 2467076325: Model training successful.]
[2023-09-02 10:35:47,739: INFO: 2467076325: Hyperparameter tuning successful.]


ValueError: not enough values to unpack (expected 3, got 2)

In [None]:
from abc import ABC, abstractmethod
import numpy as np
import logging
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [None]:
class Evaluation(ABC):
    """
    Abstract Class defining the strategy for evaluating model performance in binary classification
    """
    @abstractmethod
    def calculate_score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        pass

class Accuracy(Evaluation):
    """
    Evaluation strategy that uses Accuracy
    """
    def calculate_score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """
        Args:
            y_true: np.ndarray
            y_pred: np.ndarray
        Returns:
            accuracy: float
        """
        try:
            logging.info("Entered the calculate_score method of the Accuracy class")
            accuracy = accuracy_score(y_true, y_pred)
            logging.info("The accuracy value is: " + str(accuracy))
            return accuracy
        except Exception as e:
            logging.error(
                "Exception occurred in calculate_score method of the Accuracy class. Exception message:  "
                + str(e)
            )
            raise e

class Precision(Evaluation):
    """
    Evaluation strategy that uses Precision
    """
    def calculate_score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """
        Args:
            y_true: np.ndarray
            y_pred: np.ndarray
        Returns:
            precision: float
        """
        try:
            logging.info("Entered the calculate_score method of the Precision class")
            precision = precision_score(y_true, y_pred)
            logging.info("The precision value is: " + str(precision))
            return precision
        except Exception as e:
            logging.error(
                "Exception occurred in calculate_score method of the Precision class. Exception message:  "
                + str(e)
            )
            raise e

class Recall(Evaluation):
    """
    Evaluation strategy that uses Recall
    """
    def calculate_score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """
        Args:
            y_true: np.ndarray
            y_pred: np.ndarray
        Returns:
            recall: float
        """
        try:
            logging.info("Entered the calculate_score method of the Recall class")
            recall = recall_score(y_true, y_pred)
            logging.info("The recall value is: " + str(recall))
            return recall
        except Exception as e:
            logging.error(
                "Exception occurred in calculate_score method of the Recall class. Exception message:  "
                + str(e)
            )
            raise e

class F1Score(Evaluation):
    """
    Evaluation strategy that uses F1 Score
    """
    def calculate_score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """
        Args:
            y_true: np.ndarray
            y_pred: np.ndarray
        Returns:
            f1_score: float
        """
        try:
            logging.info("Entered the calculate_score method of the F1Score class")
            f1_score = f1_score(y_true, y_pred)
            logging.info("The F1 score value is: " + str(f1_score))
            return f1_score
        except Exception as e:
            logging.error(
                "Exception occurred in calculate_score method of the F1Score class. Exception message:  "
                + str(e)
            )
            raise e

class RocAuc(Evaluation):
    """
    Evaluation strategy that uses ROC AUC Score
    """
    def calculate_score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """
        Args:
            y_true: np.ndarray
            y_pred: np.ndarray
        Returns:
            roc_auc: float
        """
        try:
            logging.info("Entered the calculate_score method of the RocAuc class")
            roc_auc = roc_auc_score(y_true, y_pred)
            logging.info("The ROC AUC score value is: " + str(roc_auc))
            return roc_auc
        except Exception as e:
            logging.error(
                "Exception occurred in calculate_score method of the RocAuc class. Exception message:  "
                + str(e)
            )
            raise e
