# Data Preparation

In [1]:
import shap
import lime
import pandas as pd 
import numpy as np 
import logging
from abc import ABC, abstractmethod
import numpy as np 
import pandas as pd 
from typing import Union, Tuple
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# File-based logging is currently disabled; uncomment the block below to
# append DEBUG-level records to preprocessing.log.
# logging.basicConfig(filename='preprocessing.log',
#                     filemode='a',
#                     format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
#                     datefmt='%H:%M:%S',
#                     level=logging.DEBUG)

# Module-level logger used by the data-handling strategy classes below.
logger = logging.getLogger(__name__)


# Abstract class is a strategy for handling data
class DataStrategy(ABC):
    """Common interface for every data-handling step.

    Concrete strategies subclass this and implement ``handle_data``. The
    class cannot be instantiated directly because that method is abstract.
    """

    @abstractmethod
    def handle_data(self, data: pd.DataFrame) -> Union[pd.DataFrame, pd.Series]:
        """Process ``data`` and return the result; subclasses must override."""
        ...
    
    
class DataPreProcessStrategy(DataStrategy):
    """Strategy that cleans and encodes the raw DataFrame.

    Steps, in order:

    1. Drop the ``Account length``, ``State`` and ``Area code`` columns.
    2. Cast every ``object`` column and the ``Churn`` column to ``category``.
    3. If anything is missing, fill numeric columns with their mean and drop
       rows that still contain a missing value.
    4. Standard-scale the numeric columns and one-hot encode the categorical
       ones.
    """

    # Columns removed before any other processing.
    _COLUMNS_TO_DROP = ["Account length", "State", "Area code"]
    # Dtypes treated as numeric for imputation and scaling.
    _NUMERIC_DTYPES = ['int64', 'float64']

    def handle_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """Preprocess the dataframe.

        Args:
            data (pd.DataFrame): DataFrame that needs to be preprocessed. It
                must contain the dropped columns and a ``Churn`` column.

        Returns:
            pd.DataFrame: A new, dense frame with a fresh default index. The
            numeric columns come first under their original names, followed
            by the one-hot columns named by the encoder.

        Raises:
            Exception: Any error raised while dropping columns, converting
                dtypes or handling nulls is logged and re-raised unchanged.
        """
        logger.info("Begin to preprocessing the dataframe ...")

        try:
            logger.info("1. Start dropping useless columns")
            # drop() returns a new frame, so the caller's DataFrame is not
            # modified by the column assignments further down.
            data = data.drop(columns=self._COLUMNS_TO_DROP)
            logger.info("Delete useless columns complete")
        except Exception:
            logger.exception("Encounting an exception when dropping columns")
            raise

        try:
            logger.info("2. Converting data to it correct data types")
            for col in data.select_dtypes(include='object').columns.to_list():
                data[col] = data[col].astype('category')
            # The target is cast explicitly because it may not be an object
            # column, and it must end up among the one-hot encoded columns.
            data['Churn'] = data['Churn'].astype('category')
            logger.info("Converting data type complete")
        except Exception:
            logger.exception("Encounting an exception when convert data type")
            raise

        try:
            if data.isnull().any().any():
                logger.info("3. Handling null values")
                numeric_cols = data.select_dtypes(
                    include=self._NUMERIC_DTYPES
                ).columns.to_list()
                for col in numeric_cols:
                    # Assign the result back instead of calling
                    # fillna(inplace=True) on a column selection: the chained
                    # in-place form can act on a temporary object and leave
                    # the NaNs in `data`.
                    data[col] = data[col].fillna(data[col].mean())
                # Rows that still contain a missing value (in columns that
                # were not imputed above) are removed.
                data = data.dropna()
            else:
                logger.info("3. The data had no missing values")
        except Exception:
            logger.exception("Encounting an exception when handling null value")
            # Re-raise like the other steps rather than continuing with data
            # whose missing values may not have been handled.
            raise

        logger.info('4. Encoding the values')
        num_col = data.select_dtypes(include=self._NUMERIC_DTYPES).columns.to_list()
        cat_col = data.select_dtypes(include='category').columns.to_list()

        numeric_transformer = Pipeline(
            steps=[("Scaler", StandardScaler())]
        )
        categorical_transformer = Pipeline(
            steps=[('OneHotEncoder', OneHotEncoder(handle_unknown='ignore'))]
        )
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, num_col),
                ('cat', categorical_transformer, cat_col),
            ],
            # Always return a dense array: a sparse result cannot be wrapped
            # in a DataFrame with the column names built below.
            sparse_threshold=0,
        )
        encoded = preprocessor.fit_transform(data)

        # Numeric columns keep their names; categorical columns take the
        # names generated by the fitted one-hot encoder.
        one_hot = preprocessor.named_transformers_['cat'].named_steps['OneHotEncoder']
        columns = list(num_col) + list(one_hot.get_feature_names_out(cat_col))

        return pd.DataFrame(encoded, columns=columns)
    

class DataDivideStrategy(DataStrategy):
    """Strategy that splits an encoded DataFrame into train and validation parts.

    The last two columns are taken as the target and every column before
    them as the features. The target is presumably the one-hot encoded
    label produced by the preprocessing step; verify against the caller.
    """

    def handle_data(self, data) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Split ``data`` 80/20 without shuffling.

        Args:
            data: The already encoded data; anything ``pd.DataFrame`` accepts.

        Returns:
            ``(X_train, y_train, X_valid, y_valid)``. Note the order: each
            feature part is followed by its matching target part. Both
            target parts are two-column DataFrames, not Series.
        """
        data = pd.DataFrame(data)
        features = data.iloc[:, :-2]  # All columns except the last two
        target = data.iloc[:, -2:]    # The last two columns

        # shuffle=False keeps row order: the first 80% of rows become the
        # training split and the remaining rows the validation split.
        X_train, X_valid, y_train, y_valid = train_test_split(
            features, target, test_size=0.2, random_state=42, shuffle=False
        )

        return X_train, y_train, X_valid, y_valid
    
class DataCleaning(DataStrategy):
    """Applies one data-handling strategy to a stored DataFrame.

    What actually happens (preprocessing, splitting, ...) is decided
    entirely by the strategy object passed to the constructor.
    """

    def __init__(self, data: pd.DataFrame, strategy: DataStrategy) -> None:
        """Store the DataFrame and the strategy that will process it."""
        self.df = data
        self.strategy = strategy

    def handle_data(self, data: Union[pd.DataFrame, None] = None):
        """Run the configured strategy and return its result.

        Args:
            data: Optional DataFrame to process instead of the one given at
                construction time. Accepting it keeps this override
                compatible with ``DataStrategy.handle_data(data)``, so an
                instance can itself be used wherever a strategy is expected.
                When omitted, the stored DataFrame is used.

        Returns:
            Whatever the strategy's ``handle_data`` returns.
        """
        frame = self.df if data is None else data
        return self.strategy.handle_data(frame)
    


In [2]:
import pandas as pd 
import numpy as np  
import logging 
from zenml import step 
from typing import Tuple, Annotated

def clean_data(df: pd.DataFrame) -> Tuple[
    Annotated[pd.DataFrame, 'X_train'],
    Annotated[pd.DataFrame, 'y_train'],
    Annotated[pd.DataFrame, 'X_valid'],
    Annotated[pd.DataFrame, 'y_valid']
]:
    """Preprocess ``df`` and split it into training and validation sets.

    Args:
        df: Raw DataFrame to clean and encode.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)``, exactly as produced by
        ``DataDivideStrategy``. The annotation names are listed in this same
        order so each name labels the value actually returned in its slot.

    Raises:
        Exception: Any error from preprocessing or splitting is logged and
            re-raised unchanged.
    """
    try:
        # Step 1: clean and encode the raw frame.
        cleaned_data = DataCleaning(df, DataPreProcessStrategy()).handle_data()

        # Step 2: split the encoded frame into its four parts.
        splits = DataCleaning(cleaned_data, DataDivideStrategy()).handle_data()
        return splits
    except Exception as e:
        logging.error(f"Error cleaning data: {e}")
        raise


In [3]:
# Read the raw CSV and run it through the preprocessing + split pipeline.
# clean_data returns (X_train, y_train, X_valid, y_valid); the validation
# parts are bound here to the names X_test / y_test.
dataframe = pd.read_csv('../data/telecom_churn.csv')
X_train, y_train, X_test, y_test = clean_data(dataframe)

Begin to preprocessing the dataframe ...
1. Start dropping useless columns
Delete useless columns complete
2. Converting data to it correct data types
Converting data type complete
3. The data had no missing values
4. Encoding the values


In [4]:
import logging
from abc import ABC, abstractmethod
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import numpy as np
from typing import Dict 
import pandas as pd 
import typing


# Route root-logger output to logs/Evaluate.log.
# NOTE(review): assumes the logs/ directory already exists - confirm.
logging.basicConfig(filename='logs/Evaluate.log')
# Module-level logger used by the evaluation classes below.
logger = logging.getLogger(__name__)


class Evaluation(ABC):
    """Interface for strategies that score predictions against ground truth."""

    @abstractmethod
    def calculate_score(self, y_true: np.ndarray, y_pred: np.ndarray):
        """Compute a score from ``y_true`` and ``y_pred``; subclasses must override."""
        ...
    

class ConfusionMatrix(Evaluation):
    """Evaluation strategy that computes a confusion matrix."""

    @staticmethod
    def _as_labels(values) -> np.ndarray:
        """Return ``values`` as a 1-D array of class labels.

        A 2-D input with more than one column (one-hot targets or per-class
        scores) is decoded to the column index of each row's maximum. A 1-D
        input or a single-column 2-D input is treated as labels already and
        only flattened.
        """
        arr = np.asarray(values)
        if arr.ndim == 2 and arr.shape[1] > 1:
            return arr.argmax(axis=1)
        return arr.ravel()

    def calculate_score(self, y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str , np.ndarray]:
        """Compute the confusion matrix of ``y_pred`` against ``y_true``.

        Args:
            y_true: Ground truth as an ndarray, Series or DataFrame, given
                either as labels or as one-hot rows.
            y_pred: Predictions in any of the same forms.

        Returns:
            A dict with the single key ``'Confusion matrix'`` holding the
            matrix returned by ``sklearn.metrics.confusion_matrix``.

        Raises:
            Exception: Any error during the computation is logged and
                re-raised unchanged.
        """
        try:
            logger.info('Calculating Confusion matrix')
            # Decode each input on its own. Flattening first and then
            # calling argmax(axis=1) fails, because a flattened array has no
            # axis 1.
            cm = confusion_matrix(
                y_true=self._as_labels(y_true),
                y_pred=self._as_labels(y_pred),
            )
            logger.info('Calculate confusion matrix complete')
            return {'Confusion matrix': cm}
        except Exception as e:
            logger.error("Error in calculating Confusion matrix: {}".format(e))
            raise
        

class ClassificationMetrics(Evaluation):
    """Evaluation strategy that reports accuracy, F1, recall and precision.

    F1, recall and precision are micro-averaged.
    """

    @staticmethod
    def _as_labels(values) -> np.ndarray:
        """Return ``values`` as a 1-D array of class labels.

        A 2-D input with more than one column (one-hot targets or per-class
        scores) is decoded to the column index of each row's maximum. A 1-D
        input or a single-column 2-D input is treated as labels already and
        only flattened.
        """
        arr = np.asarray(values)
        if arr.ndim == 2 and arr.shape[1] > 1:
            return arr.argmax(axis=1)
        return arr.ravel()

    def calculate_score(self, y_true: typing.Union[np.ndarray, pd.Series, pd.DataFrame],
                        y_pred: typing.Union[np.ndarray, pd.Series, pd.DataFrame]) -> Dict[str, float]:
        """Compute the classification metrics of ``y_pred`` against ``y_true``.

        Both inputs are reduced to 1-D label arrays in the same way, so a
        one-hot DataFrame target can be scored against an ndarray prediction
        without a sample-count mismatch.

        Args:
            y_true: Ground truth, given as labels or as one-hot rows.
            y_pred: Predictions, given as labels or as one-hot / score rows.

        Returns:
            A dict with the keys ``'accuracy'``, ``'f1'``, ``'recall'`` and
            ``'precision'``.

        Raises:
            Exception: Any error during the computation is logged and
                re-raised unchanged.
        """
        try:
            true_labels = self._as_labels(y_true)
            pred_labels = self._as_labels(y_pred)

            logger.info('Calculating other metrics')
            scores = {
                'accuracy': accuracy_score(true_labels, pred_labels),
                'f1': f1_score(true_labels, pred_labels, average='micro'),
                'recall': recall_score(true_labels, pred_labels, average='micro'),
                'precision': precision_score(true_labels, pred_labels, average='micro'),
            }
            logger.info('Calculate metrics complete')
            return scores
        except Exception as e:
            logger.error("Error in calculating other metrics: {}".format(e))
            raise

In [5]:
from zenml import step
import pandas as pd
import logging 
from typing import Union, Tuple, Annotated
from sklearn.base import BaseEstimator, ClassifierMixin

# NOTE(review): logging.basicConfig does nothing once the root logger already
# has handlers, so if an earlier basicConfig call ran in the same session this
# line has no effect. The file name here also differs in case from the
# 'logs/Evaluate.log' used earlier in this file - confirm which is intended.
logging.basicConfig(filename='logs/evaluate.log')
# Module-level logger for the evaluation step below.
logger = logging.getLogger(__name__)


def evaluate_model(
    model: ClassifierMixin,
    feature: Union[pd.DataFrame | pd.Series | np.ndarray],
    target: Union[pd.DataFrame | pd.Series] | np.ndarray
    ) -> Tuple[
        Annotated[dict, "Confusion matrix"] ,
        Annotated[dict, "Other metrics"]
        ]: 
    """Predict on ``feature`` and score the predictions against ``target``.

    Args:
        model: Fitted estimator exposing ``predict``.
        feature: Inputs passed to ``model.predict``.
        target: Ground truth the predictions are compared with.

    Returns:
        A pair ``(confusion, scores)``: the dict returned by
        ``ConfusionMatrix.calculate_score`` followed by the dict returned by
        ``ClassificationMetrics.calculate_score``.
    """
    y_hat = model.predict(feature)

    # Both evaluators receive the same target/prediction pair.
    confusion = ConfusionMatrix().calculate_score(y_true=target, y_pred=y_hat)
    scores = ClassificationMetrics().calculate_score(y_true=target, y_pred=y_hat)

    return confusion, scores

# Train and Evaluate model

# XAI with these models