# Import Data preparation

In [13]:
import logging
from abc import ABC, abstractmethod
import numpy as np 
import pandas as pd 
from typing import Union, Tuple
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Optional file-based logging configuration (currently disabled).
# logging.basicConfig(filename='preprocessing.log',
#                     filemode='a',
#                     format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
#                     datefmt='%H:%M:%S',
#                     level=logging.DEBUG)

logger = logging.getLogger(__name__)


# Abstract class is a strategy for handling data
class DataStrategy(ABC):
    """Interface for a single data-handling step.

    Concrete strategies subclass this and provide :meth:`handle_data`.
    """

    @abstractmethod
    def handle_data(self, data: pd.DataFrame) -> Union[pd.DataFrame, pd.Series]:
        """Process ``data`` and return the result.

        Args:
            data (pd.DataFrame): Frame the strategy operates on.

        Returns:
            Union[pd.DataFrame, pd.Series]: Output of the concrete strategy.
        """
    
    
class DataPreProcessStrategy(DataStrategy):
    """Strategy that cleans a raw frame and returns a fully encoded one.

    Steps, in order: drop unused columns, cast object columns and the target
    to ``category``, impute/drop missing values, then standard-scale the
    numeric columns and one-hot encode the categorical ones.
    """

    # Columns removed up front; they are not used downstream.
    DROP_COLUMNS = ("Account length", "State", "Area code")
    # Target column; cast to category so it is one-hot encoded with the rest.
    TARGET_COLUMN = "Churn"
    # Dtypes treated as numeric for imputation and scaling.
    NUMERIC_DTYPES = ("int64", "float64")

    def handle_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """Preprocess the dataframe.

        Args:
            data (pd.DataFrame): DataFrame that needs to be preprocessed. It is
                not modified; every step works on a new frame.

        Returns:
            pd.DataFrame: Scaled numeric columns (original names) followed by
            the one-hot encoded categorical columns, with a fresh integer
            index.

        Raises:
            Exception: Any error raised by a step is logged and re-raised.
        """
        logger.info("Begin to preprocessing the dataframe ...")
        data = self._drop_unused_columns(data)
        data = self._cast_categoricals(data)
        data = self._handle_missing_values(data)
        return self._scale_and_encode(data)

    def _drop_unused_columns(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``data`` without the ``DROP_COLUMNS``."""
        try:
            logger.info("1. Start dropping useless columns")
            data = data.drop(columns=list(self.DROP_COLUMNS))
            logger.info("Delete useless columns complete")
        except Exception:
            logger.exception("Encounting an exception when dropping columns")
            raise
        return data

    def _cast_categoricals(self, data: pd.DataFrame) -> pd.DataFrame:
        """Cast every object column and the target column to ``category``."""
        try:
            logger.info("2. Converting data to it correct data types")
            casts = {
                col: "category"
                for col in data.select_dtypes(include="object").columns
            }
            casts[self.TARGET_COLUMN] = "category"
            # astype with a mapping returns a new frame and raises KeyError
            # when the target column is absent.
            data = data.astype(casts)
            logger.info("Converting data type complete")
        except Exception:
            logger.exception("Encounting an exception when convert data type")
            raise
        return data

    def _handle_missing_values(self, data: pd.DataFrame) -> pd.DataFrame:
        """Mean-impute numeric columns, then drop rows that still have nulls."""
        try:
            if not data.isnull().any().any():
                logger.info("3. The data had no missing values")
                return data

            logger.info("3. Handling null values")
            numeric_cols = data.select_dtypes(
                include=list(self.NUMERIC_DTYPES)
            ).columns
            for col in numeric_cols:
                # Assign the result back: in-place fillna on a column
                # selection does not reliably update the parent frame.
                data[col] = data[col].fillna(data[col].mean())
            # Whatever is still null (e.g. categorical gaps) is dropped.
            return data.dropna()
        except Exception:
            logger.exception("Encounting an exception when handling null value")
            # Re-raise like the other steps instead of continuing with a
            # partially cleaned frame.
            raise

    def _scale_and_encode(self, data: pd.DataFrame) -> pd.DataFrame:
        """Standard-scale numeric columns and one-hot encode categorical ones.

        Columns that are neither ``NUMERIC_DTYPES`` nor ``category`` are not
        passed to any transformer and therefore do not appear in the result.
        """
        logger.info('4. Encoding the values')
        num_col = data.select_dtypes(
            include=list(self.NUMERIC_DTYPES)
        ).columns.to_list()
        cat_col = data.select_dtypes(include="category").columns.to_list()

        numeric_transformer = Pipeline(steps=[("Scaler", StandardScaler())])
        categorical_transformer = Pipeline(
            steps=[("OneHotEncoder", OneHotEncoder(handle_unknown="ignore"))]
        )
        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, num_col),
                ("cat", categorical_transformer, cat_col),
            ],
            # Always produce a dense array so it can be wrapped in a DataFrame
            # regardless of how sparse the one-hot block is.
            sparse_threshold=0,
        )
        encoded = preprocessor.fit_transform(data)

        # Numeric columns keep their names; categorical ones get the
        # "<column>_<category>" names generated by the encoder.
        encoder = preprocessor.named_transformers_["cat"].named_steps["OneHotEncoder"]
        columns = num_col + list(encoder.get_feature_names_out(cat_col))
        return pd.DataFrame(encoded, columns=columns)
    

class DataDivideStrategy(DataStrategy):
    """Split an encoded frame into training and validation features/targets.

    The targets are taken to be the trailing ``n_target_columns`` columns of
    the frame; everything before them is treated as features.

    Args:
        test_size (float): Fraction of rows placed in the validation split.
        n_target_columns (int): Number of trailing columns that form the
            target (2 by default, matching a one-hot encoded binary target).
    """

    def __init__(self, test_size: float = 0.2, n_target_columns: int = 2) -> None:
        if n_target_columns < 1:
            # A width of 0 would make the feature slice ``[:, :-0]`` empty.
            raise ValueError("n_target_columns must be at least 1")
        self.test_size = test_size
        self.n_target_columns = n_target_columns

    def handle_data(self, data) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Split ``data`` without shuffling.

        Args:
            data: The already encoded data; anything ``pd.DataFrame`` accepts.

        Returns:
            Tuple of ``(X_train, y_train, X_valid, y_valid)``. Note the order:
            each feature frame is followed by its target frame, which differs
            from the order ``train_test_split`` itself returns.
        """
        data = pd.DataFrame(data)
        X = data.iloc[:, :-self.n_target_columns]
        y = data.iloc[:, -self.n_target_columns:]

        # shuffle=False keeps row order, so the validation split is the tail
        # of the frame; no random_state is needed for a deterministic result.
        X_train, X_valid, y_train, y_valid = train_test_split(
            X, y, test_size=self.test_size, shuffle=False
        )

        return X_train, y_train, X_valid, y_valid
    
class DataCleaning(DataStrategy):
    """Applies a chosen :class:`DataStrategy` to a stored dataframe.

    The class does no processing itself; it delegates to ``strategy``.
    """

    def __init__(self, data: pd.DataFrame, strategy: DataStrategy) -> None:
        """Store the dataframe and the strategy that will handle it.

        Args:
            data (pd.DataFrame): Frame passed to the strategy by default.
            strategy (DataStrategy): Strategy whose ``handle_data`` is called.
        """
        self.df = data
        self.strategy = strategy

    def handle_data(self, data: Union[pd.DataFrame, None] = None):
        """Run the strategy and return whatever it returns.

        Args:
            data: Optional frame to process instead of the stored one. The
                parameter keeps this override call-compatible with
                ``DataStrategy.handle_data(data)``; calling with no argument
                uses ``self.df``.
        """
        frame = self.df if data is None else data
        return self.strategy.handle_data(frame)
    


In [14]:
import pandas as pd 
import numpy as np  
import logging 
from zenml import step 
from typing import Tuple, Annotated

@step()
def clean_data(df: pd.DataFrame) -> Tuple[
    Annotated[pd.DataFrame, 'X_train'],
    Annotated[pd.DataFrame, 'y_train'],
    Annotated[pd.DataFrame, 'X_valid'],
    Annotated[pd.DataFrame, 'y_valid']
]:
    """Preprocess the raw dataframe and split it into train/validation sets.

    Args:
        df (pd.DataFrame): Raw input data.

    Returns:
        The tuple produced by ``DataDivideStrategy``, in the order
        ``(X_train, y_train, X_valid, y_valid)``. The output names in the
        signature follow that same order so each artifact is labeled with
        the data it actually holds.

    Raises:
        Exception: Any error from preprocessing or splitting is logged and
            re-raised.
    """
    try:
        # Stage 1: clean and encode the raw frame.
        cleaned_data = DataCleaning(df, DataPreProcessStrategy()).handle_data()

        # Stage 2: split the encoded frame.
        return DataCleaning(cleaned_data, DataDivideStrategy()).handle_data()
    except Exception as e:
        logging.error(f"Error cleaning data: {e}")
        raise


In [15]:
# Read and transform the data
dataframe = pd.read_csv('../data/raw/telecom_churn.csv')
# clean_data returns (X_train, y_train, X_valid, y_valid); the validation pair
# is bound to the *_test names here.
X_train, y_train, X_test, y_test = clean_data(dataframe)

[1;35mBegin to preprocessing the dataframe ...[0m
[1;35m1. Start dropping useless columns[0m
[1;35mDelete useless columns complete[0m
[1;35m2. Converting data to it correct data types[0m
[1;35mConverting data type complete[0m
[1;35m3. The data had no missing values[0m
[1;35m4. Encoding the values[0m


In [9]:
# Display the training targets.
y_train

Unnamed: 0,18,19
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
...,...,...
2661,1.0,0.0
2662,0.0,1.0
2663,1.0,0.0
2664,0.0,1.0


# Import Model class

In [4]:
from abc import ABC, abstractmethod
import logging
import optuna
from sklearn.base import ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

logger = logging.getLogger(__name__)

class Model(ABC):
    """Abstract base for model wrappers: fit, predict and an Optuna objective."""

    def __init__(self, *args, **kwargs):
        """Accept any arguments and ignore them; subclasses build the estimator."""
        return None

    @abstractmethod
    def fit(self, X, y):
        """Train the model on features ``X`` and targets ``y``."""

    @abstractmethod
    def predict(self, X):
        """Return predictions for features ``X``."""

    @abstractmethod
    def optimize(self, trial: optuna.Trial, X_train, y_train, X_test, y_test):
        """Optimize the hyperparameters of the model.

        Args:
            trial (Optuna Trial object): Trial used to sample hyperparameters.
            X_train (ndarray): Training features.
            y_train (ndarray): Training targets.
            X_test (ndarray): Testing features.
            y_test (ndarray): Testing targets.
        """
        return None


class LogisticRegressionClassification(Model):
    """Wrapper around scikit-learn's ``LogisticRegression``.

    All constructor arguments are forwarded to ``LogisticRegression``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model = LogisticRegression(*args, **kwargs)

    def fit(self, X, y):
        """Fit the wrapped estimator.

        Returns:
            The fitted scikit-learn estimator, or ``None`` when fitting raised
            a ``ValueError`` (the error is logged, not propagated).
        """
        try:
            self.model.fit(X, y)
        except ValueError as e:
            logger.error(f"Error: {e}")
            return None
        else:
            logger.info(f"Model {self.__class__.__name__} trained successfully")
            return self.model

    def predict(self, X):
        """Return the wrapped estimator's predictions for ``X``."""
        return self.model.predict(X)

    def optimize(self, trial, X_train, y_train, X_test, y_test):
        """Optuna objective: score one sampled configuration.

        Samples the regularization strength ``C`` on a log scale, fits a fresh
        estimator that otherwise keeps this wrapper's current parameters, and
        returns its score on the test split. ``self.model`` is left untouched.

        Args:
            trial (Optuna Trial object): Trial used to sample hyperparameters.
            X_train: Training features.
            y_train: Training targets.
            X_test: Testing features.
            y_test: Testing targets.

        Returns:
            float: ``score`` of the candidate on ``(X_test, y_test)``; higher
            is better.
        """
        # Start from the wrapper's configuration so user-supplied settings
        # (solver, max_iter, ...) are kept for every trial.
        params = self.model.get_params(deep=False)
        params["C"] = trial.suggest_float("C", 1e-4, 1e2, log=True)

        candidate = LogisticRegression(**params)
        candidate.fit(X_train, y_train)
        return candidate.score(X_test, y_test)
    

class SVM(Model):
    """Wrapper around scikit-learn's ``SVC``.

    All constructor arguments are forwarded to ``SVC``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model = SVC(*args, **kwargs)

    def fit(self, X, y):
        """Fit the wrapped estimator.

        Returns:
            The fitted scikit-learn estimator, or ``None`` when fitting raised
            a ``ValueError`` (the error is logged, not propagated).
        """
        try:
            self.model.fit(X, y)
        except ValueError as e:
            logger.error(f"Error: {e}")
            return None
        else:
            logger.info(f"Model {self.__class__.__name__} trained successfully")
            return self.model

    def predict(self, X):
        """Return the wrapped estimator's predictions for ``X``."""
        return self.model.predict(X)

    def optimize(self, trial: optuna.Trial, X_train, y_train, X_test, y_test):
        """Optuna objective: score one sampled configuration.

        Implementing this abstract method is what makes the class
        instantiable. Samples ``C``, ``kernel`` and ``gamma``, fits a fresh
        estimator that otherwise keeps this wrapper's current parameters, and
        returns its score on the test split. ``self.model`` is left untouched.

        Args:
            trial (Optuna Trial object): Trial used to sample hyperparameters.
            X_train: Training features.
            y_train: Training targets.
            X_test: Testing features.
            y_test: Testing targets.

        Returns:
            float: ``score`` of the candidate on ``(X_test, y_test)``; higher
            is better.
        """
        params = self.model.get_params(deep=False)
        params.update(
            C=trial.suggest_float("C", 1e-3, 1e2, log=True),
            kernel=trial.suggest_categorical(
                "kernel", ["linear", "rbf", "poly", "sigmoid"]
            ),
            gamma=trial.suggest_categorical("gamma", ["scale", "auto"]),
        )

        candidate = SVC(**params)
        candidate.fit(X_train, y_train)
        return candidate.score(X_test, y_test)

        

class RandomForest(Model):
    """Wrapper around scikit-learn's ``RandomForestClassifier``.

    All constructor arguments are forwarded to ``RandomForestClassifier``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model = RandomForestClassifier(*args, **kwargs)

    def fit(self, X, y):
        """Fit the wrapped estimator.

        Returns:
            The fitted scikit-learn estimator, or ``None`` when fitting raised
            a ``ValueError`` (the error is logged, not propagated).
        """
        try:
            self.model.fit(X, y)
        except ValueError as e:
            logger.error(f"Error: {e}")
            return None
        else:
            logger.info(f"Model {self.__class__.__name__} trained successfully")
            return self.model

    def predict(self, X):
        """Return the wrapped estimator's predictions for ``X``."""
        return self.model.predict(X)

    def optimize(self, trial: optuna.Trial, X_train, y_train, X_test, y_test):
        """Optuna objective: score one sampled configuration.

        Samples ``n_estimators``, ``max_depth`` and ``min_samples_split``,
        fits a fresh forest that otherwise keeps this wrapper's current
        parameters, and returns its score on the test split. ``self.model``
        is left untouched.

        Args:
            trial (Optuna Trial object): Trial used to sample hyperparameters.
            X_train: Training features.
            y_train: Training targets.
            X_test: Testing features.
            y_test: Testing targets.

        Returns:
            float: ``score`` of the candidate on ``(X_test, y_test)``; higher
            is better.
        """
        params = self.model.get_params(deep=False)
        params.update(
            # suggest_int needs a parameter name plus inclusive low/high bounds.
            n_estimators=trial.suggest_int("n_estimators", 10, 200),
            max_depth=trial.suggest_int("max_depth", 2, 20),
            min_samples_split=trial.suggest_int("min_samples_split", 2, 20),
        )

        candidate = RandomForestClassifier(**params)
        candidate.fit(X_train, y_train)
        return candidate.score(X_test, y_test)
        
class HyperparameterTuner:
    """Runs an Optuna study whose objective is the given model's ``optimize``."""

    def __init__(self, model: Model, X_train, y_train, X_test, y_test ):
        """Keep the model and the four data splits handed to every trial."""
        self.model, self.X_train, self.y_train = model, X_train, y_train
        self.X_test, self.y_test = X_test, y_test

    def optimize(self, n_trials = 100):
        """Run ``n_trials`` maximizing trials and return the best trial's params."""

        def objective(trial):
            # Each trial is scored by the model's own optimize() method.
            return self.model.optimize(
                trial, self.X_train, self.y_train, self.X_test, self.y_test
            )

        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=n_trials)
        best = study.best_trial
        return best.params
        

# Test the model 

In [16]:
# Print the shapes of the four splits.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(2666, 18) (2666, 2) (667, 18) (667, 2)


In [17]:
# Display the training features.
X_train

Unnamed: 0,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,International plan_No,International plan_Yes,Voice mail plan_No,Voice mail plan_Yes
0,1.234883,1.566767,0.476643,1.567036,-0.070610,-0.055940,-0.070427,0.866743,-0.465494,0.866029,-0.085008,-0.601195,-0.085690,-0.427932,1.0,0.0,0.0,1.0
1,1.307948,-0.333738,1.124503,-0.334013,-0.108080,0.144867,-0.107549,1.058571,0.147825,1.059390,1.240482,-0.601195,1.241169,-0.427932,1.0,0.0,0.0,1.0
2,-0.591760,1.168304,0.675985,1.168464,-1.573383,0.496279,-1.573900,-0.756869,0.198935,-0.755571,0.703121,0.211534,0.697156,-1.188218,1.0,0.0,1.0,0.0
3,-0.591760,2.196596,-1.466936,2.196759,-2.742865,-0.608159,-2.743268,-0.078551,-0.567714,-0.078806,-1.303026,1.024263,-1.306401,0.332354,0.0,1.0,1.0,0.0
4,-0.591760,-0.240090,0.626149,-0.240041,-1.038932,1.098699,-1.037939,-0.276311,1.067803,-0.276562,-0.049184,-0.601195,-0.045885,1.092641,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2661,-0.591760,0.993861,-0.221052,0.993481,0.353401,0.546480,0.354165,-0.355416,0.454484,-0.355664,-0.264128,-0.601195,-0.258182,-1.188218,1.0,0.0,1.0,0.0
2662,-0.591760,-1.472204,0.376972,-1.472482,2.124387,0.094665,2.124458,-0.784556,-1.947682,-0.786333,1.240482,0.617898,1.241169,0.332354,0.0,1.0,1.0,0.0
2663,-0.591760,-0.183166,1.124503,-0.182793,-0.352626,-0.306949,-0.353488,3.839081,-1.436583,3.836763,0.882241,1.024263,0.882917,0.332354,1.0,0.0,1.0,0.0
2664,1.307948,0.349342,0.725820,0.349717,-0.678030,-1.662395,-0.678312,-0.046909,0.914473,-0.048044,1.491250,-0.601195,1.493272,-0.427932,0.0,1.0,0.0,1.0


In [18]:
# Display the training targets.
y_train

Unnamed: 0,Churn_0.0,Churn_1.0
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
...,...,...
2661,1.0,0.0
2662,0.0,1.0
2663,1.0,0.0
2664,0.0,1.0


In [35]:
# Build the RandomForest wrapper with 50 trees and fit it.
rfc = RandomForest(n_estimators = 50)
# fit() returns the underlying scikit-learn estimator, or None if a ValueError was logged.
rfc_train = rfc.fit(X_train,y_train)

[1;35mModel RandomForest trained successfully[0m


In [36]:
# Display the object returned by rfc.fit.
rfc_train

In [37]:
# rfc_train is the estimator returned by RandomForest.fit, so predict is
# called on it directly rather than through the rfc wrapper.
predict = rfc_train.predict(X_test)
print(predict.shape)

(667, 2)


In [38]:
# Display the predictions.
predict

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [39]:
# Replace y_test with its NumPy array form.
y_test = y_test.to_numpy()

In [40]:
# Display the converted test targets.
y_test

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [41]:
# Display the training features.
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,1.234883,1.566767,0.476643,1.567036,-0.070610,-0.055940,-0.070427,0.866743,-0.465494,0.866029,-0.085008,-0.601195,-0.085690,-0.427932,1.0,0.0,0.0,1.0
1,1.307948,-0.333738,1.124503,-0.334013,-0.108080,0.144867,-0.107549,1.058571,0.147825,1.059390,1.240482,-0.601195,1.241169,-0.427932,1.0,0.0,0.0,1.0
2,-0.591760,1.168304,0.675985,1.168464,-1.573383,0.496279,-1.573900,-0.756869,0.198935,-0.755571,0.703121,0.211534,0.697156,-1.188218,1.0,0.0,1.0,0.0
3,-0.591760,2.196596,-1.466936,2.196759,-2.742865,-0.608159,-2.743268,-0.078551,-0.567714,-0.078806,-1.303026,1.024263,-1.306401,0.332354,0.0,1.0,1.0,0.0
4,-0.591760,-0.240090,0.626149,-0.240041,-1.038932,1.098699,-1.037939,-0.276311,1.067803,-0.276562,-0.049184,-0.601195,-0.045885,1.092641,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2661,-0.591760,0.993861,-0.221052,0.993481,0.353401,0.546480,0.354165,-0.355416,0.454484,-0.355664,-0.264128,-0.601195,-0.258182,-1.188218,1.0,0.0,1.0,0.0
2662,-0.591760,-1.472204,0.376972,-1.472482,2.124387,0.094665,2.124458,-0.784556,-1.947682,-0.786333,1.240482,0.617898,1.241169,0.332354,0.0,1.0,1.0,0.0
2663,-0.591760,-0.183166,1.124503,-0.182793,-0.352626,-0.306949,-0.353488,3.839081,-1.436583,3.836763,0.882241,1.024263,0.882917,0.332354,1.0,0.0,1.0,0.0
2664,1.307948,0.349342,0.725820,0.349717,-0.678030,-1.662395,-0.678312,-0.046909,0.914473,-0.048044,1.491250,-0.601195,1.493272,-0.427932,0.0,1.0,0.0,1.0


In [49]:
# Display the first 20 rows of the training targets.
y_train.head(20)

Unnamed: 0,18,19
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
5,1.0,0.0
6,1.0,0.0
7,1.0,0.0
8,1.0,0.0
9,1.0,0.0


In [52]:
# Display the test targets.
y_test

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])