# IMPORT

In [2]:
import pandas as pd
# Read the comma-separated file "data.csv" into a DataFrame, then show it as the cell output.
df = pd.read_csv("data.csv", sep=",")
df

Unnamed: 0,id,gender,customer_senior,customer_partner,dependent_family,month_tenure,phone_subscription,multiple_line,internet_type,online_security,...,device_protection_plan,techinal_support,streaming_tv,streaming_movie,contract_type,paperless_billing,payment_method_type,amount_charges_monthly,amount_total_charges,churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,Yes


# PIPELINE

In [23]:
import inspect
from typing import List, Union, Optional

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.utils.validation import check_array


#-------- Defining the custom transformers --------#

class DropNaN(BaseEstimator, TransformerMixin):
    """
    Transformer that drops rows containing NaN values in specified columns or all columns.

    Accepts a pandas DataFrame or a 2-D NumPy array. When a target ``y`` is handed to
    ``transform``, the same rows are removed from it so features and target stay aligned.
    """

    def __init__(self, columns_list: Optional[List[str]] = None, reset_index: bool = False) -> None:
        """
        Parameters
        ----------
        columns_list : list of str, optional (default=None)
            List of column names to check for NaN values. If None (or empty), all columns are checked.
        reset_index : bool, optional (default=False)
            If True, resets the index of the returned DataFrame after dropping rows.
        """
        self.columns_list = columns_list
        self.reset_index = reset_index

    def fit(self, X: Union[pd.DataFrame, np.ndarray], y=None) -> 'DropNaN':
        """
        Record the column names of `X` so `transform` can verify it receives the same layout.

        Parameters
        ----------
        X : {array-like, DataFrame}, shape (n_samples, n_features)
            Reference data. For a NumPy array the recorded names are the positional
            integers 0..n_features-1.
        y : ignored

        Returns
        -------
        self : DropNaN
        """
        # Names stored by an earlier fit must not be applied here, otherwise re-fitting
        # on an array with a different number of columns fails.
        X = self._validate_input(X, use_fitted_names=False)

        # Store column names
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X: Union[pd.DataFrame, np.ndarray], y=None) -> Union[pd.DataFrame, np.ndarray]:
        """
        Removes rows containing missing values in the specified columns or in all columns if none are specified.

        Parameters
        ----------
        X : {array-like, DataFrame}, shape (n_samples, n_features)
            The data to clean.

        y : array-like, shape (n_samples,), optional (default=None)
            Target values. They are never inspected for NaN; they are only filtered with the
            same row mask as `X`. `y` is matched to `X` by position (row i of `X` goes with
            element i of `y`), not by index label.

        Returns
        -------
        X_transformed : same as input type
            `X` with rows containing missing values removed.

        y_transformed : pandas Series, or ndarray when `X` was an ndarray
            `y` with corresponding rows removed. Only returned (as the second element of a
            tuple) when `y` is provided. A non-Series `y` is wrapped in a Series named
            'target' that carries the index of `X`.

        Raises
        ------
        ValueError
            If the columns differ from those seen in `fit`, if a column in `columns_list`
            is absent, or if `y` does not have one element per row of `X`.
        TypeError
            If `X` is neither a DataFrame nor a NumPy array.
        """
        is_array = isinstance(X, np.ndarray)
        X = self._validate_input(X)

        # Check for consistency in columns
        if X.columns.tolist() != self.feature_names_in_:
            raise ValueError("The columns in the input data during transform differ from those during fit.")

        # Determine columns to check for NaN
        cols_to_check = self.columns_list if self.columns_list else X.columns

        # Check if specified columns exist
        missing_cols = [col for col in cols_to_check if col not in X.columns]
        if missing_cols:
            raise ValueError(f"The following columns were not found: {missing_cols}")

        # Positional boolean mask: True for rows with no NaN in any checked column.
        # A positional mask (rather than concatenating X and y on their index labels)
        # keeps X and y paired correctly even when their indexes differ.
        keep = X.loc[:, list(cols_to_check)].notna().all(axis=1).to_numpy()
        X_transformed = X.loc[keep]

        if y is None:
            if self.reset_index:
                X_transformed = X_transformed.reset_index(drop=True)
            # Convert back to original type if input was an array
            return X_transformed.values if is_array else X_transformed

        y_is_series = isinstance(y, pd.Series)
        y_series = y if y_is_series else pd.Series(y, name='target')
        if len(y_series) != len(X):
            raise ValueError(
                f"X and y must have the same number of rows; got {len(X)} and {len(y_series)}."
            )
        if not y_is_series:
            # Freshly built Series: give it X's labels so both outputs share one index.
            y_series.index = X.index

        # Boolean-array indexing on a Series selects by position.
        y_transformed = y_series[keep]

        # Reset index if required
        if self.reset_index:
            X_transformed = X_transformed.reset_index(drop=True)
            y_transformed = y_transformed.reset_index(drop=True)

        # Convert back to original type if input was an array
        if is_array:
            return X_transformed.values, y_transformed.values

        return X_transformed, y_transformed

    def _validate_input(self, X, use_fitted_names: bool = True):
        """
        Return `X` as a DataFrame (a copy when it already is one).

        Parameters
        ----------
        X : DataFrame or ndarray
            Input to validate. Arrays must be 2-D.
        use_fitted_names : bool, optional (default=True)
            When True and the transformer has been fitted, an array input is labelled
            with the column names recorded during `fit`.

        Raises
        ------
        TypeError
            If `X` is neither a DataFrame nor a NumPy array.
        ValueError
            If an array is not 2-D or its width differs from the fitted column count.
        """
        if isinstance(X, pd.DataFrame):
            return X.copy()
        if isinstance(X, np.ndarray):
            # Deliberately not sklearn's check_array: with its default settings it rejects
            # arrays containing NaN, which is exactly the input this transformer must accept.
            if X.ndim != 2:
                raise ValueError(f"Expected 2D array, got {X.ndim}D array instead.")
            names = getattr(self, 'feature_names_in_', None) if use_fitted_names else None
            if names is not None and len(names) != X.shape[1]:
                raise ValueError(
                    f"X has {X.shape[1]} columns, but DropNaN was fitted with {len(names)} columns."
                )
            return pd.DataFrame(X, columns=names)
        raise TypeError("Input must be a pandas DataFrame or a NumPy array.")

    def get_feature_names_out(self, input_features=None) -> List[str]:
        """Return `input_features` when given, otherwise the column names recorded in `fit`."""
        return self.feature_names_in_ if input_features is None else input_features
  
    
class SeniorStatusTransformer(BaseEstimator, TransformerMixin):
    """
    Recode the values 0 and 1 of one column as the labels 'No' and 'Yes'.

    Parameters
    ----------
    column : str, optional (default='customer_senior')
        Name of the column to recode.
    """

    def __init__(self, column='customer_senior'):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns `self` unchanged."""
        return self  # Nothing to fit

    def transform(self, X):
        """
        Return a copy of `X` whose `column` holds 'No'/'Yes' instead of 0/1.

        Values other than 0 and 1 (including NaN and labels that were already recoded)
        are kept as they are rather than being replaced by NaN, so applying the
        transformer twice does not wipe the column.

        Parameters
        ----------
        X : DataFrame
            Data containing `self.column`; a missing column raises KeyError.

        Returns
        -------
        DataFrame
            Copy of `X` with the recoded column stored as object dtype.
        """
        labels = {0: 'No', 1: 'Yes'}
        X = X.copy()
        # dict.get with the value itself as fallback leaves unmapped entries untouched.
        X[self.column] = X[self.column].map(lambda value: labels.get(value, value)).astype('object')
        return X


class PipelineWithY(Pipeline):
    """
    Pipeline variant whose steps may change the target `y` together with `X`.

    A step takes part in target handling when its `transform` method declares a
    parameter named ``y`` and returns a 2-tuple ``(X, y)``. Any other step is used as
    an ordinary transformer on `X` only. Whether a step accepts `y` is decided from the
    method signature, so a TypeError raised inside a transformer is never swallowed and
    `y` is never bound positionally to an unrelated parameter.
    """

    @staticmethod
    def _accepts_y(method) -> bool:
        """Return True when `method` declares a parameter named ``y``."""
        try:
            return 'y' in inspect.signature(method).parameters
        except (TypeError, ValueError):
            # Signature not introspectable: treat the method as not accepting y.
            return False

    @staticmethod
    def _is_skipped(step) -> bool:
        """Return True for placeholder steps (None or the string 'passthrough')."""
        return step is None or (isinstance(step, str) and step == 'passthrough')

    def fit(self, X, y=None, **fit_params):
        """
        Fit every step in order, feeding each one the output of the previous step.

        Parameters
        ----------
        X : data accepted by the first step
        y : target, optional (default=None)
        **fit_params
            Accepted for signature compatibility with `Pipeline.fit`; not forwarded to
            the steps.

        Returns
        -------
        self : PipelineWithY
        """
        for _, step in self.steps:
            if self._is_skipped(step):
                continue

            if hasattr(step, 'transform') and self._accepts_y(step.transform):
                # Fit and transform separately: the inherited TransformerMixin.fit_transform
                # calls transform(X) without y, which would leave y unfiltered when the
                # step drops rows.
                step.fit(X, y)
                result = step.transform(X, y=y)
                if isinstance(result, tuple) and len(result) == 2:
                    X, y = result
                else:
                    X = result
            elif hasattr(step, 'fit_transform'):
                if self._accepts_y(step.fit_transform):
                    result = step.fit_transform(X, y)
                    if isinstance(result, tuple) and len(result) == 2:
                        X, y = result
                    else:
                        X = result
                else:
                    # Transformer doesn't accept y
                    X = step.fit_transform(X)
            elif hasattr(step, 'transform'):
                X = step.fit(X, y).transform(X)
            else:
                # Estimator without transform: it only needs to be fitted.
                step.fit(X, y)
        return self

    def transform(self, X, y=None):
        """
        Apply every fitted step that has a `transform` method.

        Parameters
        ----------
        X : data accepted by the first step
        y : target, optional (default=None)
            Passed to the steps whose `transform` declares ``y``.

        Returns
        -------
        X_transformed
            When `y` is None at the end of the chain.
        (X_transformed, y_transformed) : tuple
            When `y` is not None at the end of the chain.
        """
        for _, step in self.steps:
            if self._is_skipped(step) or not hasattr(step, 'transform'):
                continue

            if self._accepts_y(step.transform):
                result = step.transform(X, y=y)
                if isinstance(result, tuple) and len(result) == 2:
                    X, y = result
                else:
                    X = result
            else:
                # Transformer doesn't accept y
                X = step.transform(X)

        if y is not None:
            return X, y
        return X




#-------- Defining input variables --------#

# Column roles. NOTE: the name `id` shadows Python's built-in id() in this notebook.
id = "id"
target = "churn"
senior_col = "customer_senior"

# Columns discarded by the 'col_drop' entry of the ColumnTransformer.
drop_columns = ["phone_subscription", "streaming_tv"]

# Columns that must not be picked up by the dtype-based selections below.
excluded_columns = [*drop_columns, id, target, senior_col]

# Categorical features: the senior flag (added by hand because it is excluded above)
# followed by every remaining object-dtype column of df.
cat_columns = [senior_col] + [
    name
    for name in df.select_dtypes(include=['object']).columns
    if name not in excluded_columns
]

# Numeric features: every remaining numeric-dtype column of df.
num_columns = [
    name
    for name in df.select_dtypes(include=[np.number]).columns
    if name not in excluded_columns
]


#-------- Defining the transformers --------#

# Categorical branch: one-hot encoding returned as a dense array.
cat_transformer = Pipeline([('encoder', OneHotEncoder(sparse_output=False))])

# Numeric branch: standard scaling.
num_transformer = Pipeline([('scaler', StandardScaler())])


#-------- Defining the preparation pipeline --------#

# Column-wise preparation. Columns not named in any entry are passed through unchanged
# (remainder='passthrough'), and output names carry no transformer prefix.
# NOTE(review): the `id` column is in none of the lists, so it is passed through as-is;
# confirm it is removed before a model is trained on the result.
preparation = ColumnTransformer(
    [
        ('col_drop', 'drop', drop_columns),
        ('cat', cat_transformer, cat_columns),
        ('num', num_transformer, num_columns),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# Full chain: drop rows with NaN in any column -> recode the senior flag -> column-wise preparation.
full_pipeline = PipelineWithY(
    [
        ('drop_nan', DropNaN(columns_list=None, reset_index=True)),
        ('senior_status', SeniorStatusTransformer(column=senior_col)),
        ('preparation', preparation),
    ]
)

# MODEL

### Training

Preparing the dataset with the Pipeline

In [31]:
from sklearn.model_selection import train_test_split

# Separate the features from the target and encode the target labels as 0/1.
X = df.drop(columns=[target])
y = df[target].map({'No': 0, 'Yes': 1})

# Split data into training and test sets (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the pipeline on the training data only, then apply it to both splits.
full_pipeline.fit(X_train, y_train)
X_train_transformed, y_train_transformed = full_pipeline.transform(X_train, y_train)
X_test_transformed, y_test_transformed = full_pipeline.transform(X_test, y_test)

# Verify the shapes
print(f"Training features shape: {X_train_transformed.shape}")
print(f"Test features shape: {X_test_transformed.shape}")


Training features shape: (5624, 42)
Test features shape: (1408, 42)


In [35]:
# Show the transformed training matrix with the output column names reported by the
# 'preparation' step.
pd.DataFrame(
    data=X_train_transformed,
    columns=full_pipeline.named_steps['preparation'].get_feature_names_out(),
)

Unnamed: 0,customer_senior_No,customer_senior_Yes,gender_Female,gender_Male,customer_partner_No,customer_partner_Yes,dependent_family_No,dependent_family_Yes,multiple_line_No,multiple_line_No phone service,...,paperless_billing_No,paperless_billing_Yes,payment_method_type_Bank transfer (automatic),payment_method_type_Credit card (automatic),payment_method_type_Electronic check,payment_method_type_Mailed check,month_tenure,amount_charges_monthly,amount_total_charges,id
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,-0.468357,-0.002049,-0.42173,4223-BKEOR
1,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.883777,1.073455,1.254246,6035-RIIOM
2,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,-1.287832,-1.378428,-1.002101,3797-VTIDR
3,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,-1.164911,0.175817,-0.907313,2568-BRGYX
4,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,-1.04199,0.154207,-0.782078,4291-SHSBH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5619,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,-1.287832,1.000314,-0.970483,0684-AOSIH
5620,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,-0.386409,0.870655,-0.041033,5982-PSMKW
5621,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,-0.837121,-1.454893,-0.87722,8044-BGWPI
5622,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,-0.837121,1.148258,-0.482116,7450-NWRTR


Training the model

### Results

Predictions

Feature importance in the model

### Performance

In [None]:
from sklearn.metrics import classification_report

