In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.utils.validation import check_array

from typing import List, Union, Optional
import pandas as pd
import numpy as np


#-------- Defining the custom transformers --------#

class DropNaN(BaseEstimator, TransformerMixin):
    def __init__(self, columns_list: Optional[List[str]] = None, reset_index: bool = False) -> None:
        """
        Transformer that drops rows containing NaN values in specified columns or all columns.

        Parameters
        ----------
        columns_list : list of str, optional (default=None)
            List of column names to check for NaN values. If None, all columns are checked.
        reset_index : bool, optional (default=False)
            If True, resets the index of the returned DataFrame after dropping rows.
        """
        self.columns_list = columns_list
        self.reset_index = reset_index

    def fit(self, X: Union[pd.DataFrame, np.ndarray], y=None) -> 'DropNaN':
        # Validate input
        X = self._validate_input(X)

        # Store column names
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X: Union[pd.DataFrame, np.ndarray], y=None) -> Union[pd.DataFrame, np.ndarray]:
        """
        Removes rows containing missing values in the specified columns or in all columns if none are specified.

        Parameters
        ----------
        X : {array-like, DataFrame}, shape (n_samples, n_features)
            The data to clean.

        y : array-like, shape (n_samples,), optional (default=None)
            Target values (ignored in this transformer).

        Returns
        -------
        X_transformed : same as input type
            `X` with rows containing missing values removed.

        y_transformed : same as y
            `y` with corresponding rows removed if `y` is provided.
        """
        is_array = isinstance(X, np.ndarray)
        X = self._validate_input(X)

        # Check for consistency in columns
        if X.columns.tolist() != self.feature_names_in_:
            raise ValueError("The columns in the input data during transform differ from those during fit.")

        # Determine columns to check for NaN
        cols_to_check = self.columns_list if self.columns_list else X.columns

        # Check if specified columns exist
        missing_cols = [col for col in cols_to_check if col not in X.columns]
        if missing_cols:
            raise ValueError(f"The following columns were not found: {missing_cols}")

        # Drop rows with NaN values in specified columns
        if y is not None:
            y = pd.Series(y, name='target') if not isinstance(y, pd.Series) else y
            Xy = pd.concat([X, y], axis=1)
            Xy_transformed = Xy.dropna(subset=cols_to_check)
            X_transformed = Xy_transformed.drop(columns=[y.name])
            y_transformed = Xy_transformed[y.name]

            # Reset index if required
            if self.reset_index:
                X_transformed.reset_index(drop=True, inplace=True)
                y_transformed.reset_index(drop=True, inplace=True)

            # Convert back to original type if input was an array
            if is_array:
                X_transformed = X_transformed.values
                y_transformed = y_transformed.values

            return X_transformed, y_transformed
        else:
            X_transformed = X.dropna(subset=cols_to_check)

            # Reset index if required
            if self.reset_index:
                X_transformed.reset_index(drop=True, inplace=True)

            # Convert back to original type if input was an array
            if is_array:
                X_transformed = X_transformed.values

            return X_transformed

    def _validate_input(self, X):
        # Validate X and convert to DataFrame if necessary
        if isinstance(X, pd.DataFrame):
            return X.copy()
        elif isinstance(X, np.ndarray):
            X = check_array(X, ensure_2d=True, allow_nd=False, dtype=None)
            return pd.DataFrame(X, columns=getattr(self, 'feature_names_in_', None))
        else:
            raise TypeError("Input must be a pandas DataFrame or a NumPy array.")

    def get_feature_names_out(self, input_features=None) -> List[str]:
        # Return the feature names
        return self.feature_names_in_ if input_features is None else input_features


class SeniorStatusTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, column='customer_senior'):
        self.column = column

    def fit(self, X, y=None):
        return self  # Nothing to fit

    def transform(self, X):
        X = X.copy()
        X[self.column] = X[self.column].map({0: 'No', 1: 'Yes'}).astype('object')
        return X


class PipelineWithY(Pipeline):
    def fit(self, X, y=None, **fit_params):
        for name, transform in self.steps:
            if hasattr(transform, 'fit_transform'):
                # Attempt to fit_transform with both X and y
                try:
                    result = transform.fit_transform(X, y)
                    if isinstance(result, tuple) and len(result) == 2:
                        X, y = result
                    else:
                        X = result
                except TypeError:
                    # Transformer doesn't accept y
                    X = transform.fit_transform(X)
            else:
                X = transform.fit(X, y).transform(X)
        return self

    def transform(self, X, y=None):
        for name, transform in self.steps:
            if hasattr(transform, 'transform'):
                try:
                    result = transform.transform(X, y)
                    if isinstance(result, tuple) and len(result) == 2:
                        X, y = result
                    else:
                        X = result
                except TypeError:
                    # Transformer doesn't accept y
                    X = transform.transform(X)
        if y is not None:
            return X, y
        else:
            return X


def create_full_pipeline(df):
    #-------- Defining input variables --------#
    id_col = "id"
    target_col = "churn"
    senior_col = "customer_senior"

    drop_columns = ["phone_subscription", "streaming_tv"]
    excluded_columns = drop_columns + [id_col, target_col, senior_col]

    # Identifying categorical and numerical columns to process
    cat_columns = [senior_col] + [col for col in df.select_dtypes(include=['object']).columns if col not in excluded_columns]
    num_columns = [col for col in df.select_dtypes(include=[np.number]).columns if col not in excluded_columns]

    #-------- Defining the transformers --------#
    cat_transformer = Pipeline(
        steps=[
            ('encoder', OneHotEncoder(sparse_output=False))
        ]
    )

    num_transformer = Pipeline(
        steps=[
            ('scaler', StandardScaler())
        ]
    )

    #-------- Defining the preparation pipeline --------#
    preparation = ColumnTransformer(
        transformers=[
            ('col_drop', 'drop', drop_columns),
            ('cat', cat_transformer, cat_columns),
            ('num', num_transformer, num_columns)
        ],
        remainder='passthrough',
        verbose_feature_names_out=False
    )

    #-------- Defining the full pipeline --------#
    full_pipeline = PipelineWithY(
        steps=[
            ('drop_nan', DropNaN(columns_list=None, reset_index=True)),
            ('senior_status', SeniorStatusTransformer(column=senior_col)),
            ('preparation', preparation)
        ]
    )

    return full_pipeline

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sys
import os

# Add the root directory to the path
#sys.path.append(os.path.abspath(".."))

#from utils.pipeline import create_full_pipeline

data = pd.read_csv("data.csv", delimiter=",")
full_pipeline = create_full_pipeline(data)

X = data.drop(columns=["churn"])
y = data["churn"]
y = y.map({'No': 0, 'Yes': 1})

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline on the training data
full_pipeline.fit(X_train, y_train)

# Transform the training data
X_train_transformed, y_train_transformed = full_pipeline.transform(X_train, y_train)

# Transform the test data
X_test_transformed, y_test_transformed = full_pipeline.transform(X_test, y_test)

X_train_transformed = pd.DataFrame(X_train_transformed, columns=full_pipeline.named_steps['preparation'].get_feature_names_out())
#y_train_transformed = pd.DataFrame(y_train_transformed)

X_test_transformed = pd.DataFrame(X_test_transformed, columns=full_pipeline.named_steps['preparation'].get_feature_names_out())
#y_test_transformed = pd.DataFrame(y_test_transformed)

# Keep the ids
id_train = X_train_transformed["id"]
X_train_transformed.drop(columns="id", inplace=True)

id_test = X_test_transformed["id"]
X_test_transformed.drop(columns="id", inplace=True)

In [7]:
# Clustering on the full transformed dataset (combine train and test)
X_transformed = pd.concat([X_train_transformed, X_test_transformed], ignore_index=True)
id_column = pd.concat([id_train, id_test], ignore_index=True)

In [8]:
# Split back into train and test sets after clustering
X_train_transformed = X_transformed.iloc[:len(X_train_transformed), :]
X_test_transformed = X_transformed.iloc[len(X_train_transformed):, :]
# Convert all object columns in X_train_transformed and X_test_transformed to numeric
X_train_transformed = X_train_transformed.apply(pd.to_numeric, errors='coerce')
X_test_transformed = X_test_transformed.apply(pd.to_numeric, errors='coerce')

#### XG Boost with oversampling on the minority class ad Randomized Search

In [10]:
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score

# Oversample the minority class with SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train_transformed, y_train_transformed)

# Define the parameter grid for RandomizedSearchCV
param_dist = {
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 300, 500],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'scale_pos_weight': [1, 2, 3],
    'gamma': [0, 0.1, 0.5, 1],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 2, 5]
}

# Initialize the XGBoost model
xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42, eval_metric="logloss")

# Use RandomizedSearchCV with SMOTE-balanced data
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=20,  # Adjust based on computational resources
    scoring='roc_auc',
    cv=5,
    random_state=42,
    verbose=1,
    n_jobs=-1
)

# Fit RandomizedSearchCV
random_search.fit(X_res, y_res)

# Best parameters and performance
print("Best Parameters:", random_search.best_params_)
print("Best AUC-ROC Score:", random_search.best_score_)

# Evaluate on the test set
y_pred = random_search.best_estimator_.predict(X_test_transformed)
print("Test AUC-ROC:", roc_auc_score(y_test_transformed, random_search.best_estimator_.predict_proba(X_test_transformed)[:, 1]))
print(classification_report(y_test_transformed, y_pred))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'subsample': 1.0, 'scale_pos_weight': 1, 'reg_lambda': 5, 'reg_alpha': 0, 'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 0.8}
Best AUC-ROC Score: 0.9325652437465546
Test AUC-ROC: 0.8538990558340133
              precision    recall  f1-score   support

           0       0.87      0.86      0.86      1035
           1       0.62      0.63      0.63       373

    accuracy                           0.80      1408
   macro avg       0.74      0.75      0.75      1408
weighted avg       0.80      0.80      0.80      1408



In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

# Define models and parameter grids for grid search
models = {
    'Logistic Regression': {
        'model': LogisticRegression(solver='liblinear', random_state=42),
        'params': {
            'C': [0.01, 0.1, 1, 10, 100],
            'penalty': ['l1', 'l2']
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [100, 300, 500],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    },
    'SVM': {
        'model': SVC(probability=True, random_state=42),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto']
        }
    },
    'XGBoost': {
        'model': xgb.XGBClassifier(objective='binary:logistic', random_state=42, eval_metric="logloss"),
        'params': {
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [4, 6, 8],
            'n_estimators': [100, 300, 500],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0]
        }
    },
    'LightGBM': {
        'model': lgb.LGBMClassifier(objective='binary', random_state=42),
        'params': {
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [-1, 10, 20],
            'n_estimators': [100, 300, 500],
            'num_leaves': [31, 40, 50],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0]
        }
    }
}

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



#### Different model

In [14]:
from sklearn.metrics import roc_auc_score, classification_report
import pandas as pd

# Store results
results = []

for model_name, mp in models.items():
    print(f"Running Grid Search for {model_name}...")
    grid_search = GridSearchCV(
        estimator=mp['model'],
        param_grid=mp['params'],
        scoring='roc_auc',
        cv=5,
        verbose=1,
        n_jobs=-1
    )

    # Fit the grid search to the data
    grid_search.fit(X_train_transformed, y_train_transformed)

    # Get the best estimator and performance
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test_transformed)
    auc_score = roc_auc_score(y_test_transformed, best_model.predict_proba(X_test_transformed)[:, 1])

    print(f"Best Parameters for {model_name}: {grid_search.best_params_}")
    print(f"{model_name} Test AUC-ROC: {auc_score}")
    print(f"{model_name} Classification Report:\n", classification_report(y_test_transformed, y_pred))

    # Save results
    results.append({
        'Model': model_name,
        'Best Parameters': grid_search.best_params_,
        'AUC-ROC': auc_score
    })

# Convert results to DataFrame for easy viewing
results_df = pd.DataFrame(results)
print(results_df)

Running Grid Search for Logistic Regression...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters for Logistic Regression: {'C': 10, 'penalty': 'l2'}
Logistic Regression Test AUC-ROC: 0.8623421533201227
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.90      0.88      1035
           1       0.69      0.60      0.64       373

    accuracy                           0.82      1408
   macro avg       0.78      0.75      0.76      1408
weighted avg       0.82      0.82      0.82      1408

Running Grid Search for Random Forest...
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters for Random Forest: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 500}
Random Forest Test AUC-ROC: 0.862865394827162
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.91   

  _data = np.array(data, dtype=dtype, copy=copy,


[LightGBM] [Info] Number of positive: 1496, number of negative: 4128
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000859 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 659
[LightGBM] [Info] Number of data points in the train set: 5624, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.266003 -> initscore=-1.014998
[LightGBM] [Info] Start training from score -1.014998
Best Parameters for LightGBM: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 300, 'num_leaves': 31, 'subsample': 0.8}
LightGBM Test AUC-ROC: 0.860832005802282
LightGBM Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.91      0.87      1035
           1       0.67      0.51      0.58       373

    accuracy                           0.81      1408
 