## Customer Churn Prediction Model

Using the [Telco customer churn data](https://www.kaggle.com/code/mechatronixs/telco-churn-prediction-feature-engineering-eda/data) from kaggle, train a machine learning model to predict customer churn.

In [14]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

import warnings
from sklearn.base import TransformerMixin, BaseEstimator
import xgboost as xgb
from sklearn.metrics import mean_absolute_error

In [5]:
# get training data
train = pd.read_csv("./data/training_data.csv")
# drop customer ID: not a feature for training 
train.drop("customerID", axis=1, inplace=True)

# getting validation data
val = pd.read_csv("./data/validation_data.csv")

In [51]:
train.dropna()
trainY = train["Churn"].map({'Yes':1,'No':0})
trainX = train.drop("Churn", axis=1)

val.dropna()
valX = val.drop("Churn", axis=1)
valY = val["Churn"]

valY.head()

0    No
1    No
2    No
3    No
4    No
Name: Churn, dtype: object

In [11]:
def best_pipeline_intown(trainX):

    all_numerical_features = trainX.select_dtypes(include=["int64", "float64"]).columns
    all_categorical_features = trainX.select_dtypes(include=[object]).columns
    numerical_features = [value for value in all_numerical_features]
    categorical_features = [value for value in all_categorical_features]

    # numerical_features.remove("year")
    # categorical_features.append("year")

    # Preprocessing for numerical data
    numerical_transformer = Pipeline(
        steps=[
            ("imputer", KNNImputer(n_neighbors=5)),
            ("scaler", StandardScaler()),
        ]
    )

    # Preprocessing for categorical data
    categorical_transformer = OneHotEncoder(handle_unknown="ignore")

    # Bundle Preprocessing for numerical and categorical data
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numerical_transformer, numerical_features),
            ("cat", categorical_transformer, categorical_features),
        ],
        remainder="passthrough",
    )

    model = RandomForestClassifier(n_estimators=300, max_depth=5)
    xgbr = xgb.XGBRFClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.1,
        min_child_weight=5,
        alpha=0.3,
    )

    pipeline = Pipeline(
        [
            ("preprocessor", preprocessor),
            # (
            #     "debugger",
            #     Debugger(),
            # ),
            ("model", xgbr),
        ]
    )

    return pipeline


In [52]:
pipeline = best_pipeline_intown(trainX)

pipeline.fit(trainX, trainY)

pred_Y = pipeline.predict(valX)

pred_Y.info()
# print(mean_absolute_error(valY, pred_Y))

# print("model score: %.3f" % pipeline.score(valX, valY))

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''