In [1]:
import pandas as pd
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from imblearn.over_sampling import SMOTE

In [2]:
# Load the raw Telco churn export.
df = pd.read_csv("../data/raw/telco-customer-churn.csv")

# TotalCharges is parsed with errors="coerce", so any entry that cannot be
# read as a number becomes NaN; those gaps are then filled with the column
# median.
# NOTE(review): the median is computed over the full frame, i.e. before the
# train/test split, so test rows influence the imputed value -- confirm this
# is acceptable or move the imputation into the preprocessing pipeline.
total_charges = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"] = total_charges.fillna(total_charges.median())

In [3]:
# Feature matrix: every column except the target.
X = df.drop(columns=["Churn"])

# Binary target: "Yes" -> 1, "No" -> 0.
churn_codes = {"Yes": 1, "No": 0}
y = df["Churn"].map(churn_codes)

# Series.map yields NaN for any value that is not a key of the mapping.
# Fail here with a clear message instead of letting NaN labels reach the
# stratified split / resampling steps further down.
unmapped = y.isna()
if unmapped.any():
    raise ValueError(
        "Unexpected Churn labels: "
        f"{sorted(df.loc[unmapped, 'Churn'].astype(str).unique())}"
    )

In [4]:
# Partition the feature columns by dtype: float64/int64 columns are treated
# as numeric, object columns as categorical. Column order follows X.
numeric_dtypes = ["float64", "int64"]
num_cols = list(X.select_dtypes(include=numeric_dtypes).columns)
cat_cols = list(X.select_dtypes(include=["object"]).columns)

In [5]:
# Remove customerID from the features and from the column lists that feed
# the preprocessor (presumably a per-row identifier -- verify against the
# data dictionary).
#
# errors="ignore" plus list filtering keeps this cell idempotent: re-running
# it after the column is already gone is a no-op instead of a KeyError.
X = X.drop(columns=["customerID"], errors="ignore")
num_cols = [col for col in num_cols if col != "customerID"]
cat_cols = [col for col in cat_cols if col != "customerID"]

In [6]:
# One-hot encoder for the categorical columns; handle_unknown="ignore" is
# set so categories not seen during fit do not raise at transform time.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

In [7]:
# Standardise numeric columns (centre and scale); the arguments below are
# the library defaults, spelled out for the reader.
numerical_transformer = StandardScaler(with_mean=True, with_std=True)

In [10]:
# Route each column group to its transformer: numeric columns go through the
# scaler, categorical columns through the one-hot encoder.
column_steps = [
    ("num", numerical_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [13]:
# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded so that it is reproducible.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [14]:
# Fit the preprocessor on the training rows only, then apply the already
# fitted transformers to the test rows. The order matters: transform() on
# X_test must come after fit_transform() on X_train.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [15]:
# Address class imbalance by oversampling with SMOTE (seeded for
# reproducibility). Only the training split is resampled; the test split is
# left as-is.
# NOTE(review): the inputs here are already one-hot encoded, and plain SMOTE
# synthesises rows by interpolating feature values, so synthetic rows may
# carry fractional one-hot entries -- consider SMOTENC on the unencoded
# categoricals; confirm whether this matters for the downstream model.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train_processed, y_train)
X_train_resampled, y_train_resampled = resampled

In [16]:
# Sanity check: per-class row counts after resampling.
class_counts = pd.Series(y_train_resampled).value_counts()
class_counts

Churn
0    4139
1    4139
Name: count, dtype: int64