In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("../data/raw/Telco-Customer-Churn.csv")

# Convert TotalCharges to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Check missing values
df.isnull().sum()


customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [5]:
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)
df.isnull().sum()

C:\Users\Mohamad\AppData\Local\Temp\ipykernel_19392\2589325273.py:1: ChainedAssignmentError: A value is being set on a copy of a DataFrame or Series through chained assignment using an inplace method.
Such inplace method never works to update the original DataFrame or Series, because the intermediate object on which we are setting values always behaves as a copy (due to Copy-on-Write).

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' instead, to perform the operation inplace on the original object, or try to avoid an inplace operation using 'df[col] = df[col].method(value)'.

See the documentation for a more detailed explanation: https://pandas.pydata.org/pandas-docs/stable/user_guide/copy_on_write.html
  df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)


customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [8]:
# Drop identifier column
df = df.drop(columns=['customerID'])

# Separate features and target
X = df.drop(columns=['Churn'])
y = df['Churn']

KeyError: "['customerID'] not found in axis"

In [9]:
X.shape, y.shape


((7043, 19), (7043,))

In [12]:
## The customerID column was removed from the data due to its identifying nature and lack of predictive value. The features (X) and target variable (y) were then separated from each other.

In [13]:
categorical_features = X.select_dtypes(include='object').columns.tolist()
numerical_features = X.select_dtypes(exclude='object').columns.tolist()

categorical_features, numerical_features

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  categorical_features = X.select_dtypes(include='object').columns.tolist()


(['gender',
  'Partner',
  'Dependents',
  'PhoneService',
  'MultipleLines',
  'InternetService',
  'OnlineSecurity',
  'OnlineBackup',
  'DeviceProtection',
  'TechSupport',
  'StreamingTV',
  'StreamingMovies',
  'Contract',
  'PaperlessBilling',
  'PaymentMethod'],
 ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'])

In [14]:
X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)

In [15]:
X_encoded.shape

(7043, 30)

In [18]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_encoded[numerical_features] = scaler.fit_transform(
    X_encoded[numerical_features]
)

In [19]:
## Numerical features were normalized using StandardScaler to equalize their scale and improve the performance of scale-sensitive models.

In [26]:
# Final safety check for missing values
X_encoded = X_encoded.fillna(0)

In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [28]:
## The data was split with a ratio of 80% training and 20% testing, preserving the distribution of the target variable (stratify)

In [29]:
import os
import joblib

# Create processed data directory if it doesn't exist
os.makedirs("../data/processed", exist_ok=True)

joblib.dump(X_train, "../data/processed/X_train.pkl")
joblib.dump(X_test, "../data/processed/X_test.pkl")
joblib.dump(y_train, "../data/processed/y_train.pkl")
joblib.dump(y_test, "../data/processed/y_test.pkl")


['../data/processed/y_test.pkl']

In [30]:
X_train.isnull().sum().sum(), X_test.isnull().sum().sum()

(np.int64(0), np.int64(0))

In [32]:
## After encoding and scaling, a final check for missing values was performed and remaining values were replaced with zeros to ensure consistency with classical models