# Data Processing and Modeling Preparation

In [71]:
import pandas as pd
from sklearn.model_selection   import train_test_split
from sklearn.compose           import ColumnTransformer
from sklearn.preprocessing     import StandardScaler, OneHotEncoder
import joblib

In [72]:
# 1. Read the dataset written by the EDA notebook
processed_csv = '../data/processed/Vodafone_Customer_Churn_PostEDAProcessed.csv'
df = pd.read_csv(processed_csv)

In [73]:
# Count missing cells per column and display only the columns that have any
# (an empty Series means nothing is missing).
missing_values = df.isna().sum()
missing_values.loc[missing_values.gt(0)]

Series([], dtype: int64)

In [74]:
# Encode the target variable Churn as 0 (No) / 1 (Yes).
# Series.map turns every value that is missing from the mapping into NaN, so
# running this cell a second time (when the column already holds 0/1) would
# silently wipe the target. Skip the work when the column is already encoded
# and fail loudly on labels the mapping does not cover.
churn_codes = {'No': 0, 'Yes': 1}
if not df['Churn'].isin(list(churn_codes.values())).all():
    churn_encoded = df['Churn'].map(churn_codes)
    if churn_encoded.isna().any():
        unexpected_labels = df.loc[churn_encoded.isna(), 'Churn'].unique().tolist()
        raise ValueError(f"Unexpected Churn labels: {unexpected_labels}")
    df['Churn'] = churn_encoded.astype('int64')

In [75]:
# 2. Separate the label from the predictors
#    (customerID is dropped from the predictors together with the label)
target = 'Churn'
y = df[target]
X = df.drop(columns=['customerID', target])

In [76]:
# 3. Hold out 20% of the rows as a test set.
#    stratify=y keeps the class proportions of y the same in both parts;
#    the fixed random_state makes the split reproducible.
split_options = dict(test_size=0.20, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [77]:
# 4. Identify feature types
# 'number' selects every numeric dtype (int32, float32, nullable Int64, ...),
# not only int64/float64, so a numeric column stored with another width is
# still scaled.
numeric_features     = X_train.select_dtypes(include='number').columns.tolist()
categorical_features = X_train.select_dtypes(include=['object','category']).columns.tolist()

# The ColumnTransformer built in the next step uses remainder='drop', so a
# column that lands in neither list would vanish from the model inputs without
# any warning. Make that case visible here instead.
unassigned_features = [
    name for name in X_train.columns
    if name not in numeric_features and name not in categorical_features
]
assert not unassigned_features, (
    f"Columns that are neither numeric nor categorical: {unassigned_features}"
)

In [78]:
# 5. Assemble the preprocessor and fit it on the training split only:
#    numeric columns are standardized, categorical columns are one-hot encoded
#    (categories unseen during fit are ignored at transform time).
column_steps = [
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)
preprocessor.fit(X_train)

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [79]:
# 6. Apply the already-fitted preprocessor to the train and test splits
X_train_processed, X_test_processed = (
    preprocessor.transform(split) for split in (X_train, X_test)
)

In [80]:
# 6b. Wrap the transformed arrays back into DataFrames.
#     Column order follows the transformer order in the preprocessor:
#     the numeric columns first, then the one-hot encoded columns.
onehot_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
feature_names = numeric_features + onehot_names.tolist()

# Reuse the original row labels so features and targets stay aligned.
X_train_processed = pd.DataFrame(X_train_processed, index=X_train.index, columns=feature_names)
X_test_processed = pd.DataFrame(X_test_processed, index=X_test.index, columns=feature_names)

In [81]:
# 7. Save the fitted preprocessor and the four splits to disk
#    for 03_modeling_and_evaluation.ipynb.
joblib.dump(preprocessor, '../data/processed/preprocessor.joblib')

# index=True writes the row labels as the first CSV column.
for csv_path, split in (
    ('../data/processed/X_train_processed.csv', X_train_processed),
    ('../data/processed/X_test_processed.csv', X_test_processed),
    ('../data/processed/y_train.csv', y_train),
    ('../data/processed/y_test.csv', y_test),
):
    split.to_csv(csv_path, index=True)

# Verification Steps

In [82]:
# 1. Load artifacts
# NOTE: from here on X_train / X_test refer to the *processed* matrices read
# back from disk, not the raw splits created earlier. Re-run the train/test
# split cell before re-running any of the preprocessing cells above.
preprocessor = joblib.load('../data/processed/preprocessor.joblib')

# The first CSV column holds the row labels written by to_csv(index=True).
X_train = pd.read_csv('../data/processed/X_train_processed.csv', index_col=0)
X_test  = pd.read_csv('../data/processed/X_test_processed.csv',  index_col=0)

# squeeze("columns") collapses the single data column into a Series and never
# the row axis, so the result stays a Series even for a one-row file
# (a bare squeeze() would return a scalar in that case).
y_train = pd.read_csv('../data/processed/y_train.csv', index_col=0).squeeze("columns")
y_test  = pd.read_csv('../data/processed/y_test.csv', index_col=0).squeeze("columns")

In [83]:
# 2. Print the shape of every reloaded split
#    (the padded labels keep the printed columns aligned)
for label, split in (
    ("X_train:", X_train),
    ("X_test: ", X_test),
    ("y_train:", y_train),
    ("y_test: ", y_test),
):
    print(label, split.shape)

X_train: (5634, 46)
X_test:  (1409, 46)
y_train: (5634,)
y_test:  (1409,)


In [84]:
# 3. Show the first rows of both feature matrices, then the relative
#    frequency of each class in the training labels
display(X_train.head(), X_test.head())
class_shares = y_train.value_counts(normalize=True)
print(class_shares)

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_Non-Senior,SeniorCitizen_Senior,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
3738,0.102371,-0.521976,-0.26229,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3151,-0.711743,0.337478,-0.503674,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4860,-0.793155,-0.809013,-0.749929,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3867,-0.26398,0.284384,-0.172753,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3810,-1.281624,-0.676279,-0.989426,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_Non-Senior,SeniorCitizen_Senior,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
437,1.608483,1.629976,2.706872,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2280,-0.996684,1.168725,-0.610302,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2235,0.346606,0.445324,0.4001,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4460,-0.589626,0.440347,-0.364487,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3761,1.608483,0.588013,1.588437,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


Churn
0    0.734647
1    0.265353
Name: proportion, dtype: float64


In [85]:
# 4. Count the missing cells in each reloaded feature matrix
missing_in_train = X_train.isna().to_numpy().sum()
missing_in_test = X_test.isna().to_numpy().sum()
print("Missing in X_train:", missing_in_train)
print("Missing in X_test: ", missing_in_test)

Missing in X_train: 0
Missing in X_test:  0


In [86]:
# 5. Spot-check one scaled numeric column: its mean should be ≈0 and its std ≈1
col = X_train.columns[1]  # second column of the reloaded frame
scaled_values = X_train[col]
col_mean, col_std = scaled_values.mean(), scaled_values.std()
print(f"{col} → mean: {col_mean:.3f}, std: {col_std:.3f}")

MonthlyCharges → mean: -0.000, std: 1.000
