In [10]:
import os

In [11]:
%pwd

'c:\\telco churn project'

In [12]:
import os

# Set working directory to project root
os.chdir("c:/telco churn project")


In [13]:
%pwd

'c:\\telco churn project'

In [14]:
import yaml

with open("config/config.yaml", "r") as f:
    config = yaml.safe_load(f)

config


{'artifacts_root': 'artifacts',
 'data_ingestion': {'root_dir': 'artifacts/data_ingestion',
  'source_file': 'data/WA_Fn-UseC_-Telco-Customer-Churn.csv',
  'raw_data': 'artifacts/data_ingestion/raw.csv',
  'train_data': 'artifacts/data_ingestion/train.csv',
  'test_data': 'artifacts/data_ingestion/test.csv',
  'test_size': 0.2,
  'random_state': 42},
 'data_transformation': {'root_dir': 'artifacts/data_transformation',
  'train_data_path': 'artifacts/data_ingestion/train.csv',
  'test_data_path': 'artifacts/data_ingestion/test.csv',
  'transformed_train_path': 'artifacts/data_transformation/train_transformed.csv',
  'transformed_test_path': 'artifacts/data_transformation/test_transformed.csv',
  'transformer_object_path': 'artifacts/data_transformation/transformer.pkl'}}

In [15]:
import pandas as pd
import numpy as np
import os


In [16]:
train_df = pd.read_csv(config["data_transformation"]["train_data_path"])
test_df = pd.read_csv(config["data_transformation"]["test_data_path"])

train_df.shape, test_df.shape


((5634, 21), (1409, 21))

In [17]:
# Separate categorical and numerical features
categorical_cols = train_df.select_dtypes(include="object").columns.tolist()
numerical_cols = train_df.select_dtypes(exclude="object").columns.tolist()

print("Categorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols)


Categorical Columns: ['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges', 'Churn']
Numerical Columns: ['SeniorCitizen', 'tenure', 'MonthlyCharges']


In [18]:
# Check for missing values
print(train_df.isnull().sum())
print(test_df.isnull().sum())


customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [19]:
train_df["TotalCharges"] = pd.to_numeric(train_df["TotalCharges"], errors='coerce')
test_df["TotalCharges"] = pd.to_numeric(test_df["TotalCharges"], errors='coerce')


In [20]:
train_df.dtypes


customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [21]:
train_ids = train_df["customerID"]
test_ids = test_df["customerID"]


In [22]:
train_df.drop(columns=["customerID"], inplace=True)
test_df.drop(columns=["customerID"], inplace=True)


In [23]:
# Split into features (X) and target (y)
X_train = train_df.drop(columns=["Churn"])
y_train = train_df["Churn"]

X_test = test_df.drop(columns=["Churn"])
y_test = test_df["Churn"]


In [24]:
categorical_cols = X_train.select_dtypes(include="object").columns.tolist()
numerical_cols = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()


In [25]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define individual transformers
cat_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
num_transformer = StandardScaler()

# Combine them using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, numerical_cols),
        ("cat", cat_transformer, categorical_cols)
    ]
)


In [26]:
preprocessor.fit(X_train)


0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [27]:
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

X_train_transformed.shape, X_test_transformed.shape


((5634, 45), (1409, 45))

In [28]:
import pandas as pd

# Get feature names
cat_features = preprocessor.named_transformers_["cat"].get_feature_names_out(categorical_cols)
num_features = numerical_cols

all_features = list(num_features) + list(cat_features)

# Create DataFrames
X_train_df = pd.DataFrame(X_train_transformed, columns=all_features)
X_test_df = pd.DataFrame(X_test_transformed, columns=all_features)

# Add back the target column
train_transformed = pd.concat([X_train_df, y_train.reset_index(drop=True)], axis=1)
test_transformed = pd.concat([X_test_df, y_test.reset_index(drop=True)], axis=1)


In [29]:
import joblib
import os

# Ensure the directory exists
os.makedirs(config["data_transformation"]["root_dir"], exist_ok=True)

# Save the DataFrames as CSV
train_transformed.to_csv(config["data_transformation"]["transformed_train_path"], index=False)
test_transformed.to_csv(config["data_transformation"]["transformed_test_path"], index=False)

# Save the transformer object
joblib.dump(preprocessor, config["data_transformation"]["transformer_object_path"])


['artifacts/data_transformation/transformer.pkl']