In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read the cleaned Telco churn table from disk.
df = pd.read_csv('../data/telco_churn_cleaned.csv')

# Preview the rows as loaded, before any column is transformed.
print("Cleaned Dataset (First 5 Rows):")
print(df.head())

# Columns whose values are replaced by integer codes below; every column not
# listed here is left exactly as it was loaded.
categorical_columns = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod',
]

# Fit one LabelEncoder per column. The fitted encoders are kept in
# `label_encoders`, keyed by column name, so each mapping remains available
# after this cell. The integer codes are gathered first and written back into
# the frame in a single step once every column has been encoded.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target
# values (y); the integer codes also impose an ordering on the categories --
# confirm this is acceptable for the model that consumes these features.
label_encoders = {}
encoded_columns = {}
for column in categorical_columns:
    le = LabelEncoder()
    encoded_columns[column] = le.fit_transform(df[column])
    label_encoders[column] = le
df = df.assign(**encoded_columns)

# Preview the same rows again, now holding integer codes.
print("\nDataset After Encoding (First 5 Rows):")
print(df.head())

# Persist the encoded frame without the pandas index column.
df.to_csv('../data/telco_churn_encoded.csv', index=False)
print("\nEncoded dataset saved to '../data/telco_churn_encoded.csv'.")

Cleaned Dataset (First 5 Rows):
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV Stre

In [2]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise with StandardScaler.
numerical_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Fit the scaler on the selected columns, transform those same columns, and
# overwrite the originals in `df` with the scaled values.
# NOTE(review): the scaler is fitted on every row of `df`; if a train/test
# split happens later in the notebook, fitting on the training rows only would
# avoid leaking test statistics -- confirm against the downstream cells.
scaler = StandardScaler()
scaled_values = scaler.fit(df[numerical_columns]).transform(df[numerical_columns])
df[numerical_columns] = scaled_values

# Preview the frame with the scaled columns in place.
print("\nDataset After Scaling (First 5 Rows):")
print(df.head())

# Persist the scaled frame without the pandas index column.
df.to_csv('../data/telco_churn_scaled.csv', index=False)
print("\nScaled dataset saved to '../data/telco_churn_scaled.csv'.")


Dataset After Scaling (First 5 Rows):
   customerID  gender  SeniorCitizen  Partner  Dependents    tenure  \
0  7590-VHVEG       0              0        1           0 -1.277445   
1  5575-GNVDE       1              0        0           0  0.066327   
2  3668-QPYBK       1              0        0           0 -1.236724   
3  7795-CFOCW       1              0        0           0  0.514251   
4  9237-HQITU       0              0        0           0 -1.236724   

   PhoneService  MultipleLines  InternetService  OnlineSecurity  ...  \
0             0              1                0               0  ...   
1             1              0                0               2  ...   
2             1              0                0               2  ...   
3             0              1                0               2  ...   
4             1              0                1               0  ...   

   DeviceProtection  TechSupport  StreamingTV  StreamingMovies  Contract  \
0                 0      