### Import Required Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib


### Load Dataset

In [3]:
# Load the Telco Customer Churn dataset
df = pd.read_csv("Telco-Customer-Churn.csv")

# Preview the data
df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Data Cleaning

In [5]:
# Drop customerID as it's not useful for prediction
df.drop('customerID', axis=1, inplace=True)

# Convert TotalCharges to numeric (some non-numeric values may exist)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Handle missing values
df.dropna(inplace=True)

# Encode target variable
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})


### Feature and Target Split

In [7]:
X = df.drop('Churn', axis=1)
y = df['Churn']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Define Preprocessing Pipelines

In [9]:
# Separate columns by type
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_features = [col for col in X.columns if col not in numeric_features]

# Create column transformers
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)


### Create Pipelines for Logistic Regression and Random Forest

In [11]:
# Logistic Regression Pipeline
logreg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Random Forest Pipeline
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])


In [13]:
logreg_param_grid = {
    'classifier__C': [0.1, 1.0, 10.0]
}

rf_param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20]
}


In [15]:
grid_logreg = GridSearchCV(logreg_pipeline, logreg_param_grid, cv=5, scoring='accuracy')
grid_logreg.fit(X_train, y_train)

print("Best Parameters (Logistic Regression):", grid_logreg.best_params_)


Best Parameters (Logistic Regression): {'classifier__C': 10.0}


In [17]:
grid_rf = GridSearchCV(rf_pipeline, rf_param_grid, cv=5, scoring='accuracy')
grid_rf.fit(X_train, y_train)

print("Best Parameters (Random Forest):", grid_rf.best_params_)


Best Parameters (Random Forest): {'classifier__max_depth': 10, 'classifier__n_estimators': 200}


In [19]:
# Choose the better model based on validation score
best_model = grid_rf if grid_rf.best_score_ > grid_logreg.best_score_ else grid_logreg

# Evaluate on test set
y_pred = best_model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.89      0.86      1033
           1       0.62      0.52      0.57       374

    accuracy                           0.79      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407



In [21]:
# Save the final pipeline
joblib.dump(best_model.best_estimator_, "best_churn_pipeline.joblib")
print("Pipeline saved as 'best_churn_pipeline.joblib'")


Pipeline saved as 'best_churn_pipeline.joblib'
