# Task 2 — End-to-End ML Pipeline for Telco Churn

**Objective:** Build a production-ready sklearn Pipeline for customer churn prediction (Telco dataset).

## 1) Install & Imports

In [None]:
# !pip install -q scikit-learn pandas matplotlib joblib
import os, json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib


## 2) Load Dataset (place `telco_churn.csv` in the same folder as this notebook)

In [None]:
# Read the raw churn table; the relative path is resolved against the
# kernel's working directory.
DATA_PATH = 'telco_churn.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first rows as the cell output.
df.head()


## 3) Preprocessing & Pipeline

In [None]:
# --- Clean -------------------------------------------------------------------
# Parse TotalCharges as numeric (unparseable entries become NaN) and fill the
# gaps with the column median. Skipped entirely when the column is absent, so
# no all-NaN placeholder column is injected into the feature matrix.
if 'TotalCharges' in df.columns:
    total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
    df['TotalCharges'] = total_charges.fillna(total_charges.median())

# --- Target ------------------------------------------------------------------
# Encode yes/no (case- and whitespace-insensitive) as 1/0. Any other label
# fails here with a clear message instead of surfacing later as NaN inside
# train_test_split / fit.
target = 'Churn'
y = df[target].astype(str).str.strip().str.lower().map({'yes': 1, 'no': 0})
if y.isna().any():
    bad_labels = sorted(df.loc[y.isna(), target].astype(str).unique())
    raise ValueError(f"Unexpected values in '{target}' (expected yes/no): {bad_labels}")
y = y.astype(int)

# --- Features ----------------------------------------------------------------
# NOTE(review): 'customerID' is assumed to be a per-row identifier (as in the
# public Telco churn CSV) -- confirm for your file. Left in, it would be
# one-hot encoded into one dummy column per customer.
id_cols = [col for col in ('customerID',) if col in df.columns]
X = df.drop(columns=[target, *id_cols])

# Split columns into numeric vs. everything else so that no column is silently
# dropped by ColumnTransformer's default remainder='drop'.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(exclude='number').columns.tolist()

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
# keyword was removed in 1.4; fall back only for older installations.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', encoder, cat_cols),
])

# --- Baseline model ----------------------------------------------------------
pipe = Pipeline([('prep', preprocessor), ('clf', LogisticRegression(max_iter=400))])

# Stratified hold-out split keeps the churn rate equal in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred))
print(classification_report(y_test, pred))


## 4) Hyperparameter Tuning (GridSearchCV)

In [None]:
# Tune a RandomForest on top of the same preprocessing step.
# GridSearchCV is already imported in the imports cell, so it is not
# re-imported here.
model_rf = Pipeline([('prep', preprocessor), ('clf', RandomForestClassifier(random_state=42))])

# Small grid: fixed forest size, unlimited vs. shallow trees.
param_grid = {'clf__n_estimators': [100], 'clf__max_depth': [None, 8]}

# 3-fold CV scored on F1, using all available cores.
gs = GridSearchCV(model_rf, param_grid, cv=3, scoring='f1', n_jobs=-1)
gs.fit(X_train, y_train)
print('Best CV score:', gs.best_score_)
print('Best params:', gs.best_params_)

# Evaluate the refitted best pipeline on the held-out test set.
best_model = gs.best_estimator_
pred_best = best_model.predict(X_test)
print('Accuracy (best):', accuracy_score(y_test, pred_best))
print(classification_report(y_test, pred_best))

# Persist the full pipeline (preprocessing + classifier) for later reuse.
joblib.dump(best_model, 'best_pipeline.joblib')


## 5) Visualizations & Summary

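- **TODO:** Visualize feature importances from the tuned RandomForest (available via the fitted classifier's `feature_importances_`).
- The tuned pipeline is saved for deployment with `joblib` as `best_pipeline.joblib` (see section 4); reload it with `joblib.load('best_pipeline.joblib')`.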
