In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib


In [2]:
# Location of the raw Telco customer-churn CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
DATA_PATH = r"C:\Users\R Y Z E N\Desktop\5a\WA_Fn-UseC_-Telco-Customer-Churn (1).csv"
df = pd.read_csv(DATA_PATH)


In [3]:
# 2. BASIC CLEANING
# This cell is written to be idempotent: running it a second time on the
# already-cleaned frame leaves `df` unchanged instead of raising or
# corrupting the target column.

# Remove customerID column (not useful for prediction).
# errors="ignore" makes a re-run a no-op once the column is already gone.
df = df.drop(columns="customerID", errors="ignore")

# Convert 'TotalCharges' to numeric; unparseable entries (e.g. blank strings)
# become NaN and are filled below.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Fill missing values with 0 in numeric columns only. Filling the whole frame
# would put the integer 0 into text columns, producing mixed-type categories.
numeric_cols = df.select_dtypes(include="number").columns
df[numeric_cols] = df[numeric_cols].fillna(0)

# Encode target variable: "Yes"=1, "No"=0.
# Only map while the column is still textual; mapping an already-encoded
# column would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df["Churn"]):
    df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})

In [4]:
# 3. SPLIT FEATURES & TARGET
# ---------------------------
# The "Churn" column is the target; every remaining column is a feature.
y = df["Churn"]
X = df.drop(columns=["Churn"])

# Group feature columns by dtype so each group can be sent to its own
# preprocessing step.
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns


In [5]:
# 4. PREPROCESSING PIPELINE
# Numeric columns are standardised; categorical columns are one-hot encoded.
numeric_transformer = StandardScaler()
# handle_unknown='ignore': a category not seen during fit is encoded as all
# zeros at transform time instead of raising an error.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# (step name, transformer, columns it applies to)
column_steps = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)


In [6]:
# 5. DEFINE MODELS + PARAM GRIDS
# Hyper-parameter grids. Keys use the "classifier__" prefix so they address
# the pipeline step named "classifier".
logreg_grid = {"classifier__C": [0.1, 1, 10]}
rf_grid = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [5, 10, None],
}

# Short model name -> (unfitted estimator, parameter grid to search).
models = {
    "logreg": (LogisticRegression(max_iter=1000, solver="liblinear"), logreg_grid),
    "rf": (RandomForestClassifier(random_state=42), rf_grid),
}

In [7]:
# 6. TRAIN + SAVE EACH MODEL
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Every fitted search is kept here, keyed by model name. `grid` is rebound on
# each iteration, so without this only the last model would remain available
# once the loop finishes.
fitted_searches = {}

for name, (model, param_grid) in models.items():
    # Preprocessing lives inside the pipeline so it is re-fitted on the
    # training folds only during cross-validation.
    pipe = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", model)
    ])

    # 3-fold grid search on accuracy, using all available cores.
    grid = GridSearchCV(pipe, param_grid, cv=3, scoring="accuracy", n_jobs=-1)
    grid.fit(X_train, y_train)
    fitted_searches[name] = grid

    # Score and persist each model inside the loop so that none is skipped.
    # grid.predict delegates to the refitted best estimator.
    test_accuracy = accuracy_score(y_test, grid.predict(X_test))
    print(
        f"{name}: best params {grid.best_params_}, "
        f"CV accuracy {grid.best_score_:.4f}, test accuracy {test_accuracy:.4f}"
    )
    joblib.dump(grid.best_estimator_, f"{name}_churn_pipeline.pkl")

# After the loop, `name` and `grid` still refer to the last model in `models`.

In [8]:
# 7. EVALUATION
# `grid` and `name` are the variables left over from the training loop, so
# this cell reports on the model that was fitted last.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

# Compute the metrics first, then print them.
holdout_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print(f"\n🔹 Model: {name}")
print("Best Params:", grid.best_params_)
print("Accuracy:", holdout_accuracy)
print("Classification Report:\n", report_text)




🔹 Model: rf
Best Params: {'classifier__max_depth': 10, 'classifier__n_estimators': 200}
Accuracy: 0.8069552874378992
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.91      0.87      1036
           1       0.67      0.52      0.59       373

    accuracy                           0.81      1409
   macro avg       0.76      0.72      0.73      1409
weighted avg       0.80      0.81      0.80      1409



In [9]:
# 8. SAVE PIPELINE
# Write the fitted pipeline held in `best_model` to disk. The file is named
# after the model and, being a bare filename, lands in the current working
# directory.
filename = f"{name}_churn_pipeline.pkl"
joblib.dump(best_model, filename)

saved_message = f"✅ Saved {name} pipeline as {filename}"
print(saved_message)


✅ Saved rf pipeline as rf_churn_pipeline.pkl
