In [1]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Load the cleaned Telco dataset and split it into target (`Churn`) and features.
df = pd.read_csv("../data/clean_telco.csv")

y = df['Churn']
X = df.drop('Churn', axis=1)

# Partition the feature columns by dtype.
# 'number' matches every numeric dtype rather than only int64/float64, so a
# numeric column stored as e.g. int32 or float32 is not left out of both lists.
# This matters because ColumnTransformer drops any column that is not assigned
# to a transformer (its default is remainder='drop').
categorical = X.select_dtypes(include='object').columns
numeric = X.select_dtypes(include='number').columns

# One-hot encode the object columns and pass numeric columns through unchanged.
# handle_unknown='ignore' keeps predict() from failing on categories that were
# not present when the encoder was fitted.
preprocess = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ('num', 'passthrough', numeric)
])


In [3]:
# Single seed for the split and for every stochastic estimator, so that a
# Restart & Run All reproduces the same accuracies and the same winner.
SEED = 42

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Candidate classifiers, keyed by the display name used in the printed report.
# Both are seeded: the random forest is randomized, and the liblinear solver
# shuffles the data, so without random_state repeated runs can differ.
models = {
    "Logistic Regression": LogisticRegression(solver='liblinear', random_state=SEED),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=SEED)
}

# Maps model name -> accuracy on the held-out test set.
results = {}

for name, model in models.items():
    # Preprocessing lives inside the pipeline, so the encoder is fitted on the
    # training rows only and then applied as-is to the test rows.
    pipe = Pipeline([
        ('prep', preprocess),
        ('model', model)
    ])

    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)

    acc = accuracy_score(y_test, preds)
    results[name] = acc
    print(f"{name} Accuracy: {acc:.4f}")


Logistic Regression Accuracy: 0.8211
Random Forest Accuracy: 0.7970


In [4]:
# Destination of the serialized pipeline; defined once so the dump call and the
# confirmation message cannot drift apart.
MODEL_PATH = "../models/churn_model.pkl"

# Pick the candidate with the highest test-set accuracy.
best_model_name = max(results, key=results.get)
print("Best model:", best_model_name)

# Rebuild the preprocessing + estimator pipeline for the winner and refit it on
# the training split, so the saved artifact accepts raw feature frames.
best_pipe = Pipeline([
    ('prep', preprocess),
    ('model', models[best_model_name])
])

best_pipe.fit(X_train, y_train)

# joblib.dump does not create missing directories; make sure the target folder
# exists so the cell also works on a fresh checkout.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(best_pipe, MODEL_PATH)
print(f"Model saved to {MODEL_PATH}")


Best model: Logistic Regression
Model saved to ../models/churn_model.pkl
