In [3]:
import pandas as pd

# Load the raw churn table; every column except "Churn" is used as a feature.
data = pd.read_csv("churn.csv")
X = data.drop(columns="Churn")

# Encode the target as 1 ("Yes") / 0 ("No").
# Series.map turns every value that is not a key of the mapping into NaN
# (missing labels, stray whitespace, different casing, ...). Fail here with the
# offending labels instead of letting a NaN target surface later during fitting.
y = data["Churn"].map({"Yes": 1, "No": 0})
unmapped_labels = data.loc[y.isna(), "Churn"].unique().tolist()
if unmapped_labels:
    raise ValueError(
        f"Unexpected values in 'Churn' column (expected 'Yes'/'No'): {unmapped_labels}"
    )

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Split the feature columns by dtype so each group gets its own preprocessing.
# NOTE(review): a column whose dtype is neither int64/float64 nor object
# (e.g. bool or category) lands in neither list; ColumnTransformer's default
# remainder="drop" would then leave it out of the model -- confirm that the
# input table has no such columns.
numerical = list(X.select_dtypes(include=["int64", "float64"]).columns)
categorical = list(X.select_dtypes(include=["object"]).columns)

# Numeric features: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" keeps transform() from failing
# on categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its matching transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical),
        ("cat", categorical_transformer, categorical),
    ]
)


In [5]:
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the random forest (and any hyper-parameter search over it)
# produces the same result on every Restart & Run All.
RANDOM_STATE = 42

# A Pipeline keeps the step objects it is given and fits them in place, so
# handing the same `preprocessor` instance to both pipelines would make them
# share one transformer: fitting either pipeline would refit the preprocessing
# used by the other. Give each pipeline its own unfitted copy instead.

# Baseline linear model; max_iter is raised to 1000 for the solver.
logreg_model = Pipeline([
    ("preprocess", clone(preprocessor)),
    ("classifier", LogisticRegression(max_iter=1000))
])

# Tree ensemble; the step name "classifier" is what the "classifier__*"
# hyper-parameter keys refer to.
rf_model = Pipeline([
    ("preprocess", clone(preprocessor)),
    ("classifier", RandomForestClassifier(random_state=RANDOM_STATE))
])


In [6]:
from sklearn.model_selection import GridSearchCV

# Candidate random-forest settings. The "classifier__" prefix routes each
# parameter to the pipeline step named "classifier".
param_grid = {
    "classifier__n_estimators": [50, 100],
    "classifier__max_depth": [5, 10],
}

# 5-fold cross-validated search over the grid, ranked by accuracy.
# NOTE(review): the search is fitted on all of X / y and no hold-out set is
# visible in these cells, so the cross-validation score is the only
# performance estimate for the selected model -- confirm this is intended.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    verbose=1,
)
grid_search.fit(X, y)
print("Best model:", grid_search.best_params_)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best model: {'classifier__max_depth': 5, 'classifier__n_estimators': 50}


In [7]:
import joblib

# Persist the best pipeline found by the search (preprocessing + classifier in
# one object) so raw feature rows can be scored after reloading it.
# The call stays the last expression so the cell displays the written path(s).
joblib.dump(
    value=grid_search.best_estimator_,
    filename="churn_pipeline.pkl",
)


['churn_pipeline.pkl']