In [48]:
import pandas as pd

df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
# print(df.shape)  # how many rows & columns

In [49]:
# print(df.head())  # see first few rows
# print(df.info())  # check data types
# print(df['Churn'].value_counts())  # see churn vs non-churn

## Clean the data

In [50]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)

### Convert categorical variables

columns like gender, Contract, PaymentMethod -> one-hot encode.

In [54]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
categorical_cols.remove('customerID')  # drop ID
categorical_cols.remove('Churn')       # target

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

### Encode the target

In [55]:
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
# df['Churn'][:50]

### Split the features and target

In [59]:
# Identify categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# One-hot encode categorical variables
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [65]:
X = df.drop('Churn', axis=1)
y = df['Churn']

In [66]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### Choose baseline model

In [70]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

model = LogisticRegression(solver='saga', max_iter=4000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:,1]

print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

              precision    recall  f1-score   support

           0       0.84      0.87      0.85      1035
           1       0.60      0.52      0.56       374

    accuracy                           0.78      1409
   macro avg       0.72      0.70      0.71      1409
weighted avg       0.77      0.78      0.78      1409

ROC-AUC: 0.8101810948358261


In [72]:
import joblib

# Save model
joblib.dump(model, 'Telco_Customer_Churn_Prediction/churn_model.pkl')

# Save scaler (if you scaled numeric features)
# joblib.dump(scaler, 'Telco_Customer_Churn_Prediction/scaler.pkl')

['Telco_Customer_Churn_Prediction/churn_model.pkl']

### Include scaling numeric features

In [73]:
from sklearn.preprocessing import StandardScaler

# Identify numeric columns to scale
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Initialize scaler
scaler = StandardScaler()

Fit scaler on training data and transform

In [74]:
# Fit scaler on X_train numeric columns
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])

# Transform X_test using the same scaler
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [76]:
import joblib

# Save scaler to use in Flask API
joblib.dump(scaler, 'Telco_Customer_Churn_Prediction/scaler.pkl')

['scaler.pkl']