In [1]:
import pandas as pd
# Load the processed Telco churn CSV; every later cell derives from `df`.
df = pd.read_csv(
    filepath_or_buffer="../data/processed/telco_customer_churn_cleaned.csv"
)


In [2]:
# Columns this notebook treats as target leakage and keeps out of the features.
# NOTE(review): whether each one truly leaks the target is not visible here — confirm.
leakage_cols = ["churn_label", "churn_score", "cltv", "churn_reason", "count"]

# Geographic columns this notebook considers to carry no predictive value.
drop_geo = [
    "country",
    "state",
    "city",
    "zip_code",
    "lat_long",
    "latitude",
    "longitude",
]

# Remove both groups in a single pass; `drop` returns a new frame, so `df`
# itself is left untouched and this cell can be re-run safely.
df_ml = df.drop(columns=[*leakage_cols, *drop_geo])


In [3]:
# Features: every remaining column except the target and `customerid`,
# with non-numeric columns one-hot encoded. `drop_first=True` omits the first
# level of each encoded column.
x = pd.get_dummies(
    df_ml.drop(columns=["churn_value", "customerid"]),
    drop_first=True,
)

# Target: the `churn_value` column.
y = df_ml["churn_value"]



In [4]:
# Persist the encoded features and the target as two separate CSV files.
# The pandas index is not written, so the files line up by row order only.
x.to_csv(path_or_buf="../data/processed/telco_features.csv", index=False)
y.to_csv(path_or_buf="../data/processed/telco_target.csv", index=False)



In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on `y` keeps the target's
# class proportions similar in both partitions, and the fixed seed makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# Show the shapes of the four partitions as a quick sanity check.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))


((5625, 30), (1407, 30), (5625,), (1407,))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

def print_classification_metrics(title, y_true, y_pred):
    """Print a standard set of binary-classification metrics for one model.

    The report consists of the heading, accuracy, precision, recall, F1,
    scikit-learn's text classification report, and the confusion matrix,
    in that order.

    Parameters
    ----------
    title : str
        Heading printed before the metrics (printed verbatim, so a leading
        newline can be used to separate consecutive reports).
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model, aligned with ``y_true``.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print(title)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred))
    print("\nClassification Report:\n", classification_report(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))


#1: Logistic Regression
# max_iter is raised to 2000 (value chosen by the notebook author).
log_reg = LogisticRegression(max_iter=2000)
log_reg.fit(x_train, y_train)

y_pred_lr = log_reg.predict(x_test)

print_classification_metrics("Logistic Regression", y_test, y_pred_lr)


#2: Random Forest
# A fixed random_state keeps the fitted forest reproducible between runs.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(x_train, y_train)

y_pred_rf = rf.predict(x_test)

# The leading newline separates this report from the previous one.
print_classification_metrics("\nRandom Forest", y_test, y_pred_rf)
