In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw customer table from the Excel export.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists. Consider a configurable data directory.
file_path = r"D:\Download\Customer_data.xlsx"
df = pd.read_excel(file_path)

# Coerce 'TotalCharges' to float. Cells that cannot be parsed as numbers
# (for example blank or whitespace-only strings) become NaN here instead of
# making astype(float) raise, so the dropna below removes them together with
# the values that were already missing.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").astype(float)
df = df.dropna(subset=["TotalCharges"])

# Columns holding plain Yes/No answers, encoded as 1/0.
binary_yes_no_cols = [
    "Partner", "Dependents", "PhoneService", "PaperlessBilling", "Churn"
]
# Service columns that may also carry a "No internet service" /
# "No phone service" label; both of those are encoded as 0, same as "No".
internet_related_cols = [
    "OnlineSecurity", "OnlineBackup", "DeviceProtection",
    "TechSupport", "StreamingTV", "StreamingMovies", "MultipleLines"
]

yes_no_codes = {"Yes": 1, "No": 0}
service_codes = {
    "Yes": 1,
    "No": 0,
    "No internet service": 0,
    "No phone service": 0,
}
gender_codes = {"Female": 1, "Male": 0}


def _encode_labels(column, codes):
    """Translate the labels in ``column`` through the ``codes`` mapping.

    Missing values stay missing. A non-missing label that has no entry in
    ``codes`` raises ValueError; a plain ``Series.map`` would silently turn it
    into NaN instead.
    """
    encoded = column.map(codes)
    # Rows that had a value before mapping but lost it: label not in `codes`.
    unmapped = column[column.notna() & encoded.isna()]
    if not unmapped.empty:
        raise ValueError(
            f"Unexpected values in column {column.name!r}: "
            f"{sorted(unmapped.astype(str).unique())}"
        )
    return encoded


for col in binary_yes_no_cols:
    df[col] = _encode_labels(df[col], yes_no_codes)

for col in internet_related_cols:
    df[col] = _encode_labels(df[col], service_codes)

# Convert gender to binary (Female -> 1, Male -> 0)
df["gender"] = _encode_labels(df["gender"], gender_codes)

# Expand the multi-class categorical columns into indicator columns.
# drop_first=True omits the first level of each column.
multi_class_cols = ["InternetService", "Contract", "PaymentMethod"]
df = pd.get_dummies(df, columns=multi_class_cols, drop_first=True)

# Remove the customerID column so it is not part of the feature matrix.
df = df.drop(columns=["customerID"])

# Target vector and feature matrix
y = df["Churn"]
X = df.drop(columns=["Churn"])

# Hold out 20% of the rows for testing; stratify on the target and fix the
# seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Report completion and the dimensions of each split.
print("✅ Preprocessing complete.")
for label, split in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(label, split.shape)


✅ Preprocessing complete.
X_train shape: (5625, 23)
X_test shape: (1407, 23)
y_train shape: (5625,)
y_test shape: (1407,)


In [6]:
# LogisticRegression is imported in this cell so it runs on a fresh kernel.
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split, so test-set statistics never
# influence the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit logistic regression on the scaled training data and predict labels for
# the held-out test split.
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train_scaled, y_train)
y_pred = logreg.predict(X_test_scaled)


In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Score the test-set predictions with each headline metric, in the same
# order as the names they are unpacked into.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Summary block; the trailing newline on the last entry leaves a blank line
# before the confusion matrix.
summary_lines = [
    "✅ Evaluation Metrics:",
    f"Accuracy  : {accuracy:.4f}",
    f"Precision : {precision:.4f}",
    f"Recall    : {recall:.4f}",
    f"F1 Score  : {f1:.4f}\n",
]
print("\n".join(summary_lines))

# Heading and body are printed on separate lines via sep="\n".
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")


✅ Evaluation Metrics:
Accuracy  : 0.8038
Precision : 0.6476
Recall    : 0.5749
F1 Score  : 0.6091

Confusion Matrix:
[[916 117]
 [159 215]]

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1033
           1       0.65      0.57      0.61       374

    accuracy                           0.80      1407
   macro avg       0.75      0.73      0.74      1407
weighted avg       0.80      0.80      0.80      1407

