<a href="https://colab.research.google.com/github/marcorrea1/AAI2026/blob/main/Coding_Exercise_ML_Basics_Part_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import os
import pandas as pd
from google.colab import drive

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Mount Google Drive only when it is not already visible in this runtime,
# so re-running the cell does not trigger another mount.
drive_is_mounted = os.path.exists("/content/drive/MyDrive")
if not drive_is_mounted:
    drive.mount("/content/drive")

# Read the customer-churn training CSV from the mounted Drive folder.
csv_path = "/content/drive/MyDrive/Coding Exercise - ML Basics/customer_churn_dataset-training-master.csv"
data = pd.read_csv(filepath_or_buffer=csv_path)

# Label to predict and the input columns the model is given.
target_col = "Churn"
feature_cols = [
    "Age",
    "Gender",
    "Tenure",
    "Usage Frequency",
    "Support Calls",
    "Payment Delay",
    "Subscription Type",
    "Contract Length",
]

# Rows without a label cannot be used for training or evaluation.
has_target = data[target_col].notna()
data = data.loc[has_target].copy()

# Independent copies so later edits to X / y never touch `data`.
X = data.loc[:, feature_cols].copy()
y = data.loc[:, target_col].copy()

# Hold out 20% of the rows for a simple performance check. Stratifying on
# the label keeps the class proportions the same in both splits, and the
# fixed seed makes the split reproducible.
split_options = {"test_size": 0.20, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Columns grouped by the kind of preprocessing they need.
numeric_features = ["Age", "Tenure", "Usage Frequency", "Support Calls", "Payment Delay"]
categorical_features = ["Gender", "Subscription Type", "Contract Length"]

# Numeric columns: fill gaps with the column mean, then standardize.
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fitting are ignored instead of raising.
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own preprocessing pipeline.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)

# Full model: preprocessing followed by logistic regression. Keeping both in
# one Pipeline means the same transformations are applied at fit and at
# predict time.
classifier = LogisticRegression(max_iter=1000)
model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("classifier", classifier),
])

# Fit preprocessing and classifier together on the training split.
model.fit(X_train, y_train)

# Accuracy on the held-out test split: predict first, then compare the
# predictions against the true labels.
y_test_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)

# Example new customer: a single-row frame with the same feature columns the
# model was trained on.
new_customer = pd.DataFrame({
    "Age": [30.0],
    "Gender": ["Female"],
    "Tenure": [39.0],
    "Usage Frequency": [14.0],
    "Support Calls": [5.0],
    "Payment Delay": [18.0],
    "Subscription Type": ["Standard"],
    "Contract Length": ["Annual"]
})

# Prediction
# Label value that means "churn" and the probability cut-off used to turn
# the probability into a 0/1 decision.
CHURN_LABEL = 1
CHURN_THRESHOLD = 0.50

# predict_proba returns one column per class, ordered like model.classes_.
# Look the churn column up by label instead of assuming it sits at index 1,
# so a different label ordering (or a missing churn label, which raises
# ValueError here) cannot silently report the wrong class's probability.
churn_column = list(model.classes_).index(CHURN_LABEL)
churn_prob = model.predict_proba(new_customer)[0][churn_column]
churn_pred = int(churn_prob >= CHURN_THRESHOLD)


# Descriptive summary of the dataset (all rows that have a target value),
# used only for the printed report below, not for modelling.
# Mean of every numeric feature, indexed by column name.
avg_numeric = data.loc[:, numeric_features].mean()

# Most frequent value of each categorical feature. The names keep the
# "avg_" prefix for consistency with the report code, but these are modes.
avg_gender, avg_subscription, avg_contract = (
    data[column].mode().iloc[0]
    for column in ("Gender", "Subscription Type", "Contract Length")
)

# -----------------------
#  OUTPUT
# -----------------------
# Assemble the whole report first and print it once. Entries that end in
# "\n" produce the blank separator line between report sections.
report_lines = [
    f"Model accuracy: {accuracy:.3f}\n",
    "Average customer values in the dataset:",
    f"Age: {avg_numeric['Age']:.2f}",
    f"Tenure: {avg_numeric['Tenure']:.2f}",
    f"Usage Frequency: {avg_numeric['Usage Frequency']:.2f}",
    f"Support Calls: {avg_numeric['Support Calls']:.2f}",
    f"Payment Delay: {avg_numeric['Payment Delay']:.2f}",
    f"Most common Gender: {avg_gender}",
    f"Most common Subscription Type: {avg_subscription}",
    f"Most common Contract Length: {avg_contract}\n",
    "New customer churn prediction:",
    f"Churn probability: {churn_prob:.4f}",
    f"Churn prediction (1 = churn, 0 = no churn): {churn_pred}",
]
print("\n".join(report_lines))


Model accuracy: 0.853

Average customer values in the dataset:
Age: 39.37
Tenure: 31.26
Usage Frequency: 15.81
Support Calls: 3.60
Payment Delay: 12.97
Most common Gender: Male
Most common Subscription Type: Standard
Most common Contract Length: Annual

New customer churn prediction:
Churn probability: 0.8682
Churn prediction (1 = churn, 0 = no churn): 1



# Interpretation: Churn Probability and Business Use


Churn probability is the model's estimate of how likely a customer is to stop using the service. Values range from 0 to 1, and a higher value indicates a greater likelihood of churn.


# Interpretation of Results

When I created an example customer, the model returned a churn probability of about 0.87 (87%). Because this value is above the 0.5 decision threshold, the customer is classified as likely to churn. Businesses can use output like this to identify high-risk customers and take action to retain them before they leave.


Link to Data: https://www.kaggle.com/datasets/muhammadshahidazeem/customer-churn-dataset?select=customer_churn_dataset-training-master.csv