<a href="https://colab.research.google.com/github/ondennis03/AAI2026/blob/main/CodingExercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# Part 1
# Data source: ChatGPT prompts (300 rows)
# Data source file name: house_prices_with_location_v2
# location column was randomly generated with categories: Downtown, Rural, Suburb

# Load data (use the correct filename/path where YOU saved it)
df = pd.read_csv("house_prices_with_location_v2.csv")

# Features + target (names match the CSV exactly)
X = df[["square_footage", "location"]]
y = df["price"]

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Encode location (drop first category so location effects are relative to the baseline)
preprocess = ColumnTransformer(
    transformers=[
        ("loc", OneHotEncoder(handle_unknown="ignore", drop="first"), ["location"])
    ],
    remainder="passthrough"  # keeps square_footage
)

# Pipeline: preprocessing + linear regression
model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("regressor", LinearRegression())
])

# Train
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("=== Part 1: House Price Prediction ===")
print(f"Test MAE: ${mae:,.0f}")
print(f"Test R^2: {r2:.3f}")

# Predict required example
new_house = pd.DataFrame([{"square_footage": 2000, "location": "Downtown"}])
pred_price = model.predict(new_house)[0]
print(f"\nPredicted price for 2000 sq ft in Downtown: ${pred_price:,.0f}")

# Print coefficients (impact of each feature)
ohe = model.named_steps["preprocess"].named_transformers_["loc"]
location_feature_names = ohe.get_feature_names_out(["location"])

# Because remainder="passthrough", square_footage is appended after encoded columns
feature_names = list(location_feature_names) + ["square_footage"]

coeffs = model.named_steps["regressor"].coef_
intercept = model.named_steps["regressor"].intercept_

print("\nIntercept:", f"${intercept:,.2f}")
print("\nCoefficients:")
for name, c in zip(feature_names, coeffs):
    print(f"  {name}: {c:,.2f}")

# Quick interpretation helper (baseline is the dropped category)
baseline_location = ohe.categories_[0][0]  # the first category dropped by OneHotEncoder
print(f"\nNOTE: Location coefficients are relative to baseline location = '{baseline_location}'.")


=== Part 1: House Price Prediction ===
Test MAE: $32,165
Test R^2: 0.858

Predicted price for 2000 sq ft in Downtown: $633,655

Intercept: $206,521.32

Coefficients:
  location_Rural: -165,505.19
  location_Suburb: -82,482.15
  square_footage: 213.57

NOTE: Location coefficients are relative to baseline location = 'Downtown'.


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix

# Part 2
# Data source: ChatGPT prompts (500 rows)
# Data source file name: customer_churn_dataset
# Columns: age, monthly_usage, purchase_amount, customer_service_calls, region, churn

# 1) Load dataset
df = pd.read_csv("customer_churn_dataset.csv")

# 2) Ensure required columns exist
required_cols = ["age", "monthly_usage", "purchase_amount", "customer_service_calls", "region", "churn"]
missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# Drop rows with missing required values (basic cleaning step)
df = df.dropna(subset=required_cols)

# 3) Features / target
X = df[["age", "monthly_usage", "purchase_amount", "customer_service_calls", "region"]]
y = df["churn"]

# 4) Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5) Preprocessing (CHECKLIST ITEMS)
numeric_features = ["age", "monthly_usage", "purchase_amount", "customer_service_calls"]
categorical_features = ["region"]

preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),                       # ✅ StandardScaler
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)  # ✅ OneHotEncoder
    ]
)

# 6) Model (CHECKLIST ITEM)
model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("classifier", LogisticRegression(max_iter=1000))
])

# Train logistic regression model ✅
model.fit(X_train, y_train)

# 7) Evaluate (not required by your checklist, but good practice)
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]  # churn probability ✅

print("=== Part 2: Customer Churn Prediction ===")
print(f"Accuracy:  {accuracy_score(y_test, y_pred):.3f}")
print(f"Precision: {precision_score(y_test, y_pred, zero_division=0):.3f}")
print(f"Recall:    {recall_score(y_test, y_pred, zero_division=0):.3f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba):.3f}")
print("\nConfusion Matrix [ [TN FP] [FN TP] ]:")
print(confusion_matrix(y_test, y_pred))

# 8) Predict churn probability for a new customer + classify using 0.5 threshold ✅
new_customer = pd.DataFrame([{
    "age": 28,
    "monthly_usage": 12,
    "purchase_amount": 70,
    "customer_service_calls": 3,
    "region": "East"
}])

new_proba = model.predict_proba(new_customer)[:, 1][0]  # ✅ churn probability output
threshold = 0.5
new_class = int(new_proba >= threshold)                  # ✅ 0.5 threshold classification

print("\nNew customer churn probability:", f"{new_proba:.3f}")
print(f"At-risk (threshold {threshold})? churn =", new_class)

# 9) Interpretation (CHECKLIST ITEMS)
print("\nInterpretation:")
print("- The churn probability is the model’s estimate of the chance that this customer will churn.")
print("  Example: 0.70 means about a 70% chance the customer will leave.")
print("- Businesses can use this to reduce churn by targeting high-risk customers (probability >= 0.5)")
print("  with retention actions like discounts, proactive support outreach, plan changes, or loyalty offers.")


=== Part 2: Customer Churn Prediction ===
Accuracy:  0.760
Precision: 0.760
Recall:    0.760
ROC AUC:   0.847

Confusion Matrix [ [TN FP] [FN TP] ]:
[[38 12]
 [12 38]]

New customer churn probability: 0.751
At-risk (threshold 0.5)? churn = 1

Interpretation:
- The churn probability is the model’s estimate of the chance that this customer will churn.
  Example: 0.70 means about a 70% chance the customer will leave.
- Businesses can use this to reduce churn by targeting high-risk customers (probability >= 0.5)
  with retention actions like discounts, proactive support outreach, plan changes, or loyalty offers.


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.cluster import KMeans

# Part 3
# Data source: ChatGPT prompts (500 rows)
# Data source file name: customer_segmentation_dataset
# Columns: annual_spending, purchase_frequency, age, region

# 1) Load dataset
df = pd.read_csv("customer_segmentation_dataset.csv")

# 2) Basic checks / cleaning
required_cols = ["annual_spending", "purchase_frequency", "age", "region"]
missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

df = df.dropna(subset=required_cols)
X = df[required_cols]

# 3) Preprocessing: scale numeric + encode region
numeric_features = ["annual_spending", "purchase_frequency", "age"]
categorical_features = ["region"]

preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),                      # ✅ StandardScaler
        ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), categorical_features)  # ✅ OneHotEncoder
    ]
)

Xt = preprocess.fit_transform(X)

# Convert to dense for speed (small dataset)
Xt = Xt.toarray() if sparse.issparse(Xt) else Xt

# 4) Elbow method (K=1..5) + plot inertia
inertias = []
k_range = range(1, 6)

for k in k_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=5, max_iter=200)
    km.fit(Xt)
    inertias.append(km.inertia_)

plt.figure()
plt.plot(list(k_range), inertias, marker="o")
plt.xlabel("Number of clusters (K)")
plt.ylabel("Inertia")
plt.title("Elbow Method for K-Means (Customer Segmentation)")
plt.grid(True)

elbow_path = "elbow_plot.png"
plt.savefig(elbow_path, dpi=200, bbox_inches="tight")
plt.close()

# 5) Fit final K-Means with K=3 (typical elbow for this synthetic dataset)
K = 3
kmeans_final = KMeans(n_clusters=K, random_state=42, n_init=10, max_iter=300)
clusters = kmeans_final.fit_predict(Xt)

# 6) Save cluster assignments to CSV
df_out = df.copy()
df_out["cluster"] = clusters
out_csv = "customer_segmentation_with_clusters.csv"
df_out.to_csv(out_csv, index=False)

# 7) Cluster analysis (means + region distribution)
cluster_means = df_out.groupby("cluster")[numeric_features].mean().round(2)
cluster_region = pd.crosstab(df_out["cluster"], df_out["region"], normalize="index").round(3)

print("=== Part 3: Customer Segmentation (K-Means) ===")
print("\nCluster Means (numeric):")
print(cluster_means)
print("\nRegion Distribution per Cluster (proportions):")
print(cluster_region)

# 8) Suggested marketing strategies based on cluster patterns
print("\nSuggested Marketing Strategies:")
for c in cluster_means.index:
    spend = cluster_means.loc[c, "annual_spending"]
    freq = cluster_means.loc[c, "purchase_frequency"]

    if spend > 8000 and freq > 25:
        strategy = "High-value loyalists: VIP rewards, early access, exclusive bundles."
    elif spend > 3000 and freq > 12:
        strategy = "Core customers: personalized recommendations, points boosters, cross-sell."
    else:
        strategy = "Budget/low-engagement: discounts, onboarding tips, reactivation campaigns."

    print(f"  Cluster {c}: {strategy}")

print("\nSaved clustered dataset to:", out_csv)
print("Saved elbow plot to:", elbow_path)


=== Part 3: Customer Segmentation (K-Means) ===

Cluster Means (numeric):
         annual_spending  purchase_frequency    age
cluster                                            
0               12025.59               32.94  47.06
1                2869.41               12.89  34.83
2                3632.79               15.34  67.95

Region Distribution per Cluster (proportions):
region    East  North  South   West
cluster                            
0        0.222  0.361  0.222  0.194
1        0.200  0.228  0.302  0.270
2        0.254  0.249  0.282  0.215

Suggested Marketing Strategies:
  Cluster 0: High-value loyalists: VIP rewards, early access, exclusive bundles.
  Cluster 1: Budget/low-engagement: discounts, onboarding tips, reactivation campaigns.
  Cluster 2: Core customers: personalized recommendations, points boosters, cross-sell.

Saved clustered dataset to: customer_segmentation_with_clusters.csv
Saved elbow plot to: elbow_plot.png
