# ML Zoomcamp 2025 — Week 03: Classification Homework

This notebook solves the homework in `cohorts/2025/03-classification/homework.md`.

Dataset: Bank Marketing (course lead scoring)\n
- URL: https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv


In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

pd.set_option('display.max_columns', 100)


In [12]:
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
DATA_FILE = 'course_lead_scoring.csv'

if os.path.exists(DATA_FILE):
    df = pd.read_csv(DATA_FILE)
else:
    try:
        df = pd.read_csv(DATA_URL)
    except Exception as e:
        raise RuntimeError(f'Failed to load dataset from URL. If offline, download {DATA_URL} to {DATA_FILE}. Error: {e}')

print(df.shape)
df.head(3)


(1462, 9)


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1


In [22]:
# Data preparation
target = 'converted'
assert target in df.columns, f'Missing target column {target}'

# Identify feature columns
feature_cols = [c for c in df.columns if c != target]

# Split features by type
numeric_cols = df[feature_cols].select_dtypes(include=['number']).columns.tolist()
categorical_cols = df[feature_cols].select_dtypes(include=['object','category']).columns.tolist()

# Fill missing values per instructions
if categorical_cols:
    df[categorical_cols] = df[categorical_cols].fillna('NA')
if numeric_cols:
    df[numeric_cols] = df[numeric_cols].fillna(0.0)

# Quick NA check (features only)
na_counts = df[feature_cols].isna().sum().sort_values(ascending=False)
print('Missing values per feature (after filling):')
print(na_counts.head(10))


Missing values per feature (after filling):
lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
dtype: int64


In [31]:
# Question 1: Mode of industry
industry_mode = df['industry'].mode(dropna=False)[0] if 'industry' in df.columns else None
print('Q1 — Mode of industry:', industry_mode)


Q1 — Mode of industry: retail


In [39]:
# Question 2: Correlation matrix (numerical) and specified pairs
corr = df[numeric_cols].corr(numeric_only=True) if numeric_cols else pd.DataFrame()
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count'),
]

def corr_for(a, b):
    if a in corr.columns and b in corr.columns:
        return corr.loc[a, b]
    return np.nan

corr_results = {}
for a, b in pairs:
    corr_ab = corr_for(a, b)
    corr_results[(a, b)] = corr_ab

best_pair = max(corr_results.items(), key=lambda kv: (0 if pd.isna(kv[1]) else abs(kv[1])))[0]
print('Q2 — Pair with highest correlation among given:', best_pair)
print('Correlations:', {f'{a} & {b}': None if pd.isna(v) else float(v) for (a,b), v in corr_results.items()})


Q2 — Pair with highest correlation among given: ('annual_income', 'interaction_count')
Correlations: {'interaction_count & lead_score': 0.009888182496913105, 'number_of_courses_viewed & lead_score': -0.004878998354681265, 'number_of_courses_viewed & interaction_count': -0.023565222882888055, 'annual_income & interaction_count': 0.027036472404814396}


In [46]:
# Split the data: 60/20/20 (train/val/test)
X = df[feature_cols].copy()
y_raw = df[target]
try:
    y = y_raw.astype(int).values
except Exception:
    y_num = pd.to_numeric(y_raw, errors='coerce')
    if y_num.isna().any():
        y = y_raw.astype(str).str.lower().map({'yes':1, 'no':0, 'true':1, 'false':0, '1':1, '0':0}).astype(int).values
    else:
        y = y_num.astype(int).values

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print('Shapes:', X_train.shape, X_val.shape, X_test.shape)


Shapes: (877, 8) (292, 8) (293, 8)


In [52]:
# Question 3: Mutual information (train, categorical features)
cat_cols_train = [c for c in categorical_cols if c in X_train.columns]
mi_scores = {}
for col in cat_cols_train:
    x, _ = pd.factorize(X_train[col])
    x = x.reshape(-1, 1)
    mi = mutual_info_classif(x, y_train, discrete_features=True, random_state=42)[0]
    mi_scores[col] = mi

mi_scores_rounded = {k: round(float(v), 2) for k, v in mi_scores.items()}
options_q3 = ['industry', 'location', 'lead_source', 'employment_status']
present_q3 = [c for c in options_q3 if c in mi_scores]
best_q3 = max(present_q3, key=lambda c: mi_scores[c]) if present_q3 else None

print('Q3 — MI scores (train, categorical, rounded to 2):')
print({k: mi_scores_rounded.get(k, None) for k in options_q3})
print('Q3 — Variable with biggest MI among options:', best_q3)


Q3 — MI scores (train, categorical, rounded to 2):
{'industry': 0.02, 'location': 0.0, 'lead_source': 0.03, 'employment_status': 0.02}
Q3 — Variable with biggest MI among options: lead_source


In [57]:
# Question 4: Logistic Regression with one-hot encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', [c for c in numeric_cols if c in X_train.columns]),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), [c for c in categorical_cols if c in X_train.columns]),
    ],
    remainder='drop'
)

lr = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
clf = Pipeline(steps=[('preprocess', preprocessor), ('lr', lr)])
clf.fit(X_train, y_train)
val_pred = clf.predict(X_val)
acc_val = accuracy_score(y_val, val_pred)
print('Q4 — Validation accuracy (rounded to 2):', round(acc_val, 2))


Q4 — Validation accuracy (rounded to 2): 0.74


In [61]:
# Question 5: Feature elimination (difference = baseline - acc_without_feature)
features_to_test = ['industry', 'employment_status', 'lead_score']

def train_val_acc_dropping(feature_to_drop=None):
    num_cols = [c for c in numeric_cols if c in X_train.columns and c != feature_to_drop]
    cat_cols = [c for c in categorical_cols if c in X_train.columns and c != feature_to_drop]
    pre = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', num_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
        ],
        remainder='drop'
    )
    model = Pipeline(steps=[('preprocess', pre), ('lr', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))])
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    return accuracy_score(y_val, preds)

baseline_acc = acc_val
drop_results = {}
for f in features_to_test:
    acc_wo = train_val_acc_dropping(f)
    diff = baseline_acc - acc_wo
    drop_results[f] = {'acc_wo': acc_wo, 'diff': diff}

smallest_diff_feature = min(drop_results.items(), key=lambda kv: kv[1]['diff'])[0]
print('Q5 — Baseline val acc:', baseline_acc)
print('Q5 — Acc without feature and differences:', {k: {'acc_wo': round(v['acc_wo'], 4), 'diff': round(v['diff'], 4)} for k, v in drop_results.items()})
print('Q5 — Feature with smallest difference:', smallest_diff_feature)


Q5 — Baseline val acc: 0.7431506849315068
Q5 — Acc without feature and differences: {'industry': {'acc_wo': 0.7432, 'diff': 0.0}, 'employment_status': {'acc_wo': 0.7466, 'diff': -0.0034}, 'lead_score': {'acc_wo': 0.7432, 'diff': 0.0}}
Q5 — Feature with smallest difference: employment_status


In [64]:
# Question 6: Regularized logistic regression sweep over C
C_values = [0.01, 0.1, 1, 10, 100]
val_scores = {}
for c in C_values:
    lr_c = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    pipe_c = Pipeline(steps=[('preprocess', preprocessor), ('lr', lr_c)])
    pipe_c.fit(X_train, y_train)
    pred_c = pipe_c.predict(X_val)
    acc_c = accuracy_score(y_val, pred_c)
    val_scores[c] = acc_c

val_scores_rounded = {c: round(float(acc), 3) for c, acc in val_scores.items()}
best_acc = max(val_scores.values())
best_candidates = [c for c, acc in val_scores.items() if acc == best_acc]
best_c = min(best_candidates)
print('Q6 — Validation accuracies (rounded to 3):', val_scores_rounded)
print('Q6 — Best C:', best_c)


Q6 — Validation accuracies (rounded to 3): {0.01: 0.743, 0.1: 0.743, 1: 0.743, 10: 0.743, 100: 0.743}
Q6 — Best C: 0.01


In [66]:
# Summary of answers
answers = {
    'Q1_mode_industry': industry_mode,
    'Q2_best_pair': best_pair,
    'Q3_best_mi_var': best_q3,
    'Q4_val_accuracy_2dp': round(acc_val, 2),
    'Q5_smallest_diff_feature': smallest_diff_feature,
    'Q6_best_C': best_c,
}
answers


{'Q1_mode_industry': 'retail',
 'Q2_best_pair': ('annual_income', 'interaction_count'),
 'Q3_best_mi_var': 'lead_source',
 'Q4_val_accuracy_2dp': 0.74,
 'Q5_smallest_diff_feature': 'employment_status',
 'Q6_best_C': 0.01}