# ML Zoomcamp 2025 — Module 2: Regression — Homework Solutions

This notebook demonstrates how to solve each exercise in the homework using the Car Fuel Efficiency dataset.

Dataset URL: https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

In [1]:
import numpy as np
import pandas as pd
from urllib.error import URLError

# Default seed for the shuffle performed by split_train_val_test.
SEED_DEFAULT = 42
# Remote location of the Car Fuel Efficiency CSV (first source tried by load_dataset).
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
LOCAL_CSV = 'car_fuel_efficiency.csv'  # fallback if file is saved locally in the same folder
# Columns used as model inputs.
FEATURES = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']
# Column the regression model predicts.
TARGET = 'fuel_efficiency_mpg'

def load_dataset(url=DATA_URL, local_csv=LOCAL_CSV):
    """Load the dataset from `url`, falling back to a local CSV copy.

    Parameters
    ----------
    url : str
        Primary location handed to ``pd.read_csv``.
    local_csv : str
        Path of the local copy used when reading `url` fails for any reason.

    Returns
    -------
    pandas.DataFrame
        The freshly parsed frame.

    Raises
    ------
    Exception
        Whatever ``pd.read_csv(local_csv)`` raises if the fallback also fails
        (for example ``FileNotFoundError``).
    """
    try:
        return pd.read_csv(url)
    except Exception as e:
        # Deliberately broad: network errors, HTTP errors and parse errors all
        # lead to the same best-effort fallback. Every failure is reported so a
        # silent switch to a possibly stale local copy cannot go unnoticed.
        print(f'Could not read from URL due to: {e}. Trying local file...')
    # Read outside the except block so a failure here is not reported as a
    # chained "during handling of the above exception" traceback.
    return pd.read_csv(local_csv)

def to_numeric(df, columns):
    """Coerce the listed columns of `df` to numeric dtype, modifying `df` in place.

    Values that cannot be parsed are replaced with NaN (``errors='coerce'``).
    The same frame object that was passed in is returned, so callers that need
    to keep the original untouched should pass a copy.
    """
    for name in columns:
        coerced = pd.to_numeric(df[name], errors='coerce')
        df[name] = coerced
    return df

def rmse(y, y_pred):
    """Root mean squared error between targets `y` and predictions `y_pred`.

    Both arguments may be any array-like (lists, tuples, NumPy arrays, pandas
    Series). They are converted to float arrays first, so plain Python lists
    work and unsigned-integer inputs cannot wrap around on subtraction.
    Inputs are compared positionally; a pandas index, if present, is ignored.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y - y_pred) ** 2))``.
    """
    y_true = np.asarray(y, dtype=float)
    y_hat = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((y_true - y_hat) ** 2))

def split_train_val_test(df, seed=SEED_DEFAULT, frac_train=0.6, frac_val=0.2):
    """Shuffle the rows of `df` and cut them into train / validation / test frames.

    Row positions are shuffled with ``np.random.default_rng(seed)``. The train
    and validation sizes are ``int(len(df) * frac)`` (truncated); the test part
    receives every remaining row. Each returned frame has a fresh 0..k-1 index.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_val, df_test)``.
    """
    total = len(df)
    order = np.arange(total)
    np.random.default_rng(seed).shuffle(order)

    train_end = int(total * frac_train)
    val_end = train_end + int(total * frac_val)
    parts = (
        slice(None, train_end),
        slice(train_end, val_end),
        slice(val_end, None),
    )
    return tuple(df.iloc[order[part]].reset_index(drop=True) for part in parts)

def prepare_X(df, features, impute_strategy=None, impute_value=None):
    """Build the feature matrix for `features`, optionally imputing missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    features : list of str
        Columns to extract, in the order they appear in the result.
    impute_strategy : {None, 'zero', 'mean'}
        ``None`` leaves NaNs untouched, ``'zero'`` replaces them with 0 and
        ``'mean'`` fills them from `impute_value`.
    impute_value : dict, optional
        Column -> fill value mapping; required for the ``'mean'`` strategy.

    Returns
    -------
    numpy.ndarray
        Array with one row per row of `df` and one column per feature.

    Raises
    ------
    ValueError
        If `impute_strategy` is not one of the supported values, or if
        ``'mean'`` is requested without `impute_value`.
    """
    # Reject typos such as 'median' or 'Zero' instead of silently returning
    # a matrix that still contains NaNs.
    if impute_strategy not in (None, 'zero', 'mean'):
        raise ValueError(
            f"unknown impute_strategy {impute_strategy!r}; expected None, 'zero' or 'mean'"
        )

    X = df[features].copy()
    if impute_strategy == 'zero':
        X = X.fillna(0)
    elif impute_strategy == 'mean':
        if impute_value is None:
            raise ValueError('impute_value (dict) required when using mean strategy')
        X = X.fillna(impute_value)
    return X.to_numpy()

def train_linear_regression(X, y, r=0.0):
    """Fit (ridge-regularized) linear regression via the normal equations.

    Solves ``(X_^T X_ + R) w = X_^T y`` where ``X_`` is `X` with a leading
    column of ones and ``R`` is ``r * I`` with the bias entry zeroed, so the
    intercept is never penalized.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Feature matrix without a bias column.
    y : array-like, shape (n_samples,)
        Target values.
    r : float
        Regularization strength; ``0.0`` gives ordinary least squares.

    Returns
    -------
    (w0, w)
        Intercept and the array of feature weights.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularized Gram matrix is singular.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    # Add bias term
    ones = np.ones(X.shape[0])
    X_ = np.column_stack([ones, X])

    XTX = X_.T.dot(X_)
    reg = r * np.eye(XTX.shape[0])
    reg[0, 0] = 0.0  # don't regularize bias

    # Solve the linear system directly instead of forming an explicit inverse:
    # it is more accurate for ill-conditioned Gram matrices and avoids the
    # intermediate (n_features + 1) x n_samples product.
    w_full = np.linalg.solve(XTX + reg, X_.T.dot(y))
    return w_full[0], w_full[1:]

def predict(w0, w, X):
    """Return linear-model predictions: intercept `w0` plus the product of `X` and `w`."""
    weighted_sum = X.dot(w)
    return w0 + weighted_sum

def nearest_option(value, options):
    """Return the element of `options` closest to `value`.

    When several options are equally close, the one that appears first in
    `options` is returned.
    """
    def distance(candidate):
        return abs(value - candidate)

    return min(options, key=distance)


In [2]:
# Load the raw data, keep only the modelling columns, and coerce them to numeric.
# The selection is copied first so `df_raw` itself is never modified.
df_raw = load_dataset()
model_columns = FEATURES + [TARGET]
df = to_numeric(df_raw[model_columns].copy(), model_columns)
df.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.87099,2009,12.488369


## EDA: Target distribution
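Check whether the target `fuel_efficiency_mpg` has a long tail. A skewness close to 0, together with a mean close to the median, indicates a roughly symmetric distribution with no long tail.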

In [3]:
# Summary statistics and skewness of the target column, shown as a pair.
target_values = df[TARGET]
target_values.describe(), target_values.skew()

(count    9704.000000
 mean       14.985243
 std         2.556468
 min         6.200971
 25%        13.267459
 50%        15.006037
 75%        16.707965
 max        25.967222
 Name: fuel_efficiency_mpg, dtype: float64,
 np.float64(-0.012062219273507962))

## Q1. Column with missing values
Identify which feature column contains missing values.

In [4]:
# Count NaNs per feature, largest first; the top entry is the answer if it is non-zero.
na_counts = df[FEATURES].isna().sum().sort_values(ascending=False)
display(na_counts)
if na_counts.iloc[0] > 0:
    missing_feature = na_counts.index[0]
else:
    missing_feature = None
missing_feature

horsepower             708
engine_displacement      0
vehicle_weight           0
model_year               0
dtype: int64

'horsepower'

## Q2. Median horsepower
Compute the 50th percentile (median) of `horsepower`.

In [5]:
# Median ignores the missing horsepower values; show both the raw and the rounded figure.
hp_median = float(df['horsepower'].median())
hp_median_rounded = int(round(hp_median))
hp_median, hp_median_rounded

(149.0, 149)

## Prepare and split data (seed=42)
Shuffle and split into train/val/test with 60%/20%/20%.

In [6]:
# 60/20/20 shuffle-split with the notebook's default seed; show the three sizes.
df_train, df_val, df_test = split_train_val_test(df, seed=SEED_DEFAULT)
tuple(len(part) for part in (df_train, df_val, df_test))

(5822, 1940, 1942)

## Q3. Missing value strategies: 0 vs mean
Train linear regression (no regularization) with two imputation strategies for the missing column from Q1, evaluate on validation, and compare RMSE.

In [7]:
# Targets are shared by both imputation strategies.
y_train = df_train[TARGET].values
y_val = df_val[TARGET].values

# Zero fill
X_train_zero = prepare_X(df_train, FEATURES, impute_strategy='zero')
X_val_zero = prepare_X(df_val, FEATURES, impute_strategy='zero')
w0_zero, w_zero = train_linear_regression(X_train_zero, y_train, r=0.0)
y_pred_zero = predict(w0_zero, w_zero, X_val_zero)
rmse_zero = rmse(y_val, y_pred_zero)

# Mean fill (mean from train only, so no validation information leaks into training)
train_means = df_train[FEATURES].mean().to_dict()
X_train_mean = prepare_X(df_train, FEATURES, impute_strategy='mean', impute_value=train_means)
X_val_mean = prepare_X(df_val, FEATURES, impute_strategy='mean', impute_value=train_means)
w0_mean, w_mean = train_linear_regression(X_train_mean, y_train, r=0.0)
y_pred_mean = predict(w0_mean, w_mean, X_val_mean)
rmse_mean = rmse(y_val, y_pred_mean)

# Compare at the reported precision (2 decimals): on raw floats the
# 'Both are equally good' outcome would be practically unreachable even when
# the two scores print identically.
rmse_zero_2dp = round(float(rmse_zero), 2)
rmse_mean_2dp = round(float(rmse_mean), 2)
print('RMSE zero-fill:', rmse_zero_2dp)
print('RMSE mean-fill:', rmse_mean_2dp)

if rmse_zero_2dp < rmse_mean_2dp:
    better_q3 = 'With 0'
elif rmse_mean_2dp < rmse_zero_2dp:
    better_q3 = 'With mean'
else:
    better_q3 = 'Both are equally good'
better_q3

RMSE zero-fill: 0.52
RMSE mean-fill: 0.47


'With mean'

## Q4. Regularized linear regression (fill NAs with 0)
Try `r` in `[0, 0.01, 0.1, 1, 5, 10, 100]` and pick the one with the best validation RMSE (smallest `r` on ties).

In [8]:
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
Xtr = prepare_X(df_train, FEATURES, impute_strategy='zero')
Xva = prepare_X(df_val, FEATURES, impute_strategy='zero')

# Raw validation RMSE for every regularization strength.
results_q4 = {}
for r in r_values:
    w0, w = train_linear_regression(Xtr, y_train, r=r)
    y_pred = predict(w0, w, Xva)
    results_q4[r] = rmse(y_val, y_pred)

# Scores are reported at 2 decimals, so ties are decided at that precision too;
# otherwise the "smallest r on ties" rule would never apply to scores that
# print identically.
rounded_q4 = {r_val: round(float(score), 2) for r_val, score in results_q4.items()}
for r in r_values:
    print(f'r={r}: RMSE={rounded_q4[r]}')

min_rmse = min(rounded_q4.values())
best_rs = [r_val for r_val, score in rounded_q4.items() if score == min_rmse]
best_r = min(best_rs)  # smallest r among the tied best scores
best_r

r=0: RMSE=0.52
r=0.01: RMSE=0.52
r=0.1: RMSE=0.52
r=1: RMSE=0.52
r=5: RMSE=0.52
r=10: RMSE=0.52
r=100: RMSE=0.52


0

## Q5. Effect of random seed
For seeds 0..9, split, fill NAs with 0, train (no regularization), evaluate on validation, compute std of RMSEs.

In [9]:
rmse_scores = []
for seed in range(10):
    # Only the train and validation parts are needed here; the test part is unused.
    dtr, dva, dte = split_train_val_test(df, seed=seed)
    Xtr = prepare_X(dtr, FEATURES, impute_strategy='zero')
    ytr = dtr[TARGET].values
    Xva = prepare_X(dva, FEATURES, impute_strategy='zero')
    yva = dva[TARGET].values
    w0, w = train_linear_regression(Xtr, ytr, r=0.0)
    y_pred = predict(w0, w, Xva)
    # Store plain Python floats so the printed list shows 0.52 rather than
    # np.float64(0.52) under NumPy 2.
    rmse_scores.append(float(rmse(yva, y_pred)))

std_q5 = float(np.std(rmse_scores))
rounded_std_q5 = round(std_q5, 3)
print('RMSE scores:', [round(s, 2) for s in rmse_scores])
print('Std of RMSEs:', rounded_std_q5)
nearest_q5 = nearest_option(rounded_std_q5, [0.001, 0.006, 0.060, 0.600])
nearest_q5

RMSE scores: [np.float64(0.52), np.float64(0.52), np.float64(0.53), np.float64(0.52), np.float64(0.53), np.float64(0.53), np.float64(0.52), np.float64(0.51), np.float64(0.52), np.float64(0.53)]
Std of RMSEs: 0.005


0.006

## Q6. Train on train+val (seed=9) with r=0.001
Combine train and validation, fill NAs with 0, train with `r=0.001`, evaluate on test.

In [10]:
dtr, dva, dte = split_train_val_test(df, seed=9)

# Train on train + validation combined; ignore_index gives a fresh 0..n-1 index.
df_full = pd.concat([dtr, dva], ignore_index=True)
X_full = prepare_X(df_full, FEATURES, impute_strategy='zero')
y_full = df_full[TARGET].values
w0, w = train_linear_regression(X_full, y_full, r=0.001)

# Evaluate once on the held-out test part.
X_te = prepare_X(dte, FEATURES, impute_strategy='zero')
y_te = dte[TARGET].values
y_pred_te = predict(w0, w, X_te)

# Convert to a Python float so the displayed tuple reads (0.505, 0.515)
# instead of (np.float64(0.505), 0.515) under NumPy 2.
rmse_q6 = float(rmse(y_te, y_pred_te))
rounded_rmse_q6 = round(rmse_q6, 3)
rounded_rmse_q6, nearest_option(rounded_rmse_q6, [0.15, 0.515, 5.15, 51.5])

(np.float64(0.505), 0.515)

## Summary of Answers
Run the following cell to collect the answer to each question (the nearest multiple-choice option where applicable) from the values computed in the cells above.

In [11]:
# Collect the final answer for every question in one mapping (Q1..Q6, in order).
answers = {
    'Q1': missing_feature,
    'Q2': nearest_option(int(round(hp_median)), [49, 99, 149, 199]),
    'Q3': better_q3,
    'Q4': best_r,
    'Q5': nearest_q5,
    'Q6': nearest_option(rounded_rmse_q6, [0.15, 0.515, 5.15, 51.5]),
}
answers


{'Q1': 'horsepower',
 'Q2': 149,
 'Q3': 'With mean',
 'Q4': 0,
 'Q5': 0.006,
 'Q6': 0.515}