# Car Fuel Efficiency — Homework

This notebook reproduces the experiments from `hw_regression.py`: loading the dataset, EDA, splits, training linear models with different imputations and regularization, and reporting RMSEs.

In [1]:
import os
import urllib.request
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error



In [3]:
URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
CSV_PATH = "car_fuel_efficiency.csv"

# Download the dataset only if it is not already cached next to the notebook.
if not os.path.exists(CSV_PATH):
    print("Downloading dataset...")
    # Fetch into a temporary sibling file and move it into place only after the
    # transfer finished: an interrupted download must never leave a truncated
    # CSV_PATH behind, because later runs would treat it as a complete dataset.
    _tmp_path = CSV_PATH + ".part"
    try:
        urllib.request.urlretrieve(URL, _tmp_path)
        os.replace(_tmp_path, CSV_PATH)
    finally:
        # Clean up the partial file when the download or the rename failed.
        if os.path.exists(_tmp_path):
            os.remove(_tmp_path)
else:
    print("Dataset already exists.")

Dataset already exists.


In [4]:
def load_data():
    """Read the CSV at ``CSV_PATH`` and keep only the modelling columns.

    Returns
    -------
    pandas.DataFrame
        The four feature columns followed by the ``fuel_efficiency_mpg``
        target, in that order.
    """
    selected = [
        'engine_displacement',
        'horsepower',
        'vehicle_weight',
        'model_year',
        'fuel_efficiency_mpg',
    ]
    frame = pd.read_csv(CSV_PATH)
    return frame[selected]

# Load the dataset once; the following cells read this notebook-level `df`.
df = load_data()
# Preview the first rows (rendered as the cell output).
df.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.87099,2009,12.488369


In [5]:
# Count the missing (NaN) entries in every column.
missing = df.isnull().sum()
# The printed label is Turkish for "Missing value counts"; it is runtime output
# and is intentionally left as is.
print("Eksik değer sayıları:\n", missing)

Eksik değer sayıları:
 engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64


In [6]:
# Median of the horsepower column over the full (unsplit) dataset.
median_hp = df['horsepower'].median()
print("Median horsepower:", median_hp)

Median horsepower: 149.0


In [7]:
def shuffle_split(df, seed=42, train_frac=0.6, val_frac=0.2):
    """Shuffle ``df`` and cut it into train / validation / test frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split. It is not modified.
    seed : int, default 42
        ``random_state`` passed to ``DataFrame.sample`` so the shuffle is
        reproducible.
    train_frac : float, default 0.6
        Fraction of rows assigned to the training part.
    val_frac : float, default 0.2
        Fraction of rows assigned to the validation part. Whatever remains
        after train and validation becomes the test part.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)``, each with a fresh ``0..len-1`` index.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    # Small tolerance so that pairs such as 0.7 + 0.3 are not rejected because
    # of floating-point rounding.
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1.0 + 1e-9:
        raise ValueError(
            "train_frac and val_frac must be non-negative and sum to at most 1, "
            f"got train_frac={train_frac!r}, val_frac={val_frac!r}"
        )

    shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    n = len(shuffled)
    # Sizes are truncated towards zero; the rounding remainder goes to test.
    n_train = int(n * train_frac)
    n_val = int(n * val_frac)

    train = shuffled.iloc[:n_train].reset_index(drop=True)
    val = shuffled.iloc[n_train:n_train + n_val].reset_index(drop=True)
    test = shuffled.iloc[n_train + n_val:].reset_index(drop=True)
    return train, val, test

# Reference split (seed 42) used by the Q3 and Q4 cells.
train, val, test = shuffle_split(df, seed=42)

# Sizes of the three parts, shown as the cell output.
len(train), len(val), len(test)


(5822, 1940, 1942)

In [8]:
def prepare_X(df, fill_method, mean_val=None):
    """Build the feature matrix and target vector, imputing ``horsepower``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four feature columns and ``fuel_efficiency_mpg``.
        It is copied, so the caller's frame is left untouched.
    fill_method : {'zero', 'mean'}
        ``'zero'`` replaces missing horsepower with 0; ``'mean'`` replaces it
        with ``mean_val``.
    mean_val : float, optional
        Fill value for ``fill_method='mean'``. It is supplied by the caller so
        that a statistic computed on the training part can be reused on other
        parts.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)``: the feature matrix (columns: engine_displacement,
        horsepower, vehicle_weight, model_year) and the target values.

    Raises
    ------
    ValueError
        If ``fill_method`` is not recognised, or ``fill_method='mean'`` is used
        without ``mean_val``.
    """
    # Validate with explicit exceptions: an unknown method used to fall through
    # silently and return NaN features, and `assert` is stripped under -O.
    if fill_method == 'zero':
        fill_value = 0
    elif fill_method == 'mean':
        if mean_val is None:
            raise ValueError("mean_val is required when fill_method='mean'")
        fill_value = mean_val
    else:
        raise ValueError(
            f"Unknown fill_method {fill_method!r}; expected 'zero' or 'mean'"
        )

    df = df.copy()
    df['horsepower'] = df['horsepower'].fillna(fill_value)
    X = df[['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']].values
    y = df['fuel_efficiency_mpg'].values
    return X, y


In [9]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    float
        Square root of scikit-learn's ``mean_squared_error``.
    """
    # The `squared=False` keyword was deprecated in scikit-learn 1.4 and removed
    # in 1.6; taking the square root explicitly works on every version.
    return np.sqrt(mean_squared_error(y_true, y_pred))


In [10]:
def train_linear(X_train, y_train, r=0.0):
    """Fit a linear model and return the fitted estimator.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training targets.
    r : float, default 0.0
        Regularisation strength. ``r == 0`` selects plain ``LinearRegression``;
        any other value selects ``Ridge`` with ``alpha=r``.

    Returns
    -------
    The fitted scikit-learn estimator.
    """
    estimator = LinearRegression() if r == 0 else Ridge(alpha=r, solver='auto')
    estimator.fit(X_train, y_train)
    return estimator


In [11]:
# Q3: compare two ways of imputing missing horsepower, scored on validation.
# The mean is taken from the training part only and reused for validation.
mean_hp = train['horsepower'].mean()

# Variant 1: fill missing horsepower with 0, unregularised linear regression.
X_train0, y_train0 = prepare_X(train, 'zero')
X_val0, y_val0 = prepare_X(val, 'zero')
model0 = train_linear(X_train0, y_train0, r=0.0)
pred0 = model0.predict(X_val0)
rmse0 = round(rmse(y_val0, pred0), 2)

# Variant 2: fill missing horsepower with the training mean.
X_train_mean, y_train_mean = prepare_X(train, 'mean', mean_val=mean_hp)
X_val_mean, y_val_mean = prepare_X(val, 'mean', mean_val=mean_hp)
model_mean = train_linear(X_train_mean, y_train_mean, r=0.0)
pred_mean = model_mean.predict(X_val_mean)
rmse_mean = round(rmse(y_val_mean, pred_mean), 2)

# Both scores are rounded to 2 decimals before being reported.
print(f"Q3 RMSE with 0: {rmse0}, with mean: {rmse_mean}")


Q3 RMSE with 0: 0.52, with mean: 0.46




In [12]:
# Q4: sweep the regularisation strength r with zero-imputed horsepower and
# score each model on the validation part.
r_list = [0, 0.01, 0.1, 1, 5, 10, 100]
results = {}

# The zero-imputed matrices do not depend on r, so build them once instead of
# rebuilding them on every iteration of the sweep.
X_train_r, y_train_r = prepare_X(train, 'zero')
X_val_r, y_val_r = prepare_X(val, 'zero')

for r in r_list:
    model_r = train_linear(X_train_r, y_train_r, r=r)
    pred_r = model_r.predict(X_val_r)
    # Scores are rounded to 2 decimals before they are compared.
    results[r] = round(rmse(y_val_r, pred_r), 2)

print("Q4 RMSEs:")
for r, s in results.items():
    print(r, s)

# Lowest rounded RMSE wins; ties are broken in favour of the smallest r.
best_r = min(results.items(), key=lambda x: (x[1], x[0]))[0]
print("Q4 best r:", best_r)


Q4 RMSEs:
0 0.52
0.01 0.52
0.1 0.52
1 0.52
5 0.52
10 0.52
100 0.52
Q4 best r: 0




In [13]:
# Q5: measure how much the validation RMSE varies with the shuffle seed.
seeds = list(range(10))
scores = []

for s in seeds:
    # Fresh split per seed; zero imputation, no regularisation.
    tr, va, te = shuffle_split(df, seed=s)
    X_tr, y_tr = prepare_X(tr, 'zero')
    X_va, y_va = prepare_X(va, 'zero')
    m = train_linear(X_tr, y_tr, r=0.0)
    p = m.predict(X_va)
    # Keep the unrounded score; only the final std is rounded.
    scores.append(rmse(y_va, p))

# np.std defaults to the population standard deviation (ddof=0).
std = round(float(np.std(scores)), 3)
print("Q5 std:", std)


Q5 std: 0.007




In [14]:
# Q6: final evaluation on the held-out test part of the seed-9 split.
tr, va, te = shuffle_split(df, seed=9)
# Train on train + validation combined.
trainval = pd.concat([tr, va]).reset_index(drop=True)
X_tv, y_tv = prepare_X(trainval, 'zero')
X_test, y_test = prepare_X(te, 'zero')

# r=0.001 is non-zero, so train_linear fits a Ridge model here.
# NOTE(review): seed 9 and r=0.001 presumably come from the homework statement - confirm.
m = train_linear(X_tv, y_tv, r=0.001)
p_test = m.predict(X_test)
rmse_test = round(rmse(y_test, p_test), 3)

print("Q6 RMSE test:", rmse_test)


Q6 RMSE test: 0.515


