In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import xgboost as xgb

In [3]:
# Raw-file URL of the car fuel-efficiency CSV (fetched by the download cell).
data = (
    "https://raw.githubusercontent.com/"
    "alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
)

In [4]:
!wget $data

--2025-10-30 22:43:58--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv’


2025-10-30 22:43:58 (63.0 MB/s) - ‘car_fuel_efficiency.csv’ saved [874188/874188]



In [4]:
# Load the downloaded CSV into a DataFrame and preview its first row.
df = pd.read_csv(filepath_or_buffer="car_fuel_efficiency.csv")
df.head(n=1)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729


In [5]:
# Count missing values per column (isnull is pandas' alias of isna).
df.isnull().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [6]:
# Impute every missing value with 0 before modelling.
df = df.fillna(value=0)

# Feature matrix: every column except the target; target: fuel efficiency.
X = df.drop(columns=['fuel_efficiency_mpg'])
y = df['fuel_efficiency_mpg']

In [7]:
# Two-stage split: hold out 40% of the rows first, then cut that hold-out
# in half to obtain the validation and test sets (60/20/20 overall).
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=1,
)

In [8]:
# One-hot encode the categorical columns via DictVectorizer. The vectorizer
# is fitted on the training rows only and then reused for validation/test.
# NOTE: X_train / X_val / X_test are rebound from DataFrames to the
# vectorizer's output here, so this cell expects the split cell to have run
# immediately before it.
dv = DictVectorizer(sparse=True)

train_dicts = X_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = X_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

test_dicts = X_test.to_dict(orient='records')
X_test = dv.transform(test_dicts)

In [9]:
# Report the dimensions of each encoded split, one per line.
print(
    f"Train shape: {X_train.shape}",
    f"Validation shape: {X_val.shape}",
    f"Test shape: {X_test.shape}",
    sep="\n",
)

Train shape: (5822, 14)
Validation shape: (1941, 14)
Test shape: (1941, 14)


In [10]:
# Depth-1 tree (a single split) to see which feature it picks for the split.
dt = DecisionTreeRegressor(
    random_state=1,
    max_depth=1,
)
dt.fit(X_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,1
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [11]:
# Column names produced by the vectorizer, paired positionally below with
# the fitted tree's per-feature importances.
feature_names = dv.get_feature_names_out()
importances = dt.feature_importances_


In [12]:
# Rank (feature, importance) pairs from most to least important. Sorting on
# the negated importance is stable, so tied features keep their original
# relative order.
sorted_features = sorted(
    zip(feature_names, importances),
    key=lambda pair: -pair[1],
)

sorted_features[:5]


[('vehicle_weight', np.float64(1.0)),
 ('acceleration', np.float64(0.0)),
 ('drivetrain=All-wheel drive', np.float64(0.0)),
 ('drivetrain=Front-wheel drive', np.float64(0.0)),
 ('engine_displacement', np.float64(0.0))]

In [13]:
# Baseline random forest: 10 trees, fixed seed, all CPU cores.
rf = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1,
)
rf.fit(X_train, y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
# Validation RMSE of the baseline forest: root of the mean squared error.
y_pred = rf.predict(X_val)
mse = mean_squared_error(y_val, y_pred)

rmse = np.sqrt(mse)
print(rmse)


0.4602815367032659


In [25]:
# Validation RMSE for forests of 10, 20, ..., 200 trees.
#
# warm_start=True makes each fit() keep the trees that are already trained
# and only add the missing ones, so the sweep trains 200 trees in total
# instead of 10 + 20 + ... + 200 = 2100. With a fixed integer random_state
# scikit-learn advances the RNG past the existing trees before creating new
# ones, so the added trees are seeded as in a from-scratch fit of the same size.
scores = []

rf = RandomForestRegressor(random_state=1, n_jobs=-1, warm_start=True)

for n in tqdm(range(10, 201, 10), desc="Training models"):
    rf.set_params(n_estimators=n)  # grow the existing forest to n trees
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    scores.append((n, rmse))

for n, rmse in scores:
    print(f"n_estimators={n:<3}  RMSE={rmse:.3f}")

Training models: 100%|██████████| 20/20 [04:11<00:00, 12.58s/it]

n_estimators=10   RMSE=0.460
n_estimators=20   RMSE=0.446
n_estimators=30   RMSE=0.440
n_estimators=40   RMSE=0.438
n_estimators=50   RMSE=0.437
n_estimators=60   RMSE=0.436
n_estimators=70   RMSE=0.436
n_estimators=80   RMSE=0.436
n_estimators=90   RMSE=0.435
n_estimators=100  RMSE=0.435
n_estimators=110  RMSE=0.435
n_estimators=120  RMSE=0.435
n_estimators=130  RMSE=0.435
n_estimators=140  RMSE=0.435
n_estimators=150  RMSE=0.435
n_estimators=160  RMSE=0.435
n_estimators=170  RMSE=0.435
n_estimators=180  RMSE=0.435
n_estimators=190  RMSE=0.435
n_estimators=200  RMSE=0.435





In [26]:
# For each candidate max_depth, average the validation RMSE over forests of
# 10, 20, ..., 200 trees and keep that mean in `results`.
#
# One warm-started forest is grown per depth: each fit() adds only the 10
# missing trees instead of retraining the whole ensemble, which cuts the
# work from 2100 to 200 trees per depth. With a fixed integer random_state
# scikit-learn advances the RNG past the existing trees before creating new
# ones, so the added trees are seeded as in a from-scratch fit of the same size.
max_depth_values = [10, 15, 20, 25]
results = {}

for depth in tqdm(max_depth_values, desc="Testing max_depth"):
    # max_depth cannot change on an already-grown forest, so start a new one.
    rf = RandomForestRegressor(
        max_depth=depth,
        random_state=1,
        n_jobs=-1,
        warm_start=True
    )
    rmses = []
    for n in range(10, 201, 10):
        rf.set_params(n_estimators=n)  # grow the existing forest to n trees
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmses.append(rmse)
    # Compute the mean once and reuse it for both the dict and the log line.
    results[depth] = np.mean(rmses)
    print(f"max_depth={depth}, mean RMSE={results[depth]:.3f}")

results

Testing max_depth:  25%|██▌       | 1/4 [01:50<05:31, 110.55s/it]

max_depth=10, mean RMSE=0.436


Testing max_depth:  50%|█████     | 2/4 [05:27<05:46, 173.42s/it]

max_depth=15, mean RMSE=0.438


Testing max_depth:  75%|███████▌  | 3/4 [09:36<03:27, 207.78s/it]

max_depth=20, mean RMSE=0.438


Testing max_depth: 100%|██████████| 4/4 [13:46<00:00, 206.63s/it]

max_depth=25, mean RMSE=0.438





{10: np.float64(0.43624733022811624),
 15: np.float64(0.4378245115127723),
 20: np.float64(0.43769343549884143),
 25: np.float64(0.43765343428485853)}

In [15]:
# Forest used for the feature-importance readout: 10 trees, depth capped at 20.
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)


0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [16]:
# Pair each vectorizer column with the forest's importance for it, rank the
# pairs from most to least important, and print the ten strongest.
feature_names = dv.get_feature_names_out()
importances = rf.feature_importances_

sorted_features = sorted(
    zip(feature_names, importances),
    key=lambda pair: -pair[1],  # stable: ties keep their original order
)

for name, score in sorted_features[:10]:
    print(f"{name}: {score:.4f}")

vehicle_weight: 0.9599
horsepower: 0.0159
acceleration: 0.0114
engine_displacement: 0.0032
model_year: 0.0031
num_cylinders: 0.0023
num_doors: 0.0016
origin=USA: 0.0005
origin=Asia: 0.0004
origin=Europe: 0.0004


In [20]:
# Wrap the encoded train/validation features and their targets in XGBoost's
# DMatrix containers.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_val, label=y_val)

# (matrix, name) pairs that can be passed to xgb.train as evaluation sets.
watchlist = [
    (dtrain, 'train'),
    (dval, 'val'),
]

In [21]:
def train_and_eval(eta, num_boost_round=100):
    """Train an XGBoost regressor with the given learning rate and score it.

    Reads the notebook-level ``dtrain``, ``dval`` and ``y_val``; the cells
    defining them must have been run first.

    Parameters
    ----------
    eta : float
        Learning rate passed to XGBoost as ``eta``.
    num_boost_round : int, default 100
        Number of boosting rounds to train.

    Returns
    -------
    float
        RMSE between ``y_val`` and the model's predictions on ``dval``.
    """
    params = {
        'eta': eta,
        'max_depth': 6,
        'min_child_weight': 1,
        'objective': 'reg:squarederror',
        'nthread': 8,
        'seed': 1,
        'verbosity': 0
    }

    # No evaluation sets are passed: per-round metrics were neither printed
    # nor stored nor used for early stopping, so computing them was wasted
    # work and does not influence the trained model.
    model = xgb.train(params, dtrain, num_boost_round=num_boost_round)
    y_pred = model.predict(dval)
    return np.sqrt(mean_squared_error(y_val, y_pred))

In [22]:
# Compare validation RMSE for two learning rates.
rmse_03 = train_and_eval(eta=0.3)
rmse_01 = train_and_eval(eta=0.1)

print(
    f"RMSE (eta=0.3): {rmse_03:.3f}",
    f"RMSE (eta=0.1): {rmse_01:.3f}",
    sep="\n",
)

RMSE (eta=0.3): 0.443
RMSE (eta=0.1): 0.417
