In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

--2025-11-03 15:25:42--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv’


2025-11-03 15:25:42 (18.4 MB/s) - ‘car_fuel_efficiency.csv’ saved [874188/874188]



In [2]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

In [3]:
fuel_efficiency_raw_df = pd.read_csv("car_fuel_efficiency.csv")

fuel_efficiency_raw_df.describe()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,num_doors,fuel_efficiency_mpg
count,9704.0,9222.0,8996.0,9704.0,8774.0,9704.0,9202.0,9704.0
mean,199.708368,3.962481,149.657292,3001.280993,15.021928,2011.484027,-0.006412,14.985243
std,49.455319,1.999323,29.879555,497.89486,2.510339,6.659808,1.048162,2.556468
min,10.0,0.0,37.0,952.681761,6.0,2000.0,-4.0,6.200971
25%,170.0,3.0,130.0,2666.248985,13.3,2006.0,-1.0,13.267459
50%,200.0,4.0,149.0,2993.226296,15.0,2012.0,0.0,15.006037
75%,230.0,5.0,170.0,3334.957039,16.7,2017.0,1.0,16.707965
max,380.0,13.0,271.0,4739.077089,24.3,2023.0,4.0,25.967222


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

# Data preparation
fuel_efficiency_filled_df = fuel_efficiency_raw_df.fillna(0)

# dataframes splitting
full_train_df, test_df = train_test_split(fuel_efficiency_filled_df, test_size=0.2, random_state=1)
train_df, validation_df = train_test_split(full_train_df, test_size=0.25, random_state=1)

train_df.reset_index(drop=True, inplace=True)
validation_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

# Get targets
y_train = train_df['fuel_efficiency_mpg']
y_validation = validation_df['fuel_efficiency_mpg']
y_test = test_df['fuel_efficiency_mpg']

del train_df['fuel_efficiency_mpg']
del validation_df['fuel_efficiency_mpg']
del test_df['fuel_efficiency_mpg']

# Vectorize data
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_df.to_dict(orient="records"))
X_validation = dv.transform(validation_df.to_dict(orient="records"))
X_test = dv.transform(test_df.to_dict(orient="records"))


**Question 1:**

In [5]:
from sklearn.tree import export_text

# train model
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

print(export_text(dt, feature_names=list(dv.get_feature_names_out())))
# dv.feature_names_, list(dv.get_feature_names_out())

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



**Answer**: **vehicle_weight**

**Question 2:**

In [6]:
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)

rf.fit(X_train, y_train)
predictions = rf.predict(X_validation)

root_mean_squared_error(y_validation, predictions)

0.4595777223092726

**Answer: 0.45**

**Question 3:**

In [None]:
statistics = []

for estimators_number in range(10, 201, 10):
  rf = RandomForestRegressor(n_estimators=estimators_number, random_state=1, n_jobs=-1)

  rf.fit(X_train, y_train)
  predictions = rf.predict(X_validation)

  statistics.append(
      {
          "estimators": estimators_number,
          "rmse": root_mean_squared_error(y_validation, predictions)
      }
  )


In [None]:
pd.DataFrame(statistics).round(3)

Unnamed: 0,estimators,rmse
0,10,0.46
1,20,0.454
2,30,0.452
3,40,0.449
4,50,0.447
5,60,0.445
6,70,0.445
7,80,0.445
8,90,0.445
9,100,0.445


**Answer: 200**

**Question 4:**

In [None]:
statistics = []

for max_depth in [10, 15, 20, 25]:
  rmses = []
  for estimators_number in range(10, 201, 10):
    rf = RandomForestRegressor(max_depth=max_depth, n_estimators=estimators_number, random_state=1, n_jobs=-1)

    rf.fit(X_train, y_train)
    predictions = rf.predict(X_validation)

    rmses.append(root_mean_squared_error(y_validation, predictions))

  statistics.append(
      {
          "max_depth": max_depth,
          "Mean RMSE": np.mean(rmses)
      }
  )


In [None]:
pd.DataFrame(statistics)

Unnamed: 0,max_depth,Mean RMSE
0,10,0.441808
1,15,0.445417
2,20,0.446253
3,25,0.44591


**Answer: 10**

**Question 5:**

In [12]:
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)

rf.fit(X_train, y_train)
pd.DataFrame(zip(rf.feature_importances_.round(3), dv.feature_names_))

Unnamed: 0,0,1
0,0.011,acceleration
1,0.0,drivetrain=All-wheel drive
2,0.0,drivetrain=Front-wheel drive
3,0.003,engine_displacement
4,0.0,fuel_type=Diesel
5,0.0,fuel_type=Gasoline
6,0.016,horsepower
7,0.003,model_year
8,0.002,num_cylinders
9,0.002,num_doors


**Answer: vehicle_weight**

**Question 6:**

In [14]:
import xgboost as xgb

features = list(dv.get_feature_names_out())
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_validation, label=y_validation, feature_names=features)

In [15]:
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,

    'seed': 1,
    'verbosity': 1,
}

watchlist = [(dtrain, 'train'), (dval, 'val')]
model = xgb.train(xgb_params,
                  dtrain,
                  num_boost_round=100,
                  evals=watchlist)


[0]	train-rmse:1.81393	val-rmse:1.85444
[1]	train-rmse:1.31919	val-rmse:1.35353
[2]	train-rmse:0.98120	val-rmse:1.01316
[3]	train-rmse:0.75443	val-rmse:0.78667
[4]	train-rmse:0.60680	val-rmse:0.64318
[5]	train-rmse:0.51381	val-rmse:0.55664
[6]	train-rmse:0.45470	val-rmse:0.50321
[7]	train-rmse:0.41881	val-rmse:0.47254
[8]	train-rmse:0.39534	val-rmse:0.45509
[9]	train-rmse:0.38038	val-rmse:0.44564
[10]	train-rmse:0.37115	val-rmse:0.43896
[11]	train-rmse:0.36361	val-rmse:0.43594
[12]	train-rmse:0.35850	val-rmse:0.43558
[13]	train-rmse:0.35365	val-rmse:0.43394
[14]	train-rmse:0.35025	val-rmse:0.43349
[15]	train-rmse:0.34666	val-rmse:0.43362
[16]	train-rmse:0.34459	val-rmse:0.43378
[17]	train-rmse:0.34128	val-rmse:0.43405
[18]	train-rmse:0.33822	val-rmse:0.43391
[19]	train-rmse:0.33709	val-rmse:0.43374
[20]	train-rmse:0.33553	val-rmse:0.43376
[21]	train-rmse:0.33243	val-rmse:0.43453
[22]	train-rmse:0.33031	val-rmse:0.43510
[23]	train-rmse:0.32815	val-rmse:0.43601
[24]	train-rmse:0.32670	va

In [16]:
xgb_params = {
    'eta': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,

    'seed': 1,
    'verbosity': 1,
}

watchlist = [(dtrain, 'train'), (dval, 'val')]
model = xgb.train(xgb_params,
                  dtrain,
                  num_boost_round=100,
                  evals=watchlist)

[0]	train-rmse:2.28944	val-rmse:2.34561
[1]	train-rmse:2.07396	val-rmse:2.12434
[2]	train-rmse:1.88066	val-rmse:1.92597
[3]	train-rmse:1.70730	val-rmse:1.74987
[4]	train-rmse:1.55163	val-rmse:1.59059
[5]	train-rmse:1.41247	val-rmse:1.44988
[6]	train-rmse:1.28796	val-rmse:1.32329
[7]	train-rmse:1.17660	val-rmse:1.20930
[8]	train-rmse:1.07736	val-rmse:1.10830
[9]	train-rmse:0.98883	val-rmse:1.02009
[10]	train-rmse:0.91008	val-rmse:0.94062
[11]	train-rmse:0.84030	val-rmse:0.87100
[12]	train-rmse:0.77874	val-rmse:0.80916
[13]	train-rmse:0.72417	val-rmse:0.75465
[14]	train-rmse:0.67626	val-rmse:0.70780
[15]	train-rmse:0.63402	val-rmse:0.66672
[16]	train-rmse:0.59690	val-rmse:0.63062
[17]	train-rmse:0.56447	val-rmse:0.60016
[18]	train-rmse:0.53619	val-rmse:0.57383
[19]	train-rmse:0.51138	val-rmse:0.55044
[20]	train-rmse:0.48983	val-rmse:0.53064
[21]	train-rmse:0.47135	val-rmse:0.51451
[22]	train-rmse:0.45501	val-rmse:0.49998
[23]	train-rmse:0.44120	val-rmse:0.48790
[24]	train-rmse:0.42929	va

**0.1 - 0.424**, 0.3 - 0.44