In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

In [2]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
!wget $data

--2025-11-02 20:16:14--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv.2’


2025-11-02 20:16:14 (60.7 MB/s) - ‘car_fuel_efficiency.csv.2’ saved [874188/874188]



In [3]:
# Reading data
df = pd.read_csv('car_fuel_efficiency.csv')

In [4]:
# Checking head to validate that data was read
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [5]:
# DataSet Size
df.shape

(9704, 11)

In [6]:
# Checking types
df.dtypes

engine_displacement      int64
num_cylinders          float64
horsepower             float64
vehicle_weight         float64
acceleration           float64
model_year               int64
origin                  object
fuel_type               object
drivetrain              object
num_doors              float64
fuel_efficiency_mpg    float64
dtype: object

In [7]:
# DataSet Preparation
# Check if any null value present
df.isnull().any()

engine_displacement    False
num_cylinders           True
horsepower              True
vehicle_weight         False
acceleration            True
model_year             False
origin                 False
fuel_type              False
drivetrain             False
num_doors               True
fuel_efficiency_mpg    False
dtype: bool

In [8]:
# Replace Nan with 0
df = df.fillna(0)

In [9]:
# Re-check unique value per column
df.isnull().any()

engine_displacement    False
num_cylinders          False
horsepower             False
vehicle_weight         False
acceleration           False
model_year             False
origin                 False
fuel_type              False
drivetrain             False
num_doors              False
fuel_efficiency_mpg    False
dtype: bool

In [10]:
#Re-check head
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,0.0,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,0.0,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [11]:
# Do train/validation/test split with 60%/20%/20% distribution.
# Use the train_test_split function and set the random_state parameter to 1.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

len(df_full_train), len(df_train), len(df_val), len(df_test)

(7763, 5822, 1941, 1941)

In [12]:
# Reset Index
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Prepare y
y_train = df_train['fuel_efficiency_mpg'].values
y_val = df_val['fuel_efficiency_mpg'].values
y_test = df_test['fuel_efficiency_mpg'].values

# Delete column converted
# train_features = df_train.drop(features, axis=1)
#val_features = df_val.drop(features, axis=1)
#test_features = df_test.drop(features, axis=1)
del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [13]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text

In [14]:
train_dict = df_train.to_dict(orient='records')
val_dict = df_val.to_dict(orient='records')
test_dict = df_test.to_dict(orient='records')

In [15]:
# DictVectorizer(sparse=True) to turn the dataframes into matrices
dv = DictVectorizer(sparse=True)

In [16]:
# Let's train a decision tree regressor to predict the fuel_efficiency_mpg variable.
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)
X_test = dv.transform(test_dict)

model = DecisionTreeRegressor(max_depth=1, random_state=1)
model.fit(X_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,1
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [17]:
feature_names = dv.get_feature_names_out()
feature_names.tolist()

['acceleration',
 'drivetrain=All-wheel drive',
 'drivetrain=Front-wheel drive',
 'engine_displacement',
 'fuel_type=Diesel',
 'fuel_type=Gasoline',
 'horsepower',
 'model_year',
 'num_cylinders',
 'num_doors',
 'origin=Asia',
 'origin=Europe',
 'origin=USA',
 'vehicle_weight']

In [18]:
importances = model.feature_importances_
importances

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])

In [19]:
# Question 1 - Feature Importance
# Get the feature used for splitting

importances = model.feature_importances_
for i, importance in enumerate(importances):
    print(f"Feature {feature_names[i]}: {importance: .4f}")

Feature acceleration:  0.0000
Feature drivetrain=All-wheel drive:  0.0000
Feature drivetrain=Front-wheel drive:  0.0000
Feature engine_displacement:  0.0000
Feature fuel_type=Diesel:  0.0000
Feature fuel_type=Gasoline:  0.0000
Feature horsepower:  0.0000
Feature model_year:  0.0000
Feature num_cylinders:  0.0000
Feature num_doors:  0.0000
Feature origin=Asia:  0.0000
Feature origin=Europe:  0.0000
Feature origin=USA:  0.0000
Feature vehicle_weight:  1.0000


In [21]:
# Question 2
# Train a random forest regressor with these parameters: n_estimators=10, random_state=1, n_jobs=-1 (optional - to make training faster)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

rf_model = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
rmse

np.float64(0.4595777223092726)

In [23]:
# Question 3
#Now let's experiment with the n_estimators parameter
#Try different values of this parameter from 10 to 200 with step 10.
# Set random_state to 1.
# Evaluate the model on the validation dataset.

rmse_values = []

for n in range(10, 201, 10):
    rf_model = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf_model.fit(X_train, y_train)

    y_pred = rf_model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.round(np.sqrt(mse),3)
    rmse_values.append(rmse)
    print(f'n:{n},RMSE={rmse}')

n:10,RMSE=0.46
n:20,RMSE=0.454
n:30,RMSE=0.452
n:40,RMSE=0.449
n:50,RMSE=0.447
n:60,RMSE=0.445
n:70,RMSE=0.445
n:80,RMSE=0.445
n:90,RMSE=0.445
n:100,RMSE=0.445
n:110,RMSE=0.444
n:120,RMSE=0.444
n:130,RMSE=0.444
n:140,RMSE=0.443
n:150,RMSE=0.443
n:160,RMSE=0.443
n:170,RMSE=0.443
n:180,RMSE=0.442
n:190,RMSE=0.442
n:200,RMSE=0.442


In [29]:
rmse_values

# best rmse
best_rmse = min(rmse_values)
best_rmse

np.float64(0.442)

In [34]:
# Question 4
scores_max_depth = []

for d in [10, 15, 20, 25]:
    rmse_depth_value = []
    for n in range(10, 200, 10):
        rf_model = RandomForestRegressor(max_depth=d, n_estimators=n, random_state=1, n_jobs=-1)
        rf_model.fit(X_train, y_train)

        y_pred = rf_model.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        rmse_depth_value.append(np.sqrt(mse))   
    scores_max_depth.append((d, np.mean(rmse_depth_value)))

scores_max_depth

[(10, np.float64(0.44191116381008944)),
 (15, np.float64(0.4455782792875011)),
 (20, np.float64(0.44642913145719837)),
 (25, np.float64(0.44609049024610387))]

In [35]:
# calculate 
max_depth = []
max_depth.append(0.44191116381008944);
max_depth.append(0.4455782792875011);
max_depth.append(0.44642913145719837);
max_depth.append(0.44609049024610387);
max_depth

[0.44191116381008944,
 0.4455782792875011,
 0.44642913145719837,
 0.44609049024610387]

In [36]:
min(max_depth)

0.44191116381008944

In [39]:
# Question 5
rf_model = RandomForestRegressor(max_depth=20,n_estimators=10, random_state=1, n_jobs=-1)
rf_model.fit(X_train, y_train)
feature_importance = rf_model.feature_importances_
feature_importance

array([1.14797006e-02, 3.57108549e-04, 3.45384113e-04, 3.27279191e-03,
       3.25424323e-04, 3.60383601e-04, 1.59978977e-02, 3.21230009e-03,
       2.34334695e-03, 1.63498954e-03, 4.62246496e-04, 5.18739639e-04,
       5.39721689e-04, 9.59149965e-01])

In [40]:
for i, importance in enumerate(feature_importance):
    print(f"Feature {feature_names[i]}: {importance: .4f}")

Feature acceleration:  0.0115
Feature drivetrain=All-wheel drive:  0.0004
Feature drivetrain=Front-wheel drive:  0.0003
Feature engine_displacement:  0.0033
Feature fuel_type=Diesel:  0.0003
Feature fuel_type=Gasoline:  0.0004
Feature horsepower:  0.0160
Feature model_year:  0.0032
Feature num_cylinders:  0.0023
Feature num_doors:  0.0016
Feature origin=Asia:  0.0005
Feature origin=Europe:  0.0005
Feature origin=USA:  0.0005
Feature vehicle_weight:  0.9591


In [41]:
# Question 6
!pip install xgboost


Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.28.7-py3-none-manylinux_2_18_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.1.1-py3-none-manylinux_2_28_x86_64.whl (115.9 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.9/115.9 MB[0m [31m31.9 MB/s[0m  [33m0:00:03[0m[0m eta [36m0:00:01[0m[36m0:00:01[0m
[?25hDownloading nvidia_nccl_cu12-2.28.7-py3-none-manylinux_2_18_x86_64.whl (296.8 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.8/296.8 MB[0m [31m37.7 MB/s[0m  [33m0:00:07[0m[0m eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [xgboost]━━━[0m [32m1/2[0m [xgboost]
[1A[2KSuccessfully installed nvidia-nccl-cu12-2.28.7 xgboost-3.1.1


In [42]:
import xgboost as xgb

In [43]:
features = list(dv.get_feature_names_out())
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

In [45]:
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

watchlist = [(dval, 'eval'), (dtrain, 'train')]
num_round = 100

In [49]:
model = xgb.train(xgb_params, dtrain, num_round, evals=watchlist, early_stopping_rounds=10)
        
y_val_pred = model.predict(dval)
rmse_eta = np.sqrt(mean_squared_error(y_val, y_val_pred))
rmse_eta

[0]	eval-rmse:1.85444	train-rmse:1.81393
[1]	eval-rmse:1.35353	train-rmse:1.31919
[2]	eval-rmse:1.01316	train-rmse:0.98120
[3]	eval-rmse:0.78667	train-rmse:0.75443
[4]	eval-rmse:0.64318	train-rmse:0.60680
[5]	eval-rmse:0.55664	train-rmse:0.51381
[6]	eval-rmse:0.50321	train-rmse:0.45470
[7]	eval-rmse:0.47254	train-rmse:0.41881
[8]	eval-rmse:0.45509	train-rmse:0.39534
[9]	eval-rmse:0.44564	train-rmse:0.38038
[10]	eval-rmse:0.43896	train-rmse:0.37115
[11]	eval-rmse:0.43594	train-rmse:0.36361
[12]	eval-rmse:0.43558	train-rmse:0.35850
[13]	eval-rmse:0.43394	train-rmse:0.35365
[14]	eval-rmse:0.43349	train-rmse:0.35025
[15]	eval-rmse:0.43362	train-rmse:0.34666
[16]	eval-rmse:0.43378	train-rmse:0.34459
[17]	eval-rmse:0.43405	train-rmse:0.34128
[18]	eval-rmse:0.43391	train-rmse:0.33822
[19]	eval-rmse:0.43374	train-rmse:0.33709
[20]	eval-rmse:0.43376	train-rmse:0.33553
[21]	eval-rmse:0.43453	train-rmse:0.33243
[22]	eval-rmse:0.43510	train-rmse:0.33031
[23]	eval-rmse:0.43601	train-rmse:0.32815
[2

np.float64(0.45017755678087246)

In [50]:
xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

In [51]:
model = xgb.train(xgb_params, dtrain, num_round, evals=watchlist, early_stopping_rounds=10)
        
y_val_pred = model.predict(dval)
rmse_eta = np.sqrt(mean_squared_error(y_val, y_val_pred))
rmse_eta

[0]	eval-rmse:2.34561	train-rmse:2.28944
[1]	eval-rmse:2.12434	train-rmse:2.07396
[2]	eval-rmse:1.92597	train-rmse:1.88066
[3]	eval-rmse:1.74987	train-rmse:1.70730
[4]	eval-rmse:1.59059	train-rmse:1.55163
[5]	eval-rmse:1.44988	train-rmse:1.41247
[6]	eval-rmse:1.32329	train-rmse:1.28796
[7]	eval-rmse:1.20930	train-rmse:1.17660
[8]	eval-rmse:1.10830	train-rmse:1.07736
[9]	eval-rmse:1.02009	train-rmse:0.98883
[10]	eval-rmse:0.94062	train-rmse:0.91008
[11]	eval-rmse:0.87100	train-rmse:0.84030
[12]	eval-rmse:0.80916	train-rmse:0.77874
[13]	eval-rmse:0.75465	train-rmse:0.72417
[14]	eval-rmse:0.70780	train-rmse:0.67626
[15]	eval-rmse:0.66672	train-rmse:0.63402
[16]	eval-rmse:0.63062	train-rmse:0.59690
[17]	eval-rmse:0.60016	train-rmse:0.56447
[18]	eval-rmse:0.57383	train-rmse:0.53619
[19]	eval-rmse:0.55044	train-rmse:0.51138
[20]	eval-rmse:0.53064	train-rmse:0.48983
[21]	eval-rmse:0.51451	train-rmse:0.47135
[22]	eval-rmse:0.49998	train-rmse:0.45501
[23]	eval-rmse:0.48790	train-rmse:0.44120
[2

np.float64(0.42622800553359225)