### 6-Trees

In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

--2025-10-28 22:26:24--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv’


2025-10-28 22:26:24 (63.2 MB/s) - ‘car_fuel_efficiency.csv’ saved [874188/874188]



### Dataset preparation

In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

In [7]:
# Load the downloaded CSV into a DataFrame.
df = pd.read_csv('car_fuel_efficiency.csv')

In [8]:
# Replace missing values with 0.0 in the numeric columns listed below.
zero_fill_columns = ['num_cylinders', 'horsepower', 'acceleration', 'num_doors']
df = df.fillna(dict.fromkeys(zero_fill_columns, 0.0))

In [9]:
# Separate the target from the features.
target_column = 'fuel_efficiency_mpg'
y = df[target_column].to_numpy()
X = df.drop(columns=[target_column])

# Two-stage split: hold out 20% for test, then 25% of the remaining 80% for validation,
# which yields 60% train / 20% validation / 20% test.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=1
)

### Feature Splitting

In [32]:
# One-hot encode the object-dtype columns, pass every other column through unchanged,
# then fit a decision tree limited to a single split (max_depth=1).
dec_tree_pipeline = Pipeline(steps=[
    (
        'onehot_encoder',
        ColumnTransformer(
            transformers=[
                ('categories', OneHotEncoder(dtype='int'), make_column_selector(dtype_include=object)),
            ],
            remainder='passthrough',
        ),
    ),
    ('decision_tree_reg', DecisionTreeRegressor(max_depth=1)),
])

In [33]:
# Fit the encoder and the decision tree on the training split.
dec_tree_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('onehot_encoder', ...), ('decision_tree_reg', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('categories', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,'int'
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [34]:
# Pull both fitted steps out of the pipeline once.
dec_tree_steps = dec_tree_pipeline.named_steps

# NOTE: despite the "_idx" suffix, this holds the importance score of every encoded feature.
feature_importance_idx = dec_tree_steps['decision_tree_reg'].feature_importances_

# Column names after encoding, to be indexed with positions from the importance array.
feature_names = dec_tree_steps['onehot_encoder'].get_feature_names_out()

In [35]:
# Name of the encoded feature with the largest importance score.
feature_names[feature_importance_idx.argmax()]

'remainder__vehicle_weight'

### Random Forest Regressor

In [37]:
# Same preprocessing as the decision-tree pipeline, followed by a 10-tree random forest
# with a fixed seed, trained on all available cores (n_jobs=-1).
rand_forest_pipeline = Pipeline(steps=[
    (
        'onehot_encoder',
        ColumnTransformer(
            transformers=[
                ('categories', OneHotEncoder(dtype='int'), make_column_selector(dtype_include=object)),
            ],
            remainder='passthrough',
        ),
    ),
    (
        'rand_forest_reg',
        RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1),
    ),
])

In [38]:
# Fit the encoder and the random forest on the training split.
rand_forest_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('onehot_encoder', ...), ('rand_forest_reg', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('categories', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,'int'
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [40]:
# Predict on the validation split with the fitted random-forest pipeline.
y_pred = rand_forest_pipeline.predict(X_val)

In [42]:
# Validation RMSE of the predictions above.
root_mean_squared_error(y_val, y_pred)

0.4592334896096372

### Tuning n_estimators for the Random Forest

In [46]:
# Validation RMSE for each forest size, keyed by n_estimators (10, 20, ..., 200).
rmse_dict = {}

for n_est_param in range(10, 201, 10):
    # A fresh pipeline per setting, so nothing fitted carries over between forest sizes.
    rand_forest_pipeline = Pipeline(steps=[
        (
            'onehot_encoder',
            ColumnTransformer(
                transformers=[
                    ('categories', OneHotEncoder(dtype='int'), make_column_selector(dtype_include=object)),
                ],
                remainder='passthrough',
            ),
        ),
        (
            'rand_forest_reg',
            RandomForestRegressor(n_estimators=n_est_param, random_state=1, n_jobs=-1),
        ),
    ])

    rand_forest_pipeline.fit(X_train, y_train)
    y_pred = rand_forest_pipeline.predict(X_val)
    rmse_dict[n_est_param] = root_mean_squared_error(y_val, y_pred)

     

In [48]:
# Rank by RMSE rounded to 3 decimals. sorted() is stable, so settings that tie after
# rounding keep their insertion order (ascending n_estimators).
sorted(rmse_dict.items(), key=lambda entry: round(entry[1], 3))

[(170, 0.443494148365732),
 (180, 0.443040122347796),
 (190, 0.4431584838787331),
 (200, 0.4430598044340546),
 (110, 0.44427781391823684),
 (130, 0.4441881530626989),
 (140, 0.44397827647714116),
 (150, 0.4436776130438252),
 (160, 0.4435031315719394),
 (70, 0.44508052986143365),
 (100, 0.4452827522610761),
 (120, 0.44453393129328417),
 (50, 0.4464773514450392),
 (60, 0.44571174346973735),
 (80, 0.44553583373405475),
 (90, 0.4455386345710405),
 (40, 0.44834305749472547),
 (30, 0.45155643651425387),
 (20, 0.4531541185179781),
 (10, 0.4592334896096372)]

### Optimal Max Depth

In [50]:
# Mean validation RMSE per max_depth, averaged over n_estimators = 10, 20, ..., 200.
rmse_max_depth_dict = {}

for max_depth_param in [10, 15, 20, 25]:
    # RMSE of every forest size tried at this depth.
    rmse_n_est_lst = []

    for n_est_param in range(10, 201, 10):
        rand_forest_pipeline = Pipeline(steps=[
            (
                'onehot_encoder',
                ColumnTransformer(
                    transformers=[
                        ('categories', OneHotEncoder(dtype='int'), make_column_selector(dtype_include=object)),
                    ],
                    remainder='passthrough',
                ),
            ),
            (
                'rand_forest_reg',
                RandomForestRegressor(
                    n_estimators=n_est_param,
                    random_state=1,
                    n_jobs=-1,
                    max_depth=max_depth_param,
                ),
            ),
        ])

        rand_forest_pipeline.fit(X_train, y_train)
        y_pred = rand_forest_pipeline.predict(X_val)
        rmse_n_est_lst.append(root_mean_squared_error(y_val, y_pred))

    rmse_max_depth_dict[max_depth_param] = np.mean(rmse_n_est_lst)
    

In [52]:
# Rank depths by mean RMSE rounded to 3 decimals; ties after rounding keep insertion order.
sorted(rmse_max_depth_dict.items(), key=lambda entry: round(entry[1], 3))

[(10, np.float64(0.4418292386286435)),
 (15, np.float64(0.44554135740761397)),
 (20, np.float64(0.44554501102614674)),
 (25, np.float64(0.446119702543161))]

#### Feature Importance

In [53]:
# Random forest used for the feature-importance lookup: 10 trees, max_depth=20, fixed seed.
rand_forest_pipeline = Pipeline(steps=[
    (
        'onehot_encoder',
        ColumnTransformer(
            transformers=[
                ('categories', OneHotEncoder(dtype='int'), make_column_selector(dtype_include=object)),
            ],
            remainder='passthrough',
        ),
    ),
    (
        'rand_forest_reg',
        RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1, max_depth=20),
    ),
])

In [54]:
# Fit the encoder and the random forest on the training split.
rand_forest_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('onehot_encoder', ...), ('rand_forest_reg', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('categories', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,'int'
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [57]:
# Pull both fitted steps out of the pipeline once.
rand_forest_steps = rand_forest_pipeline.named_steps

# NOTE: despite the "_idx" suffix, this holds the importance score of every encoded feature.
feature_importance_idx = rand_forest_steps['rand_forest_reg'].feature_importances_

# Column names after encoding, to be indexed with positions from the importance array.
feature_names = rand_forest_steps['onehot_encoder'].get_feature_names_out()

In [58]:
# Name of the encoded feature with the largest importance score.
feature_names[feature_importance_idx.argmax()]

'remainder__vehicle_weight'

### XGBoost

In [4]:
import xgboost as xgb

In [16]:
# Preprocessing-only pipeline: one-hot encode the object-dtype columns and pass the
# remaining columns through unchanged.
one_hot_encoder = Pipeline(steps=[
    (
        'onehot_encoder',
        ColumnTransformer(
            transformers=[
                ('categories', OneHotEncoder(dtype='int'), make_column_selector(dtype_include=object)),
            ],
            remainder='passthrough',
        ),
    ),
])

In [23]:
# Learn the encoding from the training split only, then apply that same fitted encoder
# to the validation split. Refitting on X_val would rebuild the category columns from
# validation data, so the two matrices (and the encoder's feature names) could disagree.
X_train_hot_encoded = one_hot_encoder.fit_transform(X_train)
X_val_hot_encoded = one_hot_encoder.transform(X_val)

In [27]:
# Sanity check: .tolist() turns the encoder's feature-name array into a plain Python list.
type(one_hot_encoder.named_steps['onehot_encoder'].get_feature_names_out().tolist())

list

In [29]:
# Look up the encoded column names once and attach them to both matrices.
encoded_feature_names = one_hot_encoder.named_steps['onehot_encoder'].get_feature_names_out().tolist()

dtrain = xgb.DMatrix(X_train_hot_encoded, label=y_train, feature_names=encoded_feature_names)
dval = xgb.DMatrix(X_val_hot_encoded, label=y_val, feature_names=encoded_feature_names)

In [31]:
# Datasets to evaluate during training, each paired with the name shown in the training log.
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [32]:
# Booster settings for the first run: learning rate (eta) of 0.3.
xgb_params = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

# Train for 100 boosting rounds, logging train/val metrics every 5 rounds.
model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=100,
    evals=watchlist,
    verbose_eval=5,
)

[0]	train-rmse:1.81393	val-rmse:1.85444
[5]	train-rmse:0.51381	val-rmse:0.55664
[10]	train-rmse:0.37115	val-rmse:0.43892
[15]	train-rmse:0.34666	val-rmse:0.43384
[20]	train-rmse:0.33553	val-rmse:0.43339
[25]	train-rmse:0.32268	val-rmse:0.43603
[30]	train-rmse:0.31475	val-rmse:0.43663
[35]	train-rmse:0.30960	val-rmse:0.43711
[40]	train-rmse:0.30202	val-rmse:0.43880
[45]	train-rmse:0.29126	val-rmse:0.43955
[50]	train-rmse:0.28456	val-rmse:0.44058
[55]	train-rmse:0.27618	val-rmse:0.44129
[60]	train-rmse:0.26768	val-rmse:0.44185
[65]	train-rmse:0.26174	val-rmse:0.44241
[70]	train-rmse:0.25489	val-rmse:0.44425
[75]	train-rmse:0.24792	val-rmse:0.44519
[80]	train-rmse:0.24254	val-rmse:0.44590
[85]	train-rmse:0.23644	val-rmse:0.44659
[90]	train-rmse:0.23193	val-rmse:0.44745
[95]	train-rmse:0.22475	val-rmse:0.44806
[99]	train-rmse:0.21950	val-rmse:0.44924


In [33]:
# Validation predictions from the booster trained just above.
y_pred = model.predict(dval)

In [36]:
# Validation RMSE for the eta=0.3 run, kept for the comparison at the end.
rmse_0_3 = root_mean_squared_error(y_val, y_pred)

In [37]:
# Booster settings for the second run: learning rate (eta) of 0.1.
xgb_params = dict(
    eta=0.1,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

# Train for 100 boosting rounds, logging train/val metrics every 5 rounds.
model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=100,
    evals=watchlist,
    verbose_eval=5,
)

[0]	train-rmse:2.28944	val-rmse:2.34561
[5]	train-rmse:1.41247	val-rmse:1.44988
[10]	train-rmse:0.91008	val-rmse:0.94062
[15]	train-rmse:0.63402	val-rmse:0.66680
[20]	train-rmse:0.48983	val-rmse:0.53075
[25]	train-rmse:0.41881	val-rmse:0.46901
[30]	train-rmse:0.38342	val-rmse:0.44301
[35]	train-rmse:0.36435	val-rmse:0.43263
[40]	train-rmse:0.35343	val-rmse:0.42756
[45]	train-rmse:0.34621	val-rmse:0.42603
[50]	train-rmse:0.33998	val-rmse:0.42515
[55]	train-rmse:0.33480	val-rmse:0.42474
[60]	train-rmse:0.33054	val-rmse:0.42471
[65]	train-rmse:0.32602	val-rmse:0.42503
[70]	train-rmse:0.32202	val-rmse:0.42511
[75]	train-rmse:0.31895	val-rmse:0.42540
[80]	train-rmse:0.31667	val-rmse:0.42578
[85]	train-rmse:0.31440	val-rmse:0.42589
[90]	train-rmse:0.31059	val-rmse:0.42606
[95]	train-rmse:0.30625	val-rmse:0.42630
[99]	train-rmse:0.30419	val-rmse:0.42654


In [38]:
# Validation predictions from the booster trained just above.
y_pred = model.predict(dval)

In [39]:
# Validation RMSE for the eta=0.1 run, kept for the comparison at the end.
rmse_0_1 = root_mean_squared_error(y_val, y_pred)

In [40]:
# Compare the two stored validation RMSE values side by side (lower is better).
rmse_0_3, rmse_0_1

(0.4492355283431923, 0.4265359960863939)