# HW-6

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb

### Load Dataset

In [2]:
df = pd.read_csv('car_fuel_efficiency.csv')
x = df['fuel_efficiency_mpg']
print(len(x.astype(float).unique()))

9704


### Preparing the dataset

In [3]:
"""
Fill missing values with zeros.
Do train/validation/test split with 60%/20%/20% distribution.
Use the train_test_split function and set the random_state parameter to 1.
Use DictVectorizer(sparse=True) to turn the dataframes into matrices.
"""

# Identify columns with null values
null_cols = df.columns[df.isnull().any()]

# Replace NaN values with 0.0 in only the columns with null values
df[null_cols] = df[null_cols].fillna(0.0)

# Show the modified DataFrame
print(df.isnull().sum())

engine_displacement    0
num_cylinders          0
horsepower             0
vehicle_weight         0
acceleration           0
model_year             0
origin                 0
fuel_type              0
drivetrain             0
num_doors              0
fuel_efficiency_mpg    0
dtype: int64


In [4]:
# split dataframe in df_train, df_test, df_validation

seed = 1
d_test = 0.2
d_val = 0.2

# drop 'fuel_efficiency_mpg' col from df
df_full_train, df_test = train_test_split(df.drop(columns='fuel_efficiency_mpg'), test_size=d_test, random_state=seed)
df_train, df_val = train_test_split(df_full_train, test_size=d_val, random_state=seed)

print(f'Shape of full training set: {df_full_train.shape}')
print(f'Shape of validation set: {df_val.shape}')
print(f'Shape of test set: {df_test.shape}')
df_train

Shape of full training set: (7763, 10)
Shape of validation set: (1553, 10)
Shape of test set: (1941, 10)


Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors
5761,200,6.0,0.0,2998.363227,16.3,2009,USA,Gasoline,All-wheel drive,1.0
8535,210,6.0,175.0,3597.492185,20.3,2005,Europe,Diesel,All-wheel drive,-1.0
7721,170,4.0,159.0,3833.088826,15.1,2007,Europe,Diesel,All-wheel drive,1.0
1877,90,9.0,144.0,3258.765440,14.7,2021,USA,Diesel,All-wheel drive,2.0
733,280,4.0,134.0,3516.878216,14.9,2003,Asia,Gasoline,All-wheel drive,1.0
...,...,...,...,...,...,...,...,...,...,...
6222,230,3.0,176.0,3430.993044,17.9,2022,Europe,Diesel,All-wheel drive,0.0
5999,250,4.0,180.0,3067.664350,15.7,2010,Asia,Diesel,All-wheel drive,-1.0
3318,230,2.0,182.0,3041.964593,16.7,2010,Europe,Diesel,All-wheel drive,0.0
4866,180,7.0,147.0,2453.341430,15.2,2015,Europe,Gasoline,All-wheel drive,0.0


## Question 1

In [5]:
# transform the df_train, df_validation, df_test to X_train, X_val, X_test & y_train, y_val, y_test

numerical = df_train.select_dtypes(include="number").columns.to_list()
categorical = df_train.select_dtypes(exclude="number").columns.to_list()
features = categorical + numerical

train_dict = df_train[features].to_dict(orient="records")
val_dict = df_val[features].to_dict(orient="records")
test_dict = df_test[features].to_dict(orient="records")

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(X=train_dict)
X_val = dv.transform(X=val_dict)
X_test = dv.transform(X=test_dict)

y_train = df.fuel_efficiency_mpg[df_train.index].to_numpy()
y_val = df.fuel_efficiency_mpg[df_val.index].to_numpy()
y_test = df.fuel_efficiency_mpg[df_test.index].to_numpy()

In [9]:
model = DecisionTreeRegressor(max_depth=1, random_state=1)
x = model.fit(X_train, y_train)

feature_importance = model.feature_importances_
print(feature_importance)
important_feature_index = feature_importance.argmax()
important_feature = dv.feature_names_[important_feature_index]

leaf_values = model.apply(X_train)
print(f'Leaf Values:{leaf_values}')

unique_leaf_indices = np.unique(leaf_values)

print(f'The feature used for splitting the data is: {important_feature}\n')
print('Leaf node values:')

for index in unique_leaf_indices:
    leaf_value = y_train[leaf_values == index].mean()
    count_in_leaf = np.sum(leaf_values == index)
    print(f'Value at leaf node {index}: {leaf_value:.2f}, Count: {count_in_leaf}')

print("\nFeature Importances:")
for name, importance in zip(dv.feature_names_, feature_importance):
    print(f"Feature: {name}, Importance: {importance:.2f}")

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
Leaf Values:[1 2 2 ... 2 1 1]
The feature used for splitting the data is: vehicle_weight

Leaf node values:
Value at leaf node 1: 16.87, Count: 3248
Value at leaf node 2: 12.92, Count: 2962

Feature Importances:
Feature: acceleration, Importance: 0.00
Feature: drivetrain=All-wheel drive, Importance: 0.00
Feature: drivetrain=Front-wheel drive, Importance: 0.00
Feature: engine_displacement, Importance: 0.00
Feature: fuel_type=Diesel, Importance: 0.00
Feature: fuel_type=Gasoline, Importance: 0.00
Feature: horsepower, Importance: 0.00
Feature: model_year, Importance: 0.00
Feature: num_cylinders, Importance: 0.00
Feature: num_doors, Importance: 0.00
Feature: origin=Asia, Importance: 0.00
Feature: origin=Europe, Importance: 0.00
Feature: origin=USA, Importance: 0.00
Feature: vehicle_weight, Importance: 1.00


## Question 2 

In [10]:
"""
Train a random forest regressor with these parameters:
n_estimators=10
random_state=1
n_jobs=-1 (optional - to make training faster)
What's the RMSE of this model on the validation data?
"""

rf_model = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f'RMSE of the Random Forest model on validation data: {rmse:.3f}')

RMSE of the Random Forest model on validation data: 0.464


## Question 3

In [16]:
null_rmse = 1
rmse_values = []
n_estimators_values = range(10, 201, 10)

for n_estimators in n_estimators_values:
    rf_model = RandomForestRegressor(n_estimators=n_estimators, random_state=1, n_jobs=-1)
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_values.append(rmse)

    print(f'RMSE of the Random Forest model with n_estimators={n_estimators}: {rmse:.3f}')

RMSE of the Random Forest model with n_estimators=10: 0.464
RMSE of the Random Forest model with n_estimators=20: 0.453
RMSE of the Random Forest model with n_estimators=30: 0.450
RMSE of the Random Forest model with n_estimators=40: 0.446
RMSE of the Random Forest model with n_estimators=50: 0.444
RMSE of the Random Forest model with n_estimators=60: 0.443
RMSE of the Random Forest model with n_estimators=70: 0.442
RMSE of the Random Forest model with n_estimators=80: 0.442
RMSE of the Random Forest model with n_estimators=90: 0.441
RMSE of the Random Forest model with n_estimators=100: 0.441
RMSE of the Random Forest model with n_estimators=110: 0.441
RMSE of the Random Forest model with n_estimators=120: 0.440
RMSE of the Random Forest model with n_estimators=130: 0.440
RMSE of the Random Forest model with n_estimators=140: 0.440
RMSE of the Random Forest model with n_estimators=150: 0.440
RMSE of the Random Forest model with n_estimators=160: 0.440
RMSE of the Random Forest model w

## Question 4

In [31]:
"""
Let's select the best max_depth:

Try different values of max_depth: [10, 15, 20, 25]
For each of these values,
try different values of n_estimators from 10 till 200 (with step 10)
calculate the mean RMSE
Fix the random seed: random_state=1
What's the best max_depth, using the mean RMSE?
"""

max_depth_values = [10, 15, 20, 25]
n_estimators_values = range(10, 201, 10)
mean_rmse_val = {}

for depth in max_depth_values:
    rmse_values = []
    for n_estimators in n_estimators_values:
        rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=depth, random_state=1, n_jobs=-1)
        rf_model.fit(X_train, y_train)
        y_pred = rf_model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_values.append(rmse)
        print(f'RMSE of the Random Forest model with max_depth={max_depth} and n_estimators={n_estimators}: {rmse:.3f}')
    mean_rmse_val[depth] = np.mean(rmse_values)
    
print(mean_rmse_val)

min_rmse_mean = min(mean_rmse_val.values())
min_rmse_mean_depth = [k for k, v in mean_rmse_val.items() if v == min_rmse_mean][0]
print(f'Depth: {min_rmse_mean_depth}, Mean: {min_rmse_mean}')

RMSE of the Random Forest model with max_depth=20 and n_estimators=10: 0.453
RMSE of the Random Forest model with max_depth=20 and n_estimators=20: 0.446
RMSE of the Random Forest model with max_depth=20 and n_estimators=30: 0.445
RMSE of the Random Forest model with max_depth=20 and n_estimators=40: 0.441
RMSE of the Random Forest model with max_depth=20 and n_estimators=50: 0.440
RMSE of the Random Forest model with max_depth=20 and n_estimators=60: 0.440
RMSE of the Random Forest model with max_depth=20 and n_estimators=70: 0.439
RMSE of the Random Forest model with max_depth=20 and n_estimators=80: 0.439
RMSE of the Random Forest model with max_depth=20 and n_estimators=90: 0.439
RMSE of the Random Forest model with max_depth=20 and n_estimators=100: 0.439
RMSE of the Random Forest model with max_depth=20 and n_estimators=110: 0.439
RMSE of the Random Forest model with max_depth=20 and n_estimators=120: 0.438
RMSE of the Random Forest model with max_depth=20 and n_estimators=130: 0

## Question 5

In [15]:
model = RandomForestRegressor(max_depth=20, random_state=1, n_estimators=10, n_jobs=-1)
x = model.fit(X_train, y_train)

feature_importance = model.feature_importances_
print(feature_importance)
important_feature_index = feature_importance.argmax()
important_feature = dv.feature_names_[important_feature_index]

leaf_values = model.apply(X_train)
print(leaf_values)
unique_leaf_indices = np.unique(leaf_values)

print(f'The feature used for splitting the data is: {important_feature}\n')
print('Leaf node values:')

for t, tree_leaf_values in enumerate(leaf_values.T):  # transpose to loop over trees
    unique_indices = np.unique(tree_leaf_values)
    print(f"Tree {t}:")
    for index in unique_indices:
        mask = tree_leaf_values == index
        leaf_value = y_train[mask].mean()
        count_in_leaf = mask.sum()
        #print(f"  Leaf {index}: mean={leaf_value:.2f}, count={count_in_leaf}")

print("\nFeature Importances:")
for name, importance in zip(dv.feature_names_, feature_importance):
    print(f"Feature: {name}, Importance: {importance:.4f}")

[1.13817007e-02 3.31956835e-04 3.08107338e-04 3.38012383e-03
 3.89811572e-04 3.38835821e-04 1.59046012e-02 3.25304356e-03
 2.21751325e-03 1.45495304e-03 4.83540221e-04 4.98309628e-04
 5.34126192e-04 9.59523377e-01]
[[3609 3336 3647 ... 3660 3508 3699]
 [7118 6782 7121 ... 7082 7109 6944]
 [7476 7350 7391 ... 7372 7470 7324]
 ...
 [4580 4661 4684 ... 4779 4630 4621]
 [1065  880 1060 ...  947  956 1059]
 [2634 2490 2739 ... 2660 2417 2557]]
The feature used for splitting the data is: vehicle_weight

Leaf node values:
Tree 0:
Tree 1:
Tree 2:
Tree 3:
Tree 4:
Tree 5:
Tree 6:
Tree 7:
Tree 8:
Tree 9:

Feature Importances:
Feature: acceleration, Importance: 0.0114
Feature: drivetrain=All-wheel drive, Importance: 0.0003
Feature: drivetrain=Front-wheel drive, Importance: 0.0003
Feature: engine_displacement, Importance: 0.0034
Feature: fuel_type=Diesel, Importance: 0.0004
Feature: fuel_type=Gasoline, Importance: 0.0003
Feature: horsepower, Importance: 0.0159
Feature: model_year, Importance: 0.003

In [21]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

watchlist = [(dtrain, 'train'), (dval, 'eval')]

xgb_params = {
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

xgb_params['eta'] = 0.3
model_eta_03 = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)

val_predictions_eta_03 = model_eta_03.predict(dval)
rmse_eta_03 = np.sqrt(mean_squared_error(y_val, val_predictions_eta_03))
print(f'RMSE for eta=0.3: {rmse_eta_03:.3f}')

xgb_params['eta'] = 0.1
model_eta_01 = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10)

val_predictions_eta_01 = model_eta_01.predict(dval)
rmse_eta_01 = np.sqrt(mean_squared_error(y_val, val_predictions_eta_01))
print(f'RMSE for eta=0.1: {rmse_eta_01:.3f}')

if rmse_eta_03 < rmse_eta_01:
    best_eta = 0.3
elif rmse_eta_03 > rmse_eta_01:
    best_eta = 0.1
else:
    best_eta = 'Both give equal value'

print(f'The best eta leading to the lowest RMSE score on the validation dataset is: {best_eta}')

[0]	train-rmse:1.81545	eval-rmse:1.85585
[1]	train-rmse:1.32075	eval-rmse:1.35264
[2]	train-rmse:0.98228	eval-rmse:1.00911
[3]	train-rmse:0.75512	eval-rmse:0.78019
[4]	train-rmse:0.60599	eval-rmse:0.63486
[5]	train-rmse:0.51184	eval-rmse:0.54727
[6]	train-rmse:0.45383	eval-rmse:0.49521
[7]	train-rmse:0.41731	eval-rmse:0.46612
[8]	train-rmse:0.39531	eval-rmse:0.45269
[9]	train-rmse:0.38039	eval-rmse:0.44429
[10]	train-rmse:0.37099	eval-rmse:0.43919
[11]	train-rmse:0.36433	eval-rmse:0.43682
[12]	train-rmse:0.35934	eval-rmse:0.43512
[13]	train-rmse:0.35520	eval-rmse:0.43571
[14]	train-rmse:0.35270	eval-rmse:0.43550
[15]	train-rmse:0.34911	eval-rmse:0.43573
[16]	train-rmse:0.34569	eval-rmse:0.43553
[17]	train-rmse:0.34368	eval-rmse:0.43552
[18]	train-rmse:0.34130	eval-rmse:0.43528
[19]	train-rmse:0.33786	eval-rmse:0.43606
[20]	train-rmse:0.33512	eval-rmse:0.43610
[21]	train-rmse:0.33364	eval-rmse:0.43642
[22]	train-rmse:0.33198	eval-rmse:0.43654
RMSE for eta=0.3: 0.437
[0]	train-rmse:2.291