## Preparation

In [None]:
import pandas as pd
df = pd.read_csv('carfueldata.csv')

In [12]:
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [13]:
df = df.fillna(0)

In [14]:
y = df.fuel_efficiency_mpg.values


In [15]:
df = df.drop(columns=['fuel_efficiency_mpg'])

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
df_full_train, df_test, y_full_train, y_test = train_test_split(df, y, test_size=0.2, random_state=1)
df_train, df_val, y_train, y_val = train_test_split(df_full_train, y_full_train, test_size=0.25, random_state=1) 


In [19]:
from sklearn.feature_extraction import DictVectorizer

In [None]:
dv = DictVectorizer(sparse=True)

In [21]:
train_dicts = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

In [22]:
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [24]:
test_dicts = df_test.to_dict(orient='records')
X_test = dv.transform(test_dicts)

## Question 1

In [26]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text

dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train, y_train)

print(export_text(dt, feature_names=dv.get_feature_names_out()))

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



Answer: vehicle_weight

## Question 2

In [27]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [28]:
rf = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1
)

In [29]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)

In [30]:
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(rmse)

0.4595777223092726


Answer: 0.45

## Question 3

In [31]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [33]:
results = []

for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    results.append((n, rmse))

best_n, best_rmse = min(results, key=lambda x: x[1])
print("Best n_estimators:", best_n, "RMSE:", best_rmse)


Best n_estimators: 180 RMSE: 0.44236195357041347


Answer: 200 (closest one)

## Question 4

In [34]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

max_depth_list = [10, 15, 20, 25]

results = []

for d in max_depth_list:
    rmses = []
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(
            n_estimators=n,
            max_depth=d,
            random_state=1,
            n_jobs=-1
        )
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmses.append(rmse)
    results.append((d, np.mean(rmses)))

best_depth, best_mean_rmse = min(results, key=lambda x: x[1])
print("best max_depth:", best_depth, "mean RMSE:", best_mean_rmse)
print(results)


best max_depth: 10 mean RMSE: 0.4418078609323356
[(10, np.float64(0.4418078609323356)), (15, np.float64(0.4454166445638107)), (20, np.float64(0.44625292424422536)), (25, np.float64(0.44590993626161624))]


Answer: 10

## Question 5

In [35]:
rf = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1
)
rf.fit(X_train, y_train)

importances = rf.feature_importances_
feat_names = dv.get_feature_names_out()

# make a sorted list descending
sorted(list(zip(feat_names, importances)), key=lambda x: x[1], reverse=True)[:10]


[('vehicle_weight', np.float64(0.9591499647407432)),
 ('horsepower', np.float64(0.015997897714266237)),
 ('acceleration', np.float64(0.01147970063142936)),
 ('engine_displacement', np.float64(0.003272791913609506)),
 ('model_year', np.float64(0.0032123000947946716)),
 ('num_cylinders', np.float64(0.0023433469524512004)),
 ('num_doors', np.float64(0.001634989543930702)),
 ('origin=USA', np.float64(0.0005397216891829172)),
 ('origin=Europe', np.float64(0.000518739638586969)),
 ('origin=Asia', np.float64(0.0004622464955097426))]

Answer: vehicle_weight

## Question 6

In [37]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np

dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=dv.get_feature_names_out().tolist())
dval   = xgb.DMatrix(X_val,   label=y_val,   feature_names=dv.get_feature_names_out().tolist())
watchlist = [(dtrain, 'train'), (dval, 'val')]

etas = [0.3, 0.1]

rmses = {}

for e in etas:
    xgb_params = {
        'eta': e,
        'max_depth': 6,
        'min_child_weight': 1,

        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        
        'nthread': 8,
        'seed': 1,
        'verbosity': 1
    }

    model = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist)
    y_pred = model.predict(dval)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmses[e] = rmse
    print("eta:", e, " RMSE:", rmse)

best_eta = min(rmses, key=rmses.get)
print("\nBest ETA is:", best_eta)


[0]	train-rmse:1.81393	val-rmse:1.85444
[1]	train-rmse:1.31919	val-rmse:1.35353
[2]	train-rmse:0.98120	val-rmse:1.01316
[3]	train-rmse:0.75443	val-rmse:0.78667
[4]	train-rmse:0.60680	val-rmse:0.64318
[5]	train-rmse:0.51381	val-rmse:0.55664
[6]	train-rmse:0.45470	val-rmse:0.50321
[7]	train-rmse:0.41881	val-rmse:0.47254
[8]	train-rmse:0.39534	val-rmse:0.45509
[9]	train-rmse:0.38038	val-rmse:0.44564
[10]	train-rmse:0.37115	val-rmse:0.43896
[11]	train-rmse:0.36361	val-rmse:0.43594
[12]	train-rmse:0.35850	val-rmse:0.43558
[13]	train-rmse:0.35365	val-rmse:0.43394
[14]	train-rmse:0.35025	val-rmse:0.43349
[15]	train-rmse:0.34666	val-rmse:0.43362
[16]	train-rmse:0.34459	val-rmse:0.43378
[17]	train-rmse:0.34128	val-rmse:0.43405
[18]	train-rmse:0.33822	val-rmse:0.43391
[19]	train-rmse:0.33709	val-rmse:0.43374
[20]	train-rmse:0.33553	val-rmse:0.43376
[21]	train-rmse:0.33243	val-rmse:0.43453
[22]	train-rmse:0.33031	val-rmse:0.43510
[23]	train-rmse:0.32815	val-rmse:0.43601
[24]	train-rmse:0.32670	va

In [38]:
rmses

{0.3: np.float64(0.45017755678087246), 0.1: np.float64(0.42622800553359225)}

Answer: 0.1