In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
import xgboost as xgb
from sklearn.tree import export_text

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
%load_ext blackcellmagic

In [None]:
def rmse(y, y_pred):
    """Return the root-mean-squared error between `y` and `y_pred`.

    Both arguments must support element-wise subtraction, `**` and `.mean()`
    (e.g. NumPy arrays of the same shape).
    """
    residuals = y - y_pred
    return np.sqrt((residuals ** 2).mean())

In [None]:
# Download the car fuel-efficiency CSV; every column of the file is kept.
data_url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
df = pd.read_csv(data_url)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Replace missing values in these four columns with 0.0.
zero_filled_cols = ["num_cylinders", "horsepower", "acceleration", "num_doors"]
df[zero_filled_cols] = df[zero_filled_cols].fillna(0.0)

In [None]:
# Per-column count of missing values still present in `df`.
df.isnull().sum()

In [None]:
# 60/20/20 split: 20% is held out for test, then 25% of the remaining 80%
# becomes the validation set. Both splits use the same fixed seed.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Discard the shuffled original index on each partition.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Move the target out of each feature frame: `pop` returns the column and
# removes it from the frame in one step.
target_col = "fuel_efficiency_mpg"
y_train, y_val, y_test = (
    part.pop(target_col).values for part in (df_train, df_val, df_test)
)

In [None]:
# Fit the vectorizer on the training rows only and build a dense
# (sparse=False) feature matrix from their per-row dicts.
dv = DictVectorizer(sparse=False)
train_dicts = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

In [None]:
# Depth-1 regression tree. `fit` returns the estimator itself, so ending the
# cell on `dt` displays the fitted model.
dt = DecisionTreeRegressor(max_depth=1).fit(X_train, y_train)
dt

In [None]:
# Text rendering of the fitted tree, labelled with the vectorizer's feature names.
tree_rules = export_text(dt, feature_names=list(dv.get_feature_names_out()))
print(tree_rules)


In [None]:
dt = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
dt.fit(X_train, y_train)

In [None]:
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_pred = dt.predict(X_val)
rmse_val = rmse(y_val, y_pred)
rmse_val

In [None]:
# Exploratory: ten evenly spaced points from 10 to 200. The result is only
# displayed, not assigned to anything.
np.linspace(10, 200, num=10)

In [None]:
# Forest sizes to evaluate: 10, 20, ..., 200.
n_estimators = [10 * step for step in range(1, 21)]

In [None]:
# Validation RMSE (rounded to 3 decimals) for each forest size in `n_estimators`.
n_estimator_rmse_dict = {}
for estimator in n_estimators:
    # A fresh forest per size, always with the same seed.
    rf = RandomForestRegressor(n_estimators=estimator, random_state=1, n_jobs=-1)
    rf = rf.fit(X_train, y_train)

    y_pred = rf.predict(X_val)
    rmse_val = round(rmse(y_val, y_pred), 3)

    n_estimator_rmse_dict[estimator] = rmse_val
    print(f"n_estimator:{estimator}, rmse:{rmse_val}")

In [None]:
depths = [10, 15, 20, 25]

# Mean validation RMSE per max_depth, kept so the depths can be compared
# after the loop instead of only read off the printed lines.
mean_rmse_by_depth = {}

for depth in depths:

    # Collect full-precision errors: rounding each value before averaging
    # would add rounding noise (up to 0.0005 per term) to the mean that is
    # used to compare depths.
    rmse_vals = []
    for estimator in n_estimators:

        rf = RandomForestRegressor(
            n_estimators=estimator, random_state=1, n_jobs=-1, max_depth=depth
        )
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_val)
        rmse_val = rmse(y_val, y_pred)
        rmse_vals.append(rmse_val)

    mean_rmse = np.mean(rmse_vals)
    mean_rmse_by_depth[depth] = mean_rmse
    print(f"max_depth:{depth}, mean rmse:{mean_rmse}")

In [None]:
# Forest whose feature importances are inspected next. `fit` returns the
# estimator itself, so ending the cell on `rf` displays the fitted model.
rf = RandomForestRegressor(
    n_estimators=10, max_depth=20, random_state=1, n_jobs=-1
).fit(X_train, y_train)
rf

In [None]:
# Per-feature importance scores of the fitted forest `rf`.
importances = rf.feature_importances_

In [None]:
# Feature names as reported by the fitted vectorizer.
features = [name for name in dv.get_feature_names_out()]

In [None]:
# Pair each feature name with its importance and rank from highest to lowest.
importance_columns = {'Feature': features, 'Importance': importances}
feature_importance_df = pd.DataFrame(importance_columns).sort_values(
    by='Importance', ascending=False
)

In [None]:
# Display the importance table (sorted by 'Importance', descending).
feature_importance_df

In [None]:
features = list(dv.get_feature_names_out())
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

In [None]:
model = xgb.train(xgb_params, dtrain, num_boost_round=100)

In [None]:
y_pred = model.predict(dval)

In [None]:
error_val = rmse(y_val, y_pred)
error_val

In [None]:
features = list(dv.get_feature_names_out())
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)
xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=100)
y_pred = model.predict(dval)

error_val = rmse(y_val, y_pred)
error_val