In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the raw car fuel-efficiency table from disk and preview its first rows.
csv_path = '../car_fuel_efficiency.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [3]:
# Count the missing entries in every column and display the per-column totals.
missing_per_column = df.isnull().sum()
missing_per_column

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [4]:
# Column groups used by the imputation step: numeric columns (the target
# `fuel_efficiency_mpg` is listed here too) and string-valued columns.
numerical = [
    "engine_displacement",
    "num_cylinders",
    "horsepower",
    "vehicle_weight",
    "acceleration",
    "model_year",
    "num_doors",
    "fuel_efficiency_mpg",
]
categorical = [
    "origin",
    "fuel_type",
    "drivetrain",
]

In [5]:
# Impute missing entries column by column: 0 for every numerical column and
# the literal string 'NA' for every categorical column.
fill_values = {
    **dict.fromkeys(numerical, 0),
    **dict.fromkeys(categorical, 'NA'),
}
for column, filler in fill_values.items():
    df[column] = df[column].fillna(filler)

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 20% of the rows for testing, then take 25% of what remains for
# validation. Both splits use the same fixed seed.
split_seed = 1
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=split_seed,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=split_seed,
)

In [8]:
# Row counts of the train / validation / test partitions, in that order.
tuple(len(part) for part in (df_train, df_val, df_test))

(5822, 1941, 1941)

In [9]:
# Replace the shuffled row labels left by the split with a fresh 0..n-1 index
# on every partition, discarding the old labels.
df_train, df_val, df_test, df_full_train = (
    frame.reset_index(drop=True)
    for frame in (df_train, df_val, df_test, df_full_train)
)

In [10]:
# Move the target column out of each feature frame: `pop` returns the column
# and removes it from the frame in one step. `df_full_train` is left untouched.
y_train = df_train.pop('fuel_efficiency_mpg')
y_val = df_val.pop('fuel_efficiency_mpg')
y_test = df_test.pop('fuel_efficiency_mpg')

In [11]:
# Preview the first five training targets.
y_train.head(n=5)

0    15.301475
1    15.331215
2    15.336679
3    15.865850
4    18.102203
Name: fuel_efficiency_mpg, dtype: float64

In [12]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text

# Turn every training row into a feature dict and vectorize it into a dense
# matrix; string-valued columns become `name=value` indicator features.
train_dict = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

# Single-split tree (max_depth=1). The seed is pinned because scikit-learn
# permutes the candidate features at every split even with splitter='best',
# so equally good splits could otherwise be resolved differently between runs.
# The value 1 matches the seed used for the data splits and the forests.
dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [13]:
# Render the fitted tree as indented text, labelling each split with the
# vectorizer's feature names instead of column positions.
split_labels = dv.get_feature_names_out().tolist()
tree_rules = export_text(dt, feature_names=split_labels)
print(tree_rules)

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Baseline forest: 10 trees, fixed seed, all available cores (n_jobs=-1).
forest_settings = {
    'n_estimators': 10,
    'random_state': 1,
    'n_jobs': -1,
}
rf = RandomForestRegressor(**forest_settings)
rf.fit(X_train, y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [17]:
# Encode the validation rows with the already-fitted vectorizer `dv`
# (transform only, no refit), so the columns line up with X_train.
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

# Predictions of the current forest `rf` on the validation features.
y_pred = rf.predict(X_val)

In [18]:
from sklearn.metrics import root_mean_squared_error

# Root-mean-squared error of the current predictions against the validation targets.
baseline_rmse = root_mean_squared_error(y_val, y_pred)
print(baseline_rmse)

0.4599777557336148


In [19]:
# Sweep the forest size from 10 to 200 trees in steps of 10, refitting from
# scratch each time and collecting the validation RMSE in `rmse`.
rmse = []
for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    y_pred = rf.fit(X_train, y_train).predict(X_val)
    res = root_mean_squared_error(y_val, y_pred)
    rmse.append(res)
    print(f"n_estimators: {n}  RMSE: {res}")

n_estimators: 10  RMSE: 0.4599777557336148
n_estimators: 20  RMSE: 0.45365906507838477
n_estimators: 30  RMSE: 0.45074274602790043
n_estimators: 40  RMSE: 0.4480067936304668
n_estimators: 50  RMSE: 0.44615128055079933
n_estimators: 60  RMSE: 0.4452658337959235
n_estimators: 70  RMSE: 0.4446098249138531
n_estimators: 80  RMSE: 0.4448931980390689
n_estimators: 90  RMSE: 0.4447241129599526
n_estimators: 100  RMSE: 0.4443178455925074
n_estimators: 110  RMSE: 0.44313500906534525
n_estimators: 120  RMSE: 0.4435285723898764
n_estimators: 130  RMSE: 0.44336417807088435
n_estimators: 140  RMSE: 0.4431801001185647
n_estimators: 150  RMSE: 0.442909875717056
n_estimators: 160  RMSE: 0.4426293654180784
n_estimators: 170  RMSE: 0.4427157028806333
n_estimators: 180  RMSE: 0.4423616814462061
n_estimators: 190  RMSE: 0.4425785032007027
n_estimators: 200  RMSE: 0.442606853652302


In [21]:
# Grid over max_depth x n_estimators (10..200, step 10), scoring each forest by
# validation RMSE. The per-depth scores are summarised by their mean so the
# depths can be compared directly instead of by scanning every printed line.
mean_rmse_by_depth = {}
for depth in [10, 15, 20, 25]:
    rmse = []  # validation RMSE for each n_estimators at this depth
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(n_estimators=n,
                                   random_state=1,
                                   n_jobs=-1,
                                   max_depth=depth)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        res = root_mean_squared_error(y_val, y_pred)
        print(f"depth: {depth} n_estimators: {n}  RMSE: {res}")
        rmse.append(res)
    # Keep the summary before `rmse` is rebound for the next depth.
    mean_rmse_by_depth[depth] = sum(rmse) / len(rmse)
    print(f"depth: {depth} mean RMSE: {mean_rmse_by_depth[depth]}")

depth: 10 n_estimators: 10  RMSE: 0.45189498205368783
depth: 10 n_estimators: 20  RMSE: 0.44871930280196803
depth: 10 n_estimators: 30  RMSE: 0.4462248243410706
depth: 10 n_estimators: 40  RMSE: 0.44387694693462976
depth: 10 n_estimators: 50  RMSE: 0.44268232973644583
depth: 10 n_estimators: 60  RMSE: 0.44234995527764454
depth: 10 n_estimators: 70  RMSE: 0.44179313949076043
depth: 10 n_estimators: 80  RMSE: 0.44195965762740047
depth: 10 n_estimators: 90  RMSE: 0.4419668100600304
depth: 10 n_estimators: 100  RMSE: 0.44177674543409867
depth: 10 n_estimators: 110  RMSE: 0.4408469921594633
depth: 10 n_estimators: 120  RMSE: 0.44096669954703027
depth: 10 n_estimators: 130  RMSE: 0.44083255224366663
depth: 10 n_estimators: 140  RMSE: 0.4405115320481308
depth: 10 n_estimators: 150  RMSE: 0.440127581955975
depth: 10 n_estimators: 160  RMSE: 0.43999675953472894
depth: 10 n_estimators: 170  RMSE: 0.4401374684770232
depth: 10 n_estimators: 180  RMSE: 0.4398370341427043
depth: 10 n_estimators: 190

In [27]:
# Refit a 10-tree forest capped at depth 20, then rank the vectorized features
# by the importance scores the fitted forest exposes.
rf = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1,
    max_depth=20,
).fit(X_train, y_train)

importances = rf.feature_importances_
feature_names = dv.get_feature_names_out()
forest_importances = pd.Series(data=importances, index=feature_names)
ranked_importances = forest_importances.sort_values(ascending=False)
print(ranked_importances)

vehicle_weight                  0.959162
horsepower                      0.016040
acceleration                    0.011471
engine_displacement             0.003269
model_year                      0.003182
num_cylinders                   0.002359
num_doors                       0.001591
origin=USA                      0.000555
origin=Europe                   0.000520
origin=Asia                     0.000476
drivetrain=All-wheel drive      0.000382
fuel_type=Diesel                0.000344
fuel_type=Gasoline              0.000337
drivetrain=Front-wheel drive    0.000312
dtype: float64


In [28]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.28.9-py3-none-manylinux_2_18_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.1.2-py3-none-manylinux_2_28_x86_64.whl (115.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.9/115.9 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading nvidia_nccl_cu12-2.28.9-py3-none-manylinux_2_18_x86_64.whl (296.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.8/296.8 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [xgboost]m1/2[0m [xgboost]
[1A[2KSuccessfully installed nvidia-nccl-cu12-2.28.9 xgboost-3.1.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49

In [29]:
import xgboost as xgb

In [30]:

# Wrap the train / validation matrices and their targets in XGBoost DMatrix
# objects, carrying over the vectorizer's feature names.
features = [*dv.get_feature_names_out()]
dtrain = xgb.DMatrix(data=X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(data=X_val, label=y_val, feature_names=features)

In [37]:
# Booster settings for the eta=0.3 run, grouped as: tree shape / learning
# rate, then objective and threading, then seed and log level.
xgb_params = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

# Train for 10 boosting rounds on the training DMatrix.
model = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=10)

In [38]:
# Validation RMSE of the current booster `model`.
y_pred = model.predict(dval)
res = root_mean_squared_error(y_val, y_pred)
# Label the score with the eta stored in `xgb_params` rather than a hard-coded
# literal, so the label stays correct if the parameters are edited or the
# cells are re-run in a different order.
print(f"eta: {xgb_params['eta']} RMSE: {res}")

eta: 0.3 RMSE: 0.44563950318601825


In [39]:
# Booster settings for the eta=0.1 run; every other value is the same as in
# the eta=0.3 configuration.
xgb_params = dict(
    eta=0.1,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

# Train for 10 boosting rounds on the training DMatrix.
model = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=10)

In [40]:
# Validation RMSE of the current booster `model`.
y_pred = model.predict(dval)
res = root_mean_squared_error(y_val, y_pred)
# Label the score with the eta stored in `xgb_params` rather than a hard-coded
# literal, so the label stays correct if the parameters are edited or the
# cells are re-run in a different order.
print(f"eta: {xgb_params['eta']} RMSE: {res}")

eta: 0.1 RMSE: 1.0200885118810736
