In [1]:
import pandas as pd
import numpy as np


In [None]:
# Read the CSV at ../data/train80.csv into the working DataFrame.
train_csv_path = '../data/train80.csv'
df = pd.read_csv(train_csv_path)

In [3]:
# Show the loaded frame as the cell output for a quick visual check.
df

Unnamed: 0,VIN (1-10),County,City,State,Postal Code,Model Year,Make,Model,Electric Vehicle Type,Clean Alternative Fuel Vehicle (CAFV) Eligibility,Electric Range,Base MSRP,Legislative District,DOL Vehicle ID,Vehicle Location,Electric Utility,2020 Census Tract
0,2523,16,27,0,98005.0,2014,41,128,0,0,103.0,0.0,41.0,186450183,261,73,5.303302e+10
1,4230,16,35,0,98011.0,2019,39,97,0,0,220.0,0.0,1.0,478093654,278,73,5.303302e+10
2,3946,33,298,0,98502.0,2025,5,163,1,0,40.0,0.0,35.0,274800718,501,72,5.306701e+10
3,6903,33,298,0,98513.0,2024,41,129,1,0,42.0,0.0,2.0,260758165,480,72,5.306701e+10
4,5280,38,382,0,98942.0,2021,39,100,0,1,0.0,0.0,15.0,236581355,178,64,5.307700e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231710,4111,16,351,0,98058.0,2019,39,97,0,0,220.0,0.0,47.0,262629839,248,73,5.303303e+10
231711,1539,30,395,0,98290.0,2024,15,74,0,1,0.0,0.0,39.0,261119043,250,72,5.306105e+10
231712,5974,16,350,0,98033.0,2024,39,100,0,1,0.0,0.0,45.0,261726258,280,73,5.303302e+10
231713,7041,3,214,0,98826.0,2023,41,129,1,0,42.0,0.0,12.0,236639826,184,66,5.300796e+10


In [4]:
from sklearn.model_selection import train_test_split


In [5]:
# Columns removed from the frame before modelling.
columns_to_drop = [
    "Legislative District",
    "Vehicle Location",
    "Postal Code",
    "City",
    "2020 Census Tract",
    "County",
    "Electric Utility",
]

# Remove irrelevant columns.
# errors="ignore" keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
df = df.drop(columns=columns_to_drop, errors="ignore")

# Keep only records with a positive Electric Range (rows with 0 are discarded).
df = df[df["Electric Range"] > 0]

# Define features and target.
# NOTE(review): "DOL Vehicle ID" looks like a per-vehicle identifier rather
# than a predictive feature - confirm whether it should stay in X. It is left
# in place here so the feature layout seen by the model does not change.
X = df.drop(columns=["Electric Range"])
y = df["Electric Range"]

# Split BEFORE imputing: the fill values must come from the training rows
# only, otherwise statistics of the held-out rows leak into the features the
# model is trained on. The split itself is unchanged (same rows, same seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values in both partitions with the training-set column means.
train_means = X_train.mean(numeric_only=True)
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Print the shapes to verify the split
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (76082, 9)
X_test shape: (19021, 9)
y_train shape: (76082,)
y_test shape: (19021,)


In [6]:
from xgboost import XGBRegressor

# Baseline: an XGBRegressor constructed with no explicit hyperparameters and
# fitted on the training split. It serves as a reference point for the tuned
# model built later in the notebook.
my_model = XGBRegressor()
my_model.fit(X_train, y_train, verbose=False)

In [7]:
from sklearn.metrics import mean_absolute_error

# Score the baseline model on the held-out split.
predictions = my_model.predict(X_test)

# scikit-learn metrics are declared as metric(y_true, y_pred); pass the
# ground truth first so the call matches the documented signature and the
# RMSE computation later in the notebook.
print("Mean Absolute Error : " + str(mean_absolute_error(y_test, predictions)))

Mean Absolute Error : 0.13271688684217123


In [8]:
from sklearn.metrics import mean_squared_error

# Baseline mean squared error on the held-out split. The label names the
# metric that is actually computed (squared error, not absolute error), and
# the arguments follow scikit-learn's (y_true, y_pred) order.
print("Mean Squared Error : " + str(mean_squared_error(y_test, predictions)))

Mean Absolute Error : 0.41670959148605025


In [12]:
# Base estimator handed to the hyperparameter search: squared-error objective,
# gbtree booster, histogram tree method, running on the CUDA device.
reg = XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',
    tree_method="hist",
    device="cuda",
    random_state=0,
)

In [13]:
from skopt.space import Real, Integer

# Hyperparameter ranges explored by the search. Real-valued dimensions state
# their uniform prior explicitly; integer dimensions only give their bounds.
search_spaces = dict(
    learning_rate=Real(0.01, 1.0, prior='uniform'),
    max_depth=Integer(2, 12),
    subsample=Real(0.1, 1.0, prior='uniform'),
    colsample_bytree=Real(0.1, 1.0, prior='uniform'),  # subsample ratio of columns by tree
    reg_lambda=Real(1e-9, 100., prior='uniform'),      # L2 regularization
    reg_alpha=Real(1e-9, 100., prior='uniform'),       # L1 regularization
    n_estimators=Integer(50, 5000),
)

In [14]:
from skopt import BayesSearchCV

In [15]:
# Wrap the base estimator and the search space in the Bayesian optimizer.
# `iid` is deliberately not passed: scikit-optimize deprecated the parameter
# and ignores its value, so supplying it only triggers a deprecation warning.
opt = BayesSearchCV(
    estimator=reg,
    search_spaces=search_spaces,
    scoring="neg_mean_absolute_error",          # MAE is negated so that higher is better
    cv=10,                                      # number of cross-validation folds
    n_iter=120,                                 # max number of trials
    n_points=1,                                 # number of hyperparameter sets evaluated at the same time
    n_jobs=1,                                   # number of jobs
    return_train_score=False,
    refit=False,                                # no automatic refit; the final model is built and fitted in later cells
    optimizer_kwargs={'base_estimator': 'GP'},  # optimizer parameters: we use Gaussian Process (GP)
    random_state=0,                             # random state for replicability
)



In [16]:
# Run the Bayesian search on the training split, then report the winning
# hyperparameter combination.
opt.fit(X_train, y_train)
best_params = opt.best_params_
print("Best hyperparameters found: ", best_params)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Best hyperparameters found:  OrderedDict([('colsample_bytree', 1.0), ('learning_rate', 0.660977898839114), ('max_depth', 7), ('n_estimators', 5000), ('reg_alpha', 1e-09), ('reg_lambda', 1e-09), ('subsample', 1.0)])


In [17]:
# Best cross-validated score found by the search. The optimizer was configured
# with scoring="neg_mean_absolute_error", so this value is the negated MAE
# (closer to zero is better).
opt.best_score_

-0.07989317732505594

In [18]:
# Final model: the same fixed settings as the search's base estimator plus the
# hyperparameters selected by the search. They are taken from
# `opt.best_params_` instead of being copied by hand from printed output, so
# this cell cannot go stale when the search space, data or seed changes.
# For reference, the recorded run selected: colsample_bytree=1.0,
# learning_rate=0.660977898839114, max_depth=7, n_estimators=5000,
# reg_alpha=1e-09, reg_lambda=1e-09, subsample=1.0.
best_reg = XGBRegressor(
    random_state=0,
    booster='gbtree',
    objective='reg:squarederror',
    tree_method="hist",
    device="cuda",
    **opt.best_params_,
)

In [19]:
# Fit the tuned model on the full training split (the search above ran with
# refit=False, so no fitted estimator is available from it).
best_reg.fit(X_train, y_train)

In [20]:
# Predict on the held-out split with the tuned model; this rebinds
# `predictions`, which previously held the baseline model's output.
predictions = best_reg.predict(X_test)

In [22]:
from sklearn.metrics import r2_score, explained_variance_score
import numpy as np

# Evaluate the tuned model on the held-out split.
# scikit-learn metrics are declared as metric(y_true, y_pred); every call
# below passes the ground truth first so the cell is internally consistent.
# NOTE(review): r2_score and explained_variance_score are imported but not
# reported in this cell.
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print("Mean Squared Error : " + str(mse))
print("Mean Absolute Error : " + str(mae))

# RMSE is the square root of the MSE computed above; no second pass needed.
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse:.4f}")

# Work on a positional NumPy view of the target so that it lines up with
# `predictions` element by element, independent of the Series' index labels.
y_true = np.asarray(y_test)

# MAPE is undefined where the true value is 0, so those rows are masked out.
mask = y_true != 0
if mask.any():
    mape = np.mean(np.abs((y_true[mask] - predictions[mask]) / y_true[mask])) * 100
    print(f"Mean Absolute Percentage Error: {mape:.2f}%")
else:
    print("Mean Absolute Percentage Error: Cannot calculate (division by zero)")


Mean Squared Error : 0.5132475831405929
Mean Absolute Error : 0.0737867736896627
Root Mean Squared Error: 0.7164
Mean Absolute Percentage Error: 0.10%


In [23]:
import pickle

In [24]:
# Serialize the fitted tuned model to disk with pickle.
model_path = "../pickle/XGB_model.pkl"
with open(model_path, mode="wb") as model_file:
    pickle.dump(best_reg, model_file)