In [1]:
# 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import r2_score


In [2]:
# 2. Load Dataset
vehicles_data = pd.read_csv("vehicles.csv")


In [14]:
# 2. Prepare features and target
X = vehicles_data.drop(["id", "url", "region_url", "price", "image_url", "description", "county", "VIN"], axis=1)
y = vehicles_data["price"]

# 3. Define imputers and features
median_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
mean_imputer = SimpleImputer(strategy="mean")

median_features = ["year"]
cat_features = ["fuel", "manufacturer", "model", "condition", "paint_color", "cylinders", "type", "title_status", "transmission", "drive", "size", "region"]
mean_features = ["odometer"]

# 4. Impute missing values with ColumnTransformer
imputer = ColumnTransformer([
    ("median_imputer", median_imputer, median_features),
    ("cat_imputer", cat_imputer, cat_features),
    ("mean_imputer", mean_imputer, mean_features)
])

filled_X = imputer.fit_transform(X)

# create dataframe to keep column names (important for ColumnTransformer next step)
columns = median_features + cat_features + mean_features
filled_df = pd.DataFrame(filled_X, columns=columns)

# 5. One-hot encode categorical features
one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=True)

transformer = ColumnTransformer([
    ("onehot", one_hot, cat_features)
], remainder="passthrough")

transformed_X = transformer.fit_transform(filled_df)



# 6. Split dataset
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.3, random_state=42)

# 7. Prepare LightGBM datasets (accepts sparse input)
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

# 8. LightGBM parameters (tweak as needed)
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'verbose': -1,
    'seed': 42
}

# 9. Train model

gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=20)]
)


# 10. Predict and evaluate
y_pred = gbm.predict(X_test)
score = r2_score(y_test, y_pred)

print(f"LightGBM R² Score: {score:.4f}")

Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	training's rmse: 6442.66	valid_1's rmse: 6434.15
LightGBM R² Score: 0.8008
