## Data preparation

In [1]:
import time

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

# Tunable preprocessing settings
TOP_N_CATEGORIES = 5     # categories kept per categorical feature; the rest become 'other'
SAMPLE_FRACTION = 0.05   # share of the encoded rows kept for modelling
TEST_SIZE = 0.2          # share of the sample held out for testing
RANDOM_STATE = 42        # seed shared by the sampling and the train/test split

# Load the dataset
file_path = '/datasets/car_data.csv'
car_data = pd.read_csv(file_path)

categorical_columns = ['VehicleType', 'Gearbox', 'Model', 'FuelType', 'NotRepaired']

# Handle missing values, then limit the number of unique values per categorical feature.
# The result is assigned back to the column instead of calling
# `car_data[column].fillna(..., inplace=True)`: in-place mutation through a column
# selection is deprecated in newer pandas and is a no-op under copy-on-write.
for column in categorical_columns:
    filled = car_data[column].fillna('unknown')
    top_values = filled.value_counts().index[:TOP_N_CATEGORIES]
    # Vectorised equivalent of "keep x if it is a top value, else 'other'"
    car_data[column] = filled.where(filled.isin(top_values), 'other')

# Select a subset of relevant features (including a limited number of numerical features)
selected_columns = ['Price', 'VehicleType', 'RegistrationYear', 'Gearbox', 'Power', 'Model', 'Mileage', 'FuelType', 'NotRepaired']
car_data_reduced = car_data[selected_columns]

# Apply OneHotEncoding to categorical columns.
# The dense-output keyword was renamed from `sparse` to `sparse_output` and
# `get_feature_names` was replaced by `get_feature_names_out` in later scikit-learn
# releases, so both spellings are supported here.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')
encoded_columns = encoder.fit_transform(car_data_reduced[categorical_columns])
feature_names_fn = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=feature_names_fn(categorical_columns),
    # Reuse the source index so the column-wise concat below aligns row for row
    index=car_data_reduced.index,
)
car_data_encoded = pd.concat(
    [car_data_reduced.drop(columns=categorical_columns), encoded_df],
    axis=1,
)

# Ensure the number of columns is below 200
print(f"Number of columns: {car_data_encoded.shape[1]}")
assert car_data_encoded.shape[1] < 200, "too many columns after one-hot encoding"

# Sample a smaller dataset (use 5% of the data)
car_data_sampled = car_data_encoded.sample(frac=SAMPLE_FRACTION, random_state=RANDOM_STATE)

# Define features and target
X = car_data_sampled.drop(['Price'], axis=1)
y = car_data_sampled['Price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

Number of columns: 22


## Model training

In [2]:
# Report the dimensions of each train/test piece, in the order
# X_train, X_test, y_train, y_test
print(*(split.shape for split in (X_train, X_test, y_train, y_test)))

(14174, 21) (3544, 21) (14174,) (3544,)


In [3]:
# Baseline model: fit a linear regression on the training split and
# score it by RMSE on the held-out test split.
linear_model = LinearRegression().fit(X_train, y_train)
y_pred = linear_model.predict(X_test)
rmse = np.sqrt(
    mean_squared_error(y_test, y_pred)
)
print(f'Linear Regression RMSE: {rmse}')



Linear Regression RMSE: 3589.0843233203823


In [4]:
# Gradient-boosted baseline: LightGBM with its default hyperparameters and a
# fixed seed, scored by RMSE on the held-out test split.
lgb_model = lgb.LGBMRegressor(random_state=42).fit(X_train, y_train)
y_pred_lgb = lgb_model.predict(X_test)
rmse_lgb = np.sqrt(
    mean_squared_error(y_test, y_pred_lgb)
)
print(f'LightGBM RMSE: {rmse_lgb}')

LightGBM RMSE: 2190.224815559407


## Tuning hyperparameters

In [5]:
# Define the parameter grid for LightGBM
param_grid = {
    'num_leaves': [31, 50],
    'learning_rate': [0.1, 0.01],
    'n_estimators': [100, 200]
}

# 3-fold cross-validated grid search over the grid above.
# The base estimator is passed inline so the already-fitted `lgb_model` from the
# baseline cell is not overwritten by an unfitted one.
grid_search = GridSearchCV(
    estimator=lgb.LGBMRegressor(random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters and the best cross-validated score.
# `best_score_` is the mean negated MSE over the folds, so negate it before the root.
best_params = grid_search.best_params_
best_rmse = np.sqrt(-grid_search.best_score_)

print(f'Best parameters: {best_params}')
print(f'Best RMSE: {best_rmse}')

# With the default refit=True, GridSearchCV has already retrained the best
# configuration on the whole training split; reuse that model rather than
# fitting an identical one a second time.
lgb_model_best = grid_search.best_estimator_
y_pred_lgb_best = lgb_model_best.predict(X_test)
rmse_lgb_best = np.sqrt(mean_squared_error(y_test, y_pred_lgb_best))
print(f'Tuned LightGBM RMSE: {rmse_lgb_best}')


Best parameters: {'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31}
Best RMSE: 2151.051880210941
Tuned LightGBM RMSE: 2190.224815559407


## Model analysis

In [6]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit a model, predict on the test split and time both steps.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_train, y_train : training features and target
    X_test, y_test : held-out features and target

    Returns
    -------
    predictions : the model's predictions for ``X_test``
    metrics : dict with the keys 'RMSE', 'Training Time' and 'Prediction Time'
        (times are in seconds)
    """
    # perf_counter is a monotonic, high-resolution clock intended for timing
    # short durations; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start

    start = time.perf_counter()
    predictions = model.predict(X_test)
    prediction_time = time.perf_counter() - start

    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    metrics = {'RMSE': rmse, 'Training Time': training_time, 'Prediction Time': prediction_time}
    return predictions, metrics


# Fresh, unfitted models so the timings cover a complete fit
linear_model = LinearRegression()
lgb_model = lgb.LGBMRegressor(random_state=42)

# Initialize results dictionary
results = {}

# Linear Regression
y_pred, results['Linear Regression'] = evaluate_model(linear_model, X_train, y_train, X_test, y_test)
rmse = results['Linear Regression']['RMSE']

# LightGBM
y_pred_lgb, results['LightGBM'] = evaluate_model(lgb_model, X_train, y_train, X_test, y_test)
rmse_lgb = results['LightGBM']['RMSE']

# One summary line per model, in insertion order
for model_name, metrics in results.items():
    print(f"{model_name} - RMSE: {metrics['RMSE']}, Training Time: {metrics['Training Time']}, Prediction Time: {metrics['Prediction Time']}")

# Display results (one row per model)
results_df = pd.DataFrame(results).T
print(results_df)


Linear Regression - RMSE: 3589.0843233203823, Training Time: 0.020336627960205078, Prediction Time: 0.006487369537353516
LightGBM - RMSE: 2190.224815559407, Training Time: 0.5683648586273193, Prediction Time: 0.07948708534240723
                          RMSE  Training Time  Prediction Time
Linear Regression  3589.084323       0.020337         0.006487
LightGBM           2190.224816       0.568365         0.079487


## Conclusion

**Quality (RMSE)**

Linear Regression has a higher RMSE (3589.08) than LightGBM (2190.22), so LightGBM gives the better predictive performance. The grid search selected LightGBM's default hyperparameters, so the tuned model scores the same test RMSE (2190.22) as the untuned one.

**Speed** (measured on the 5% sample used throughout)

- Training time: Linear Regression is significantly faster to train (0.0203 seconds) than LightGBM (0.5684 seconds).
- Prediction time: Linear Regression also predicts faster (0.0065 seconds) than LightGBM (0.0795 seconds).

**Linear Regression**

- Pros: Very fast training and prediction times.
- Cons: Higher RMSE, indicating lower prediction quality.

**LightGBM**

- Pros: Lower RMSE, indicating higher prediction quality.
- Cons: Longer training and prediction times than Linear Regression, but still reasonable for many applications.


If prediction quality is the primary concern, LightGBM is the better choice due to its significantly lower RMSE.
If speed (both training and prediction) is critical and a substantially higher error is acceptable (Linear Regression's RMSE is roughly 64% higher than LightGBM's), Linear Regression might be preferred.