In [8]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from tqdm import tqdm

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from tqdm import tqdm

In [9]:
# Read the processed training CSV into a DataFrame and preview its first rows
train_data = pd.read_csv(filepath_or_buffer='../data/processed/train.csv')
train_data.head(n=5)

Unnamed: 0,brand,model,milage,fuel_type,ext_col,int_col,accident,log_price,age,horsepower,engine_size,num_cylinders,transmission_type,num_speeds
0,mini,cooper s base,213000,gasoline,yellow,gray,0,8.343078,17,172.0,1.6,4.0,automatic,7.0
1,lincoln,other,143250,gasoline,silver,beige,1,8.517193,22,252.0,3.9,8.0,automatic,7.0
2,chevrolet,other,136731,e85 flex fuel,blue,gray,0,9.539716,22,320.0,5.3,8.0,automatic,7.0
3,genesis,other,19500,gasoline,black,black,0,10.71444,7,420.0,5.0,8.0,manual,7.0
4,mercedes-benz,metris base,7388,gasoline,black,beige,0,11.487618,3,208.0,2.0,4.0,automatic,7.0


In [10]:
target = 'log_price'

# 'Unnamed: 0' shows up in the preview of both CSVs as a 0-based row counter,
# i.e. it looks like the DataFrame index that was written out when the files
# were saved. It says nothing about the car, and the test file carries its own
# unrelated numbering, so it must not be fed to the models as a feature.
index_artifact_cols = ['Unnamed: 0']

# Split features and target variable.
# The target must exist (a missing column should fail loudly); the index
# artifact is dropped only if present, so the cell also works on a CSV that
# was saved without its index.
X = (
    train_data
    .drop(columns=[target])
    .drop(columns=index_artifact_cols, errors='ignore')
)
y = train_data[target]

In [11]:
# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode categorical variables.
# The training frame defines the feature space; drop_first=True removes one
# baseline level per categorical column.
X_train = pd.get_dummies(X_train, drop_first=True)

# Encode the validation rows with *every* level kept and then project them onto
# the training columns. Calling get_dummies(drop_first=True) on the validation
# frame independently would drop whichever level happens to sort first in the
# validation frame; when that differs from the training baseline, those rows
# would silently be encoded as the training baseline instead.
# reindex() discards levels unseen in training and zero-fills levels that are
# absent from the validation frame.
X_val = (
    pd.get_dummies(X_val, drop_first=False)
    .reindex(columns=X_train.columns, fill_value=0)
)

# Feature scaling: statistics are learned on the training split only and then
# reused for the validation split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

X_train_scaled.shape

(150826, 642)

## Baseline Model: Linear Regression

In [6]:
# Baseline: ordinary least squares fitted on the standardized training features
model = LinearRegression().fit(X_train_scaled, y_train)

# 5-fold cross-validation; the scorer reports MSE negated, so take magnitudes
# before averaging and convert the mean MSE to an RMSE on the log scale.
score = cross_val_score(
    model,
    X_train_scaled,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
np.sqrt(np.mean(np.abs(score)))

0.5117302519741233

In [7]:
# Fit a fresh linear baseline on the training split
model = LinearRegression().fit(X_train_scaled, y_train)

# Validation predictions come out on the log scale of the target
y_pred_log = model.predict(X_val_scaled)

# Bring both targets and predictions back to the original scale
# (expm1 is the inverse of log1p)
y_val_original = np.expm1(y_val)
y_pred_original = np.expm1(y_pred_log)

# Root-mean-squared error on the original scale
mse_original = mean_squared_error(y_val_original, y_pred_original)
rmse = np.sqrt(mse_original)
print(f"\n--- RMSE on Validation Set: {rmse:.2f} ---\n")


--- RMSE on Validation Set: 69323.80 ---



## Import Test Set

In [12]:
# Read the processed test CSV into a DataFrame and preview its first rows
test_data = pd.read_csv(filepath_or_buffer='../data/processed/test.csv')
test_data.head(n=5)

Unnamed: 0,id,brand,model,milage,fuel_type,ext_col,int_col,accident,age,horsepower,engine_size,num_cylinders,transmission_type,num_speeds
0,188533,land,other,98000,gasoline,white,beige,0,9,240.0,2.0,4.0,automatic,6.0
1,188534,land,rover defender se,9142,hybrid,silver,black,0,4,395.0,3.0,6.0,automatic,8.0
2,188535,ford,expedition limited,28121,gasoline,white,ebony,0,2,328.0,3.5,6.0,automatic,10.0
3,188536,audi,other,61258,gasoline,other,black,0,8,328.0,3.5,6.0,automatic,7.0
4,188537,audi,a6 2.0t premium plus,59000,gasoline,gray,black,0,6,252.0,2.0,4.0,automatic,7.0


In [14]:
# Build the test feature matrix in exactly the feature space the scaler and
# models were fitted on.

# One-hot encode the test set with *every* category level kept.
# Using drop_first=True here would drop whichever level sorts first in the
# test frame, which is not guaranteed to be the baseline level that was
# dropped from the training frame.
X_test = pd.get_dummies(test_data, drop_first=False)

# Project onto the training columns: columns the training matrix does not have
# (e.g. 'id', levels never seen in training, the training baseline levels) are
# discarded, and training columns missing from the test frame are zero-filled.
# X_train itself is deliberately left untouched.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Feature scaling for the test set, reusing the statistics learned on training
X_test_scaled = scaler.transform(X_test)

X_test_scaled.shape

(125690, 642)

In [20]:

# Score the test set with whatever estimator is currently bound to `model`.
# NOTE(review): when the notebook is run top to bottom, `model` at this point is
# the LinearRegression baseline fitted above, not a cross-validation winner —
# confirm that this is the model intended for the submission.
y_test_pred_log = model.predict(X_test_scaled)

# Predictions are on the log scale; expm1 maps them back to prices
y_test_pred = np.expm1(y_test_pred_log)

# Pair every test id with its predicted price
predictions = test_data[['id']].assign(price=y_test_pred)

# Persist the submission without the DataFrame index
predictions.to_csv('test_set_predictions.csv', index=False)

print("Predictions saved to 'test_set_predictions.csv'")

Predictions saved to 'test_set_predictions.csv'


### Model Evaluation

In [None]:
# Model evaluation setup: candidate regressors keyed by a display name.
# The commented-out entries are disabled candidates kept for reference.
models = {
#    'Linear Regression': LinearRegression(),
#    'Ridge Regression': Ridge(alpha=1.0),
#    'Lasso Regression': Lasso(alpha=0.1),
#    'ElasticNet': ElasticNet(alpha=0.1, l1_ratio=0.5),
#    'Random Forest': RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=50, max_depth=3, random_state=42),
    'AdaBoost': AdaBoostRegressor(n_estimators=50, random_state=42),
    # `use_label_encoder` only ever applied to XGBClassifier and has been removed
    # from current xgboost; passing it to a regressor just triggers an
    # "unused parameter" warning, so it is omitted.
    'XGBoost': XGBRegressor(n_estimators=50, max_depth=3, random_state=42, eval_metric='rmse'),
}


def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`.

    Computed as the square root of `mean_squared_error` rather than through
    its `squared=False` keyword, which was deprecated in scikit-learn 1.4 and
    removed in 1.6.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Scorer returning the *positive* RMSE (lower is better). It is kept
# non-negated on purpose: the evaluation loop compares the raw values and
# picks the smallest one itself.
rmse_scorer = make_scorer(_rmse)

In [None]:
# Evaluate every candidate with 5-fold cross-validation and keep the one with
# the lowest mean RMSE (log scale).
#
# Each candidate is cross-validated once. With an integer `cv`, scikit-learn
# uses an unshuffled KFold for regressors, and every candidate has a fixed
# random_state, so repeating the same call would reproduce the same fold
# scores and only multiply the runtime.
#
# The loop variable is named `candidate` (not `model`) so that the fitted
# estimator bound to `model` in earlier cells is not overwritten.
best_model_name = None
best_mean_rmse = float('inf')
for model_name, candidate in tqdm(models.items(), desc="Evaluating Models"):
    fold_rmse = cross_val_score(
        candidate, X_train_scaled, y_train, cv=5, n_jobs=-1, scoring=rmse_scorer
    )
    mean_rmse = np.mean(fold_rmse)
    print(f"\n--- {model_name} ---\nRMSE (Cross-Validation, Log Scale): {mean_rmse:.2f}")
    # NaN scores (failed fits or failed scoring) never compare as smaller,
    # so such a candidate can never be selected.
    if mean_rmse < best_mean_rmse:
        best_mean_rmse = mean_rmse
        best_model_name = model_name

# Fail here with a clear message instead of a KeyError on models[None] later.
if best_model_name is None:
    raise RuntimeError(
        "No candidate produced a finite cross-validation RMSE; "
        "check the warnings emitted by cross_val_score."
    )

print(f"\n--- Best Model: {best_model_name} with RMSE (Log Scale): {best_mean_rmse:.2f} ---\n")

In [None]:

# Refit the cross-validation winner on the full training split
# (fit() returns the estimator itself, so `best_model` is the entry in `models`)
best_model = models[best_model_name].fit(X_train_scaled, y_train)

# Held-out predictions, still on the log scale of the target
y_pred_log = best_model.predict(X_val_scaled)

# Validation error measured directly on the log scale
mse_log = mean_squared_error(y_val, y_pred_log)
rmse_log = np.sqrt(mse_log)
print(f"\n--- RMSE on Validation Set (Best Model - {best_model_name}, Log Scale): {rmse_log:.2f} ---\n")