In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib


In [2]:

# Read the raw listings table (path is relative to the working directory) and preview it
DATASET_PATH = "dataset.csv"
df = pd.read_csv(DATASET_PATH)
print("Shape of dataset:", df.shape)
df.head()


Shape of dataset: (1002, 17)


Unnamed: 0,name,description,make,model,year,price,engine,cylinders,fuel,mileage,transmission,trim,body,doors,exterior_color,interior_color,drivetrain
0,2024 Jeep Wagoneer Series II,"\n \n Heated Leather Seats, Nav Sy...",Jeep,Wagoneer,2024,74600.0,24V GDI DOHC Twin Turbo,6.0,Gasoline,10.0,8-Speed Automatic,Series II,SUV,4.0,White,Global Black,Four-wheel Drive
1,2024 Jeep Grand Cherokee Laredo,Al West is committed to offering every custome...,Jeep,Grand Cherokee,2024,50170.0,OHV,6.0,Gasoline,1.0,8-Speed Automatic,Laredo,SUV,4.0,Metallic,Global Black,Four-wheel Drive
2,2024 GMC Yukon XL Denali,,GMC,Yukon XL,2024,96410.0,"6.2L V-8 gasoline direct injection, variable v...",8.0,Gasoline,0.0,Automatic,Denali,SUV,4.0,Summit White,Teak/Light Shale,Four-wheel Drive
3,2023 Dodge Durango Pursuit,White Knuckle Clearcoat 2023 Dodge Durango Pur...,Dodge,Durango,2023,46835.0,16V MPFI OHV,8.0,Gasoline,32.0,8-Speed Automatic,Pursuit,SUV,4.0,White Knuckle Clearcoat,Black,All-wheel Drive
4,2024 RAM 3500 Laramie,\n \n 2024 Ram 3500 Laramie Billet...,RAM,3500,2024,81663.0,24V DDI OHV Turbo Diesel,6.0,Diesel,10.0,6-Speed Automatic,Laramie,Pickup Truck,4.0,Silver,Black,Four-wheel Drive


In [3]:

# Drop rows where 'price' is missing: the regression target cannot be imputed
df.dropna(subset=['price'], inplace=True)

# Fill numeric columns with 0
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = df[numeric_cols].fillna(0)

# Fill categorical columns with 'Unknown' and convert them to string.
# Order matters: fillna must run BEFORE astype(str), because astype(str) turns NaN
# into the literal string 'nan', leaving fillna nothing to replace.
# astype(object) first lets fillna insert 'Unknown' even into 'category' columns,
# which would otherwise reject a value that is not one of their categories.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
df[categorical_cols] = (
    df[categorical_cols]
    .astype(object)
    .fillna('Unknown')
    .astype(str)
)

# Feature engineering: vehicle age relative to a fixed reference year.
# The year is pinned (not taken from the clock) so results stay reproducible.
REFERENCE_YEAR = 2025
if 'year' in df.columns:  # guard keeps the cell safe to re-run after 'year' is dropped
    df['age'] = REFERENCE_YEAR - df['year']

# Drop free-text columns and the raw year; errors='ignore' skips columns already removed
df.drop(columns=['name', 'description', 'year'], errors='ignore', inplace=True)


In [4]:

# Basic info: info() prints each column's dtype and non-null count; describe() then
# shows summary statistics, which for this frame cover the numeric columns.
# NOTE(review): the recorded output shows a minimum price of 0.0 and a maximum mileage
# far above the 75th percentile — confirm these are genuine listings, not bad records.
df.info()
df.describe()


<class 'pandas.core.frame.DataFrame'>
Index: 979 entries, 0 to 1001
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   make            979 non-null    object 
 1   model           979 non-null    object 
 2   price           979 non-null    float64
 3   engine          979 non-null    object 
 4   cylinders       979 non-null    float64
 5   fuel            979 non-null    object 
 6   mileage         979 non-null    float64
 7   transmission    979 non-null    object 
 8   trim            979 non-null    object 
 9   body            979 non-null    object 
 10  doors           979 non-null    float64
 11  exterior_color  979 non-null    object 
 12  interior_color  979 non-null    object 
 13  drivetrain      979 non-null    object 
 14  age             979 non-null    int64  
dtypes: float64(4), int64(1), object(10)
memory usage: 122.4+ KB


Unnamed: 0,price,cylinders,mileage,doors,age
count,979.0,979.0,979.0,979.0,979.0
mean,50202.9857,4.458631,65.009193,3.916241,1.083759
std,18700.392062,2.013521,502.032899,0.427973,0.298482
min,0.0,0.0,0.0,0.0,0.0
25%,36600.0,4.0,3.0,4.0,1.0
50%,47165.0,4.0,7.0,4.0,1.0
75%,58919.5,6.0,12.0,4.0,1.0
max,195895.0,8.0,9711.0,5.0,2.0


In [6]:
# Re-runnable cleaning pass: every column-dependent step tolerates columns that
# have already been removed, so this cell can be executed any number of times.

# The target must be present for supervised training
df.dropna(subset=['price'], inplace=True)

# Numeric gaps become 0
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = df[numeric_cols].fillna(value=0)

# Text gaps become the placeholder label 'Unknown'
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].fillna(value='Unknown')

# Derive the vehicle age only while its source column is still available
if 'year' in df.columns:
    df['age'] = 2025 - df['year']

# Remove the optional columns; any that are already absent are skipped
optional_cols = ['name', 'description', 'year']
df = df.drop(columns=optional_cols, errors='ignore')


In [7]:

# Separate the regression target from its predictors
y = df['price']
X = df.drop(columns=['price'])


In [8]:

# Route columns to a transformer by dtype: text columns are one-hot encoded,
# integer/float columns are standardised.
categorical = list(X.select_dtypes(include=['object']).columns)
numerical = list(X.select_dtypes(include=['int64', 'float64']).columns)

# handle_unknown='ignore' lets the encoder accept categories at predict time
# that it did not see while fitting, instead of raising.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical)
numerical_step = ('num', StandardScaler(), numerical)

preprocessor = ColumnTransformer(transformers=[categorical_step, numerical_step])


In [9]:

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [10]:

# Chain preprocessing and the random forest into one estimator, so fit/predict
# take the raw feature frame and the preprocessing is fitted on training rows only.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', forest),
])

model.fit(X_train, y_train)


In [11]:

# Score the held-out rows: RMSE is in the same units as price, R² is unitless
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print("RMSE:", rmse)
print("R² Score:", r2)


RMSE: 8277.854734971419
R² Score: 0.7756220093367592


In [12]:

# Persist the fitted pipeline (preprocessor + regressor together) to disk, so the
# saved file can be given raw feature rows without re-creating the preprocessing.
joblib.dump(model, 'vehicle_price_predictor.pkl')


['vehicle_price_predictor.pkl']

In [13]:

# Smoke test: predict for the first held-out row, kept as a one-row DataFrame
# so the column names are still available to the preprocessor.
sample = X_test.head(1)
predicted_price = model.predict(sample)
print("Predicted Price:", predicted_price[0])


Predicted Price: 29823.55
