In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib


In [None]:

# Read the CSV into a DataFrame, report its (rows, columns) shape, and
# preview the first rows as the cell's displayed value.
df = pd.read_csv(filepath_or_buffer="dataset.csv")
print("Shape of dataset:", df.shape)
df.head(n=5)


In [None]:

# Print column dtypes and non-null counts, then show the summary statistics
# table as the cell's displayed value.
df.info()
summary_stats = df.describe()
summary_stats


In [None]:

# Count missing entries per column (isna is the canonical name of isnull).
missing_per_column = df.isna().sum()
missing_per_column


In [None]:

# Reference year used to convert `year` into an age in years.
REFERENCE_YEAR = 2025

# Rows without a target value cannot be used for supervised training.
df = df.dropna(subset=['price'])

# Make sure `year` is numeric so the age arithmetic below never meets a
# string; unparseable entries become NaN and are imputed with the other gaps.
df = df.assign(year=pd.to_numeric(df['year'], errors='coerce'))

# Impute per dtype. Filling every column with the string 'Unknown' would turn
# numeric columns that contain gaps into mixed str/float object columns, which
# breaks `REFERENCE_YEAR - year` and any later dtype-based column selection.
# NOTE: medians are taken over the whole frame, i.e. before any train/test
# split, so a small amount of test-set information reaches the imputed values.
numeric_cols = df.select_dtypes(include='number').columns
other_cols = df.columns.difference(numeric_cols)
fill_values = {col: 'Unknown' for col in other_cols}
# Columns that are entirely NaN have a NaN median; leave those untouched.
fill_values.update(df[numeric_cols].median().dropna().to_dict())
df = df.fillna(value=fill_values)

# Feature engineering: replace the raw year with the vehicle's age and drop
# the free-text columns that are not used as features.
df = (
    df
    .assign(age=REFERENCE_YEAR - df['year'])
    .drop(columns=['name', 'description', 'year'])
)


In [None]:

# Separate the target column from the feature matrix.
y = df['price']
X = df.drop(columns=['price'])


In [None]:

# Partition the feature columns by dtype. 'number' matches every numeric
# width (not only int64/float64), and bool/category columns are routed to the
# encoder; ColumnTransformer's default remainder='drop' would otherwise
# discard any column that appears in neither list without warning.
categorical = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
numerical = X.select_dtypes(include='number').columns.tolist()

# One-hot encode categorical columns (categories unseen during fit are encoded
# as all zeros at predict time) and standardize the numeric columns.
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ('num', StandardScaler(), numerical)
])


In [None]:

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
split = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split


In [None]:

# Chain preprocessing and a 100-tree random forest into one estimator so the
# same transformations are applied at fit and predict time.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', forest)])

model.fit(X_train, y_train)


In [None]:

# Score the fitted pipeline on the held-out split: root-mean-squared error
# and the coefficient of determination.
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print("RMSE:", rmse)
print("R² Score:", r2)


In [None]:

# Persist the whole fitted pipeline (preprocessing + regressor) to disk.
joblib.dump(value=model, filename='vehicle_price_predictor.pkl')


In [None]:

# Smoke test: predict on the first held-out row, kept as a one-row DataFrame
# so the pipeline receives the column layout it was fitted on.
sample = X_test.head(1)
predicted_price = model.predict(sample)
first_prediction = predicted_price[0]
print("Predicted Price:", first_prediction)
