# **🚗 Car Price Prediction Using NLP**

# 1: Install necessary packages

In [1]:
!pip install pandas scikit-learn xgboost



# 2: Import libraries

In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# 3: Load dataset

In [3]:
# Read the car listings from the CSV file (path is relative to the notebook's working directory).
df = pd.read_csv('car data.csv')

In [4]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Driven_kms,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.642584,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


In [7]:
# Count missing values in each column.
df.isnull().sum()

Unnamed: 0,0
Car_Name,0
Year,0
Selling_Price,0
Present_Price,0
Driven_kms,0
Fuel_Type,0
Selling_type,0
Transmission,0
Owner,0


# 4: Select features and target

In [8]:
# Predictors: every listing attribute except the price being predicted.
X = df.loc[:, [
    'Car_Name', 'Year', 'Present_Price', 'Driven_kms',
    'Fuel_Type', 'Selling_type', 'Transmission', 'Owner',
]]
# Target: the selling price column.
y = df.loc[:, 'Selling_Price']

# 5: Split dataset

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 6: Preprocessing steps

In [10]:
# Column groups, one per kind of preprocessing.
# The text column is a bare string rather than a one-element list: a scalar
# column selector makes ColumnTransformer hand the transformer a 1-D column,
# which is the input shape a text vectorizer expects.
text_features = 'Car_Name'
categorical_features = [
    'Fuel_Type',
    'Selling_type',
    'Transmission',
]
numeric_features = [
    'Year',
    'Present_Price',
    'Driven_kms',
    'Owner',
]

In [13]:
# Build a single feature matrix from three kinds of columns:
#   text - TF-IDF over the car name, vocabulary capped at 50 terms
#   cat  - one-hot encoding; categories not seen during fit are ignored
#          instead of raising at predict time
#   num  - standard scaling of the numeric columns
# The text selector reuses `text_features` rather than repeating the column
# name. It must stay a bare string (not a list) so the vectorizer receives a
# 1-D column of documents.
preprocessor = ColumnTransformer(transformers=[
    ('text', TfidfVectorizer(max_features=50), text_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numeric_features)
])

# 7: Create pipeline

In [14]:
# Chain preprocessing and the gradient-boosted regressor so that both are
# fitted and applied together as one estimator.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(random_state=42, n_estimators=100)),
])

# 8: Train the model

In [15]:
# Fit the preprocessing steps and the regressor on the training split only.
model.fit(X_train, y_train)

# 9: Evaluate the model

In [16]:
# Score the model on the held-out split.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print("R2 Score:", r2)
print("Mean Absolute Error:", mae)

R2 Score: 0.9668271316216492
Mean Absolute Error: 0.5513578985460469


# 10: Predict on a new car

In [17]:
# A single hypothetical listing, using the same columns as the training features.
sample_input = pd.DataFrame([{
    'Car_Name': 'Maruti Swift Dzire',
    'Year': 2019,
    'Present_Price': 7.5,
    'Driven_kms': 25000,
    'Fuel_Type': 'Petrol',
    'Selling_type': 'Dealer',
    'Transmission': 'Manual',
    'Owner': 0
}])

# predict() returns one value per input row; take the only one.
predicted_price = model.predict(sample_input)[0]
# Format with a `.2f` spec instead of round(): the prediction appears to be a
# NumPy float32, and round() on it stays float32, so the f-string printed the
# full expansion (5.960000038146973) rather than two decimals.
print(f"Predicted Selling Price: ₹ {predicted_price:.2f} Lakh")

Predicted Selling Price: ₹ 5.960000038146973 Lakh
