In [102]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import joblib

In [103]:
# Load the car listings from the CSV that sits next to the notebook.
csv_path = 'cars_data_clean.csv'
df = pd.read_csv(csv_path)

In [104]:
# Eyeball five randomly chosen rows of the freshly loaded table.
df.sample(n=5)

Unnamed: 0,usedCarSkuId,loc,myear,body,transmission,fuel,km,ip,images,imgCount,...,Fuel Suppy System,Compression Ratio,Alloy Wheel Size,Ground Clearance Unladen,Max Power Delivered,Max Power At,Max Torque Delivered,Max Torque At,Bore,Stroke
24089,fe644639-af80-4b23-9ae7-6d6442b3a9e6,bangalore city,2017,sedan,manual,petrol,35629.0,0,[{'img': 'https://images10.gaadi.com/usedcar_i...,32,...,Multi-Point Fuel Injection,,15.0,,81.8,6000.0,113.0,4200.0,,
20210,e12ea8f4-006e-49ec-a854-2cc09f3198f2,industrial area phase 1,2015,suv,manual,diesel,97535.0,0,[{'img': 'https://images10.gaadi.com/usedcar_i...,12,...,Direct Injection,,17.0,,140.0,3750.0,320.0,2200.0,,
4253,b33ebac8-ea2c-4271-889b-b60d8856a7d7,andheri east,2020,hatchback,manual,petrol,28080.0,0,[{'img': 'http://dealeradmin.gaadi.com/user/se...,10,...,Multi-Point Fuel Injection,,,,67.0,5500.0,91.0,4250.0,,
21802,bc390f30-516c-4585-a7eb-6176392cb45a,rajkot city,2019,hatchback,manual,petrol,38692.0,0,[{'img': 'https://images10.gaadi.com/usedcar_i...,15,...,Multi-Point Fuel Injection,,,,67.05,5500.0,90.0,3500.0,69.0,
16625,5cfe7851-aeee-4060-8fa8-e44a7309e8aa,preet vihar,2019,suv,manual,petrol,51000.0,0,[{'img': 'https://images10.gaadi.com/usedcar_i...,13,...,Multi-Point Fuel Injection,,,209.0,108.5,5000.0,170.0,2875.0,77.0,85.8


In [105]:
# Missing-value count for every column of the full table.
df.isna().sum()

usedCarSkuId                0
loc                      5850
myear                       0
body                       19
transmission                0
                        ...  
Max Power At             2247
Max Torque Delivered      231
Max Torque At            1976
Bore                    25177
Stroke                  37300
Length: 66, dtype: int64

In [106]:
# Keep only the columns used for modelling (features plus the `listed_price`
# target). `.copy()` makes this an independent frame: the following cells
# fill, add and drop columns on `df`, and doing that on a bare selection of
# the original table triggers pandas' SettingWithCopyWarning.
model_columns = [
    'transmission', 'myear', 'fuel', 'km',
    'listed_price', 'owner_type', 'Seats', 'Tyre Type',
]
df = df[model_columns].copy()

In [107]:
# First five rows of the reduced table.
df.head(5)

Unnamed: 0,transmission,myear,fuel,km,listed_price,owner_type,Seats,Tyre Type
0,manual,2016,cng,69162.0,370000.0,first,5.0,tubeless
1,manual,2015,cng,45864.0,365000.0,first,5.0,tubeless radial
2,manual,2015,cng,81506.0,421000.0,second,5.0,tubeless radial
3,manual,2013,cng,115893.0,240000.0,second,5.0,tubeless radial
4,manual,2022,cng,18900.0,1175000.0,first,7.0,tubeless radial


In [108]:
# Which of the selected columns still contain missing values?
df.isna().sum()

transmission      0
myear             0
fuel              0
km                0
listed_price      0
owner_type        0
Seats            18
Tyre Type       257
dtype: int64

In [109]:
# Impute missing seat counts with the column median.
seats_median = df['Seats'].median()
df['Seats'] = df['Seats'].fillna(seats_median)

In [110]:
# Impute missing tyre types with the most frequent value (first mode).
most_common_tyre = df['Tyre Type'].mode().iloc[0]
df['Tyre Type'] = df['Tyre Type'].fillna(most_common_tyre)

In [111]:
# Confirm that imputation left no missing values behind.
df.isna().sum()

transmission    0
myear           0
fuel            0
km              0
listed_price    0
owner_type      0
Seats           0
Tyre Type       0
dtype: int64

In [112]:
# Spot-check five random rows after imputation.
df.sample(n=5)

Unnamed: 0,transmission,myear,fuel,km,listed_price,owner_type,Seats,Tyre Type
34083,manual,2011,petrol,55423.0,197900.0,first,5.0,tubeless radial
4109,automatic,2018,petrol,46479.0,437000.0,second,5.0,tubeless radial
32643,manual,2017,petrol,56390.0,500000.0,first,5.0,tubeless radial
35653,manual,2015,diesel,61716.0,550000.0,first,5.0,tubeless radial
20700,manual,2019,diesel,90000.0,635000.0,first,5.0,tubeless


In [113]:
# Replace the manufacturing year with the car's age in years.
# The reference year is a fixed, named constant (not the wall-clock year) so
# that re-running the notebook later produces the same feature values.
REFERENCE_YEAR = 2025
df['Car Age'] = REFERENCE_YEAR - df['myear']

# Rebind instead of dropping in place, so `df` is always a fresh frame here.
df = df.drop(columns=['myear'])

In [114]:
# Spot-check five random rows now that `Car Age` has replaced `myear`.
df.sample(n=5)

Unnamed: 0,transmission,fuel,km,listed_price,owner_type,Seats,Tyre Type,Car Age
14049,manual,diesel,101015.0,435000.0,first,5.0,tubeless radial,10
28597,manual,petrol,27053.0,270000.0,second,5.0,tubeless radial,15
37775,automatic,petrol,27507.0,850000.0,first,5.0,tubeless radial,6
15498,automatic,diesel,83856.0,896710.0,second,5.0,tubeless,8
11763,automatic,petrol,15000.0,1650000.0,first,5.0,tubeless radial,3


In [115]:
from sklearn.preprocessing import LabelEncoder

In [116]:
# Integer-encode the categorical columns.
# Each column gets its OWN fitted encoder, stored in `encoders`, so the
# label -> integer mapping (`classes_`) of every column stays available for
# `inverse_transform` and for encoding new inputs at prediction time.
# Re-fitting one shared encoder keeps only the mapping of the last column.
categorical_cols = ['transmission', 'fuel', 'owner_type', 'Tyre Type']
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# Backward-compatible alias: `le` ends up as the 'Tyre Type' encoder, exactly
# as it did when a single encoder was re-fitted column by column.
le = encoders['Tyre Type']

In [121]:
# Spot-check five random rows after the categorical columns were encoded.
df.sample(n=5)

Unnamed: 0,transmission,fuel,km,listed_price,owner_type,Seats,Tyre Type,Car Age
21378,1,1,39653.0,500000.0,1,7.0,3,10
29053,0,4,69000.0,585000.0,1,5.0,3,10
36230,0,4,65169.0,360000.0,1,5.0,2,8
11260,1,4,50000.0,190000.0,1,5.0,3,13
19759,1,1,83111.0,620000.0,1,5.0,3,9


In [122]:
# Target is the listed price; every remaining column is a feature.
y = df['listed_price']
X = df.drop(columns=['listed_price'])

In [123]:
# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [124]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees, seeded so that the fit is repeatable.
model = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
)
model.fit(X_train, y_train)

In [125]:
from sklearn.metrics import r2_score, mean_absolute_error

In [126]:
# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# NOTE(review): the recorded run printed a negative R2 (about -1.65), i.e.
# worse than always predicting the mean price. Worth investigating (e.g.
# extreme values in `listed_price`) before relying on this model.
print("R2 Score:", r2)
print("Mean Absolute Error:", mae)

R2 Score: -1.6470997431276868
Mean Absolute Error: 341458.2689710248


In [127]:
# Predict for the first held-out row, kept as a one-row frame so that
# `predict` receives 2-D input.
sample = X_test.head(1)
predicted = model.predict(sample)
print("Sample Prediction:", predicted)

Sample Prediction: [746927.1]


In [130]:
import pickle

# Persist the fitted model. The `with` block guarantees the file handle is
# flushed and closed even if pickling raises; a bare `open(...)` passed
# straight to `pickle.dump` is never explicitly closed.
with open("car_price_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)