In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [4]:
# Read the raw car listings from a path relative to the working directory.
df = pd.read_csv(filepath_or_buffer="../data/Car details v3.csv")
# Show the first rows as a quick check on how the columns were parsed.
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [5]:
# Remove exact duplicate rows first, then discard every row that has a missing value.
df = df.drop_duplicates().dropna()

# Clean numeric columns (remove units)
def extract_numeric(x):
    """Parse the first whitespace-separated token of ``x`` as a float.

    The value is converted with ``str`` first, so numbers and strings such
    as ``"23.4 kmpl"`` or ``"1248 CC"`` are both accepted.

    Parameters
    ----------
    x : object
        Value whose string form starts with the number to extract.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when the string form is blank or
        its first token is not a valid float literal.
    """
    try:
        return float(str(x).split()[0])
    except (IndexError, ValueError):
        # IndexError: blank/whitespace-only value (no tokens to index).
        # ValueError: the first token is not numeric (e.g. a bare unit).
        # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate.
        return np.nan

# Strip the unit suffix from the measurement columns, leaving plain floats.
numeric_unit_cols = ['mileage', 'engine', 'max_power']
for col in numeric_unit_cols:
    df[col] = df[col].apply(extract_numeric)

# extract_numeric returns NaN for values it cannot parse; drop those rows so
# no NaN introduced by the conversion reaches the estimators.
df = df.dropna(subset=numeric_unit_cols)

# Drop columns not used for prediction
df = df.drop(['torque', 'name'], axis=1)

# Encode categorical features as integer labels.
# One encoder is fitted per column and kept in `encoders`, so each column's
# label -> integer mapping stays available (e.g. for inverse_transform or for
# encoding new rows the same way). Refitting a single shared encoder would
# keep only the mapping of the last column.
encoders = {}
for col in ['fuel', 'seller_type', 'transmission', 'owner']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

df.head()


Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,2014,450000,145500,1,1,1,0,23.4,1248.0,74.0,5.0
1,2014,370000,120000,1,1,1,2,21.14,1498.0,103.52,5.0
2,2006,158000,140000,3,1,1,4,17.7,1497.0,78.0,5.0
3,2010,225000,127000,1,1,1,0,23.0,1396.0,90.0,5.0
4,2007,130000,120000,3,1,1,0,16.1,1298.0,88.2,5.0


In [6]:
# Separate the target column from the predictor columns.
y = df['selling_price']
X = df.drop(columns='selling_price')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [10]:
# Linear Regression: fit on the training split, predict on the held-out split
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Random Forest: 100 trees, fixed seed so repeated runs build the same forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# make sure models directory exists
os.makedirs("../models", exist_ok=True)

# save trained model (only the random forest is persisted; `lr` is not saved)
with open("../models/car_price_model.pkl", "wb") as f:
    pickle.dump(rf, f)

print("Model saved successfully!")




Model saved successfully!


In [11]:
# R2 on the held-out split for each fitted model.
for label, predictions in (
    ("Linear Regression R2:", y_pred_lr),
    ("Random Forest R2:", y_pred_rf),
):
    print(label, r2_score(y_test, predictions))

# RMSE is the square root of the mean squared error, in the target's own units.
rf_mse = mean_squared_error(y_test, y_pred_rf)
print("Random Forest RMSE:", np.sqrt(rf_mse))


Linear Regression R2: 0.6603023066481568
Random Forest R2: 0.9188603992370101
Random Forest RMSE: 133433.5994513423


In [13]:
# Write the fitted random forest to disk again; this is the same path the
# training cell already writes, so the existing file is overwritten.
with open("../models/car_price_model.pkl", mode="wb") as model_file:
    pickle.dump(rf, model_file)
