Linear Regression

In [2]:
import pandas as pd 
from sklearn.preprocessing import OneHotEncoder

data = pd.read_csv("Car_Price_Prediction.csv")
data.head()

Unnamed: 0,Make,Model,Year,Engine Size,Mileage,Fuel Type,Transmission,Price
0,Honda,Model B,2015,3.9,74176,Petrol,Manual,30246.207931
1,Ford,Model C,2014,1.7,94799,Electric,Automatic,22785.747684
2,BMW,Model B,2006,4.1,98385,Electric,Manual,25760.290347
3,Honda,Model B,2015,2.6,88919,Electric,Automatic,25638.003491
4,Honda,Model C,2004,3.4,138482,Petrol,Automatic,21021.386657


In [3]:
onehot = OneHotEncoder(sparse_output=False)
columns = ["Make", "Model", "Fuel Type", "Transmission"]
for i in columns:
    encoded = onehot.fit_transform(data[[i]])
    features = onehot.get_feature_names_out([i])
    encoded = pd.DataFrame(encoded, columns=features, index=data.index)  # index=data.index is needed so as to preserve the index to align it with our data's index
    data.drop(columns=[i], inplace=True)
    data = pd.concat([data, encoded], axis = 1)

data.head()

Unnamed: 0,Year,Engine Size,Mileage,Price,Make_Audi,Make_BMW,Make_Ford,Make_Honda,Make_Toyota,Model_Model A,Model_Model B,Model_Model C,Model_Model D,Model_Model E,Fuel Type_Diesel,Fuel Type_Electric,Fuel Type_Petrol,Transmission_Automatic,Transmission_Manual
0,2015,3.9,74176,30246.207931,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,2014,1.7,94799,22785.747684,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,2006,4.1,98385,25760.290347,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,2015,2.6,88919,25638.003491,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,2004,3.4,138482,21021.386657,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [4]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
columns = ["Year", "Engine Size", "Mileage"]
data[columns] = scaler.fit_transform(data[columns])
    

data.head()

Unnamed: 0,Year,Engine Size,Mileage,Price,Make_Audi,Make_BMW,Make_Ford,Make_Honda,Make_Toyota,Model_Model A,Model_Model B,Model_Model C,Model_Model D,Model_Model E,Fuel Type_Diesel,Fuel Type_Electric,Fuel Type_Petrol,Transmission_Automatic,Transmission_Manual
0,0.686031,1.076274,-0.387368,30246.207931,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.526933,-1.072952,-0.040282,22785.747684,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,-0.745852,1.271658,0.02007,25760.290347,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.686031,-0.193723,-0.139243,25638.003491,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,-1.064048,0.587813,0.694904,21021.386657,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [5]:
from sklearn.model_selection import train_test_split

x = data.drop(columns=["Price"])
y = data["Price"]

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.2)


In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from numpy import sqrt

model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(mean_squared_error(y_test,y_pred))
print(sqrt(mean_squared_error(y_test,y_pred)))
print(r2_score(y_test,y_pred))

5042089.098729371
2245.4596631267664
0.8157581353511812


Decision Tree

In [9]:
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor(max_depth=10, random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(mean_squared_error(y_test, y_pred))
print(sqrt(mean_squared_error(y_test, y_pred)))
print(r2_score(y_test, y_pred))

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

gs = GridSearchCV(DecisionTreeRegressor(), {"max_depth": range(5,15)}, cv=5)
gs.fit(x,y)

print(gs.best_estimator_)

model = DecisionTreeRegressor(max_depth=6, random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(mean_squared_error(y_test, y_pred))
print(sqrt(mean_squared_error(y_test, y_pred)))
print(r2_score(y_test, y_pred))

10204583.539641134
3194.4613849037423
0.6271165656747473
DecisionTreeRegressor(max_depth=6)
6899718.077918661
2626.7314438135204
0.74787892491879
