In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Dataset Source
The dataset used in this project was taken from Kaggle:  
[Car Price Prediction Dataset](https://www.kaggle.com/datasets/hellbuoy/car-price-prediction)


In [4]:
df = pd.read_csv("CarPrice_Assignment.csv")

In [5]:
df.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [6]:
df.shape

(205, 26)

In [7]:
df.columns

Index(['car_ID', 'symboling', 'CarName', 'fueltype', 'aspiration',
       'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'wheelbase',
       'carlength', 'carwidth', 'carheight', 'curbweight', 'enginetype',
       'cylindernumber', 'enginesize', 'fuelsystem', 'boreratio', 'stroke',
       'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg',
       'price'],
      dtype='object')

In [8]:
df["symboling"].unique()

array([ 3,  1,  2,  0, -1, -2])

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

In [10]:
df[["aspiration","doornumber","carbody","cylindernumber"]]

Unnamed: 0,aspiration,doornumber,carbody,cylindernumber
0,std,two,convertible,four
1,std,two,convertible,four
2,std,two,hatchback,six
3,std,four,sedan,four
4,std,four,sedan,five
...,...,...,...,...
200,std,four,sedan,four
201,turbo,four,sedan,four
202,std,four,sedan,six
203,turbo,four,sedan,six


In [11]:
df["doornumber"] = df["doornumber"].replace("two",2)
df["doornumber"] = df["doornumber"].replace("four",4)

  df["doornumber"] = df["doornumber"].replace("four",4)


In [12]:
df["cylindernumber"].unique()

array(['four', 'six', 'five', 'three', 'twelve', 'two', 'eight'],
      dtype=object)

In [13]:
dict = {"four":4,"six":6,"five":5,"three":3,"twelve":12,"two":2,"eight":8}
df["cylindernumber"] = df["cylindernumber"].map(dict)

In [14]:
df["cylindernumber"]

0      4
1      4
2      6
3      4
4      5
      ..
200    4
201    4
202    6
203    6
204    4
Name: cylindernumber, Length: 205, dtype: int64

In [15]:
# we will remove some columns that their correlation is greater than 0.90

In [16]:
removed_columns = set()
corr_df = df.corr(numeric_only=True)
for i in range(len(corr_df.columns)):
    for j in range(i):
        if abs(corr_df.iloc[i,j])>= 0.90:
            colname = corr_df.columns[i]
            removed_columns.add(colname)
            
        

In [17]:
removed_columns

{'highwaympg'}

In [18]:
df.drop("highwaympg",axis = 1,inplace=True)

In [19]:
df["fueltype"].unique() #one hot encoding

array(['gas', 'diesel'], dtype=object)

In [20]:
df["aspiration"].unique()  # one hot encoding

array(['std', 'turbo'], dtype=object)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    int64  
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    int64  
 16  enginesize        205 non-null    int64  
 1

In [22]:
df["carbody"].unique()   # Label encoding

array(['convertible', 'hatchback', 'sedan', 'wagon', 'hardtop'],
      dtype=object)

In [23]:
df["drivewheel"].unique() # one hot encoding

array(['rwd', 'fwd', '4wd'], dtype=object)

In [24]:
df["enginelocation"].unique() # one hot encoding

array(['front', 'rear'], dtype=object)

In [25]:
df["enginetype"].unique() # one hot encoding unique deger 1 ile 10 arasında ise  kullanılır

array(['dohc', 'ohcv', 'ohc', 'l', 'rotor', 'ohcf', 'dohcv'], dtype=object)

In [26]:
df["fuelsystem"].unique() #  one hot encoding

array(['mpfi', '2bbl', 'mfi', '1bbl', 'spfi', '4bbl', 'idi', 'spdi'],
      dtype=object)

In [27]:
df = pd.get_dummies(df,columns=["fueltype","aspiration","drivewheel","enginelocation","carbody","enginetype","fuelsystem"],drop_first=True,dtype="int")

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 40 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   car_ID               205 non-null    int64  
 1   symboling            205 non-null    int64  
 2   CarName              205 non-null    object 
 3   doornumber           205 non-null    int64  
 4   wheelbase            205 non-null    float64
 5   carlength            205 non-null    float64
 6   carwidth             205 non-null    float64
 7   carheight            205 non-null    float64
 8   curbweight           205 non-null    int64  
 9   cylindernumber       205 non-null    int64  
 10  enginesize           205 non-null    int64  
 11  boreratio            205 non-null    float64
 12  stroke               205 non-null    float64
 13  compressionratio     205 non-null    float64
 14  horsepower           205 non-null    int64  
 15  peakrpm              205 non-null    int

In [29]:
df.head()

Unnamed: 0,car_ID,symboling,CarName,doornumber,wheelbase,carlength,carwidth,carheight,curbweight,cylindernumber,...,enginetype_ohcf,enginetype_ohcv,enginetype_rotor,fuelsystem_2bbl,fuelsystem_4bbl,fuelsystem_idi,fuelsystem_mfi,fuelsystem_mpfi,fuelsystem_spdi,fuelsystem_spfi
0,1,3,alfa-romero giulia,2,88.6,168.8,64.1,48.8,2548,4,...,0,0,0,0,0,0,0,1,0,0
1,2,3,alfa-romero stelvio,2,88.6,168.8,64.1,48.8,2548,4,...,0,0,0,0,0,0,0,1,0,0
2,3,1,alfa-romero Quadrifoglio,2,94.5,171.2,65.5,52.4,2823,6,...,0,1,0,0,0,0,0,1,0,0
3,4,2,audi 100 ls,4,99.8,176.6,66.2,54.3,2337,4,...,0,0,0,0,0,0,0,1,0,0
4,5,2,audi 100ls,4,99.4,176.6,66.4,54.3,2824,5,...,0,0,0,0,0,0,0,1,0,0


In [30]:
df["CarName"].unique()

array(['alfa-romero giulia', 'alfa-romero stelvio',
       'alfa-romero Quadrifoglio', 'audi 100 ls', 'audi 100ls',
       'audi fox', 'audi 5000', 'audi 4000', 'audi 5000s (diesel)',
       'bmw 320i', 'bmw x1', 'bmw x3', 'bmw z4', 'bmw x4', 'bmw x5',
       'chevrolet impala', 'chevrolet monte carlo', 'chevrolet vega 2300',
       'dodge rampage', 'dodge challenger se', 'dodge d200',
       'dodge monaco (sw)', 'dodge colt hardtop', 'dodge colt (sw)',
       'dodge coronet custom', 'dodge dart custom',
       'dodge coronet custom (sw)', 'honda civic', 'honda civic cvcc',
       'honda accord cvcc', 'honda accord lx', 'honda civic 1500 gl',
       'honda accord', 'honda civic 1300', 'honda prelude',
       'honda civic (auto)', 'isuzu MU-X', 'isuzu D-Max ',
       'isuzu D-Max V-Cross', 'jaguar xj', 'jaguar xf', 'jaguar xk',
       'maxda rx3', 'maxda glc deluxe', 'mazda rx2 coupe', 'mazda rx-4',
       'mazda glc deluxe', 'mazda 626', 'mazda glc', 'mazda rx-7 gs',
       'mazda glc 

In [31]:
!pip install category_encoders



In [32]:
from category_encoders import BinaryEncoder
b_encoder=  BinaryEncoder(cols = ["CarName"])
df = b_encoder.fit_transform(df)

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 47 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   car_ID               205 non-null    int64  
 1   symboling            205 non-null    int64  
 2   CarName_0            205 non-null    int64  
 3   CarName_1            205 non-null    int64  
 4   CarName_2            205 non-null    int64  
 5   CarName_3            205 non-null    int64  
 6   CarName_4            205 non-null    int64  
 7   CarName_5            205 non-null    int64  
 8   CarName_6            205 non-null    int64  
 9   CarName_7            205 non-null    int64  
 10  doornumber           205 non-null    int64  
 11  wheelbase            205 non-null    float64
 12  carlength            205 non-null    float64
 13  carwidth             205 non-null    float64
 14  carheight            205 non-null    float64
 15  curbweight           205 non-null    int

In [34]:
# regression

In [35]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.pipeline import Pipeline

In [36]:
X = df.drop("price",axis = 1) #all columns except price
y = df["price"] # target column

In [37]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=41)

In [38]:
scaler = StandardScaler()

In [39]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [40]:
regres = LinearRegression()

In [41]:
regres.fit(X_train,y_train)

In [42]:
y_predict = regres.predict(X_test)

In [43]:
lineer_score = r2_score(y_test,y_predict)

In [44]:
lineer_score # score dusuk geldi

0.5017410752879248

In [45]:
from sklearn.linear_model import RidgeCV,LassoCV,ElasticNetCV

In [46]:
ridgecv = RidgeCV(cv = 5)

In [47]:
ridgecv.fit(X_train,y_train)
y_pred = ridgecv.predict(X_test)

In [48]:
ridge_score = r2_score(y_test,y_pred)

In [49]:
ridge_score

0.6840952753661476

In [50]:
ridgecv.alpha_

np.float64(10.0)

In [51]:
ridgecv.alphas

(0.1, 1.0, 10.0)

In [52]:
lassocv= LassoCV(cv = 5)
lassocv.fit(X_train,y_train)
y_pred = lassocv.predict(X_test)
lasso_score = r2_score(y_test,y_pred)

In [53]:
lasso_score

0.7476478787589202

In [54]:
lassocv.alphas_

array([7585.60470942, 7074.36034014, 6597.57213028, 6152.91785   ,
       5738.23178001, 5351.49416322, 4990.82136745, 4654.45671098,
       4340.76190658, 4048.2090821 , 3775.37333885, 3520.92581155,
       3283.62719598, 3062.32171288, 2855.93147867, 2663.45125548,
       2483.94355512, 2316.53407297, 2160.40742962, 2014.8031995 ,
       1879.01220716, 1752.37307323, 1634.26899309, 1524.12473266,
       1421.40382675, 1325.6059661 , 1236.26456063, 1152.94446688,
       1075.23986858, 1002.77230014,  935.18880327,  872.1602079 ,
        813.3795289 ,  758.56047094,  707.43603401,  659.75721303,
        615.291785  ,  573.823178  ,  535.14941632,  499.08213674,
        465.4456711 ,  434.07619066,  404.82090821,  377.53733389,
        352.09258115,  328.3627196 ,  306.23217129,  285.59314787,
        266.34512555,  248.39435551,  231.6534073 ,  216.04074296,
        201.48031995,  187.90122072,  175.23730732,  163.42689931,
        152.41247327,  142.14038267,  132.56059661,  123.62645

In [55]:
lassocv.alpha_

np.float64(100.27723001386587)

In [56]:
elasticcv = ElasticNetCV(cv = 5)
elasticcv.fit(X_train,y_train)
y_pred = elasticcv.predict(X_test)
elastic_score = r2_score(y_test,y_pred)

In [57]:
elastic_score

0.6275381274127838

In [58]:
print("lineer regression r^2 score:",lineer_score)
print("ridge regression r^2 score:",ridge_score)
print("lasso regression r^2 score:",lasso_score)
print("elastic net regression r^2 score:",elastic_score)


lineer regression r^2 score: 0.5017410752879248
ridge regression r^2 score: 0.6840952753661476
lasso regression r^2 score: 0.7476478787589202
elastic net regression r^2 score: 0.6275381274127838


In [59]:
# polynomial regression,pipeline
from sklearn.preprocessing import PolynomialFeatures

In [60]:
def poly_regression(my_degree):
    pipeline = Pipeline([("scaler",StandardScaler()),("polyn",PolynomialFeatures(my_degree)),("regression",LinearRegression())])
    pipeline.fit(X_train,y_train)
    score =  pipeline.score(X_test,y_test)
    print(f"degree {my_degree} r^2 score: {score}")

In [61]:
for i in range(2,4):
    poly_regression(i)

degree 2 r^2 score: 0.24913215760666074
degree 3 r^2 score: -1.1235139965526328


In [62]:
# Among the tested models, Lasso Regression achieved the highest R² score (~0.74). Therefore, Lasso was selected as the final model.