In [1]:
# importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,LassoCV, RidgeCV,ElasticNet
from sklearn.metrics import r2_score

In [2]:
df = pd.read_csv('ToyotaCorolla.csv') # loading the dataset into 'data' variable

In [3]:
df.info()
# as we can see in the output, all the columns are non-null and the data types of all the columns are correct, 
# so lets do the EDA for it.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 39 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Id                 1436 non-null   int64 
 1   Model              1436 non-null   object
 2   Price              1436 non-null   int64 
 3   Age_08_04          1436 non-null   int64 
 4   Mfg_Month          1436 non-null   int64 
 5   Mfg_Year           1436 non-null   int64 
 6   KM                 1436 non-null   int64 
 7   Fuel_Type          1436 non-null   object
 8   HP                 1436 non-null   int64 
 9   Met_Color          1436 non-null   int64 
 10  Color              1436 non-null   object
 11  Automatic          1436 non-null   int64 
 12  CC                 1436 non-null   int64 
 13  Doors              1436 non-null   int64 
 14  Cylinders          1436 non-null   int64 
 15  Gears              1436 non-null   int64 
 16  Quarterly_Tax      1436 non-null   int64 


In [4]:
for col in df.columns:
    print(df[col].value_counts())
    print('-------------------------------------')

Id
1       1
2       1
3       1
4       1
5       1
       ..
1438    1
1439    1
1440    1
1441    1
1442    1
Name: count, Length: 1436, dtype: int64
-------------------------------------
Model
TOYOTA Corolla 1.6 16V HATCHB LINEA TERRA 2/3-Doors    109
TOYOTA Corolla 1.3 16V HATCHB LINEA TERRA 2/3-Doors     84
TOYOTA Corolla 1.6 16V LIFTB LINEA LUNA 4/5-Doors       80
TOYOTA Corolla 1.6 16V LIFTB LINEA TERRA 4/5-Doors      71
TOYOTA Corolla 1.4 16V VVT I HATCHB TERRA 2/3-Doors     54
                                                      ... 
TOYOTA Corolla 1.8 VVTL-i T-Sport 3-Drs 2/3-Doors        1
TOYOTA Corolla 2.0 D4D 90 5DR TERRA 4/5-Doors            1
TOYOTA Corolla 1800 T SPORT VVT I 2/3-Doors              1
TOYOTA Corolla 1.8 16V VVTLI 5DR T SPORT 4/5-Doors       1
TOYOTA Corolla 1.3 Linea Terra 4/5-Doors                 1
Name: count, Length: 319, dtype: int64
-------------------------------------
Price
8950     109
9950      84
7950      63
10950     62
11950     47
      

In [5]:
# lets see if model is column to be removed 
df.Model.value_counts(sort=True)
# As we can see there are 319 different models of cars,
#  it will affect the prediction because most of the people will see the model before buying. 
# At the moement, we will drop the model column and see if any algorithms are there to include the column.

Model
TOYOTA Corolla 1.6 16V HATCHB LINEA TERRA 2/3-Doors    109
TOYOTA Corolla 1.3 16V HATCHB LINEA TERRA 2/3-Doors     84
TOYOTA Corolla 1.6 16V LIFTB LINEA LUNA 4/5-Doors       80
TOYOTA Corolla 1.6 16V LIFTB LINEA TERRA 4/5-Doors      71
TOYOTA Corolla 1.4 16V VVT I HATCHB TERRA 2/3-Doors     54
                                                      ... 
TOYOTA Corolla 1.8 VVTL-i T-Sport 3-Drs 2/3-Doors        1
TOYOTA Corolla 2.0 D4D 90 5DR TERRA 4/5-Doors            1
TOYOTA Corolla 1800 T SPORT VVT I 2/3-Doors              1
TOYOTA Corolla 1.8 16V VVTLI 5DR T SPORT 4/5-Doors       1
TOYOTA Corolla 1.3 Linea Terra 4/5-Doors                 1
Name: count, Length: 319, dtype: int64

In [6]:
# As we can see that the cylinders column has only one value, we will remove this column and ID column which is irrelevent to our target variable
df.drop(['Id','Cylinders','Model'],axis=1, inplace=True)

In [7]:
# drop duplicates
df.drop_duplicates(inplace=True)

In [8]:
# converting categorical cols Feuel type and color into numerical with one hot encoding
encoded_df = pd.get_dummies(df, columns=['Fuel_Type','Color'], drop_first= True)

In [9]:
encoded_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1435 entries, 0 to 1435
Data columns (total 45 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Price              1435 non-null   int64
 1   Age_08_04          1435 non-null   int64
 2   Mfg_Month          1435 non-null   int64
 3   Mfg_Year           1435 non-null   int64
 4   KM                 1435 non-null   int64
 5   HP                 1435 non-null   int64
 6   Met_Color          1435 non-null   int64
 7   Automatic          1435 non-null   int64
 8   CC                 1435 non-null   int64
 9   Doors              1435 non-null   int64
 10  Gears              1435 non-null   int64
 11  Quarterly_Tax      1435 non-null   int64
 12  Weight             1435 non-null   int64
 13  Mfr_Guarantee      1435 non-null   int64
 14  BOVAG_Guarantee    1435 non-null   int64
 15  Guarantee_Period   1435 non-null   int64
 16  ABS                1435 non-null   int64
 17  Airbag_1           

In [10]:
correlation = encoded_df.corrwith(encoded_df['Price']).sort_values()

In [11]:
correlation

Age_08_04           -0.876273
KM                  -0.569420
Tow_Bar             -0.171618
Color_Green         -0.104368
Color_White         -0.103511
Color_Red           -0.103026
Radio_cassette      -0.042281
Radio               -0.040979
Fuel_Type_Petrol    -0.031356
Color_Violet        -0.016794
Mfg_Month           -0.014518
Color_Blue           0.015872
Color_Yellow         0.022974
Color_Silver         0.029554
Automatic            0.033928
Color_Black          0.036163
BOVAG_Guarantee      0.036439
Parking_Assistant    0.044760
Fuel_Type_Diesel     0.046612
Gears                0.063831
Power_Steering       0.064208
Airbag_1             0.093618
Met_Color            0.107604
Backseat_Divider     0.108280
Metallic_Rim         0.110555
CC                   0.124375
Guarantee_Period     0.148167
Color_Grey           0.165747
Sport_Model          0.166832
Doors                0.183604
Mfr_Guarantee        0.201207
Quarterly_Tax        0.211508
Mistlamps            0.224925
Airbag_2  

In [12]:
#lets use lasso regression for feature reduction beacuse we have too many features present in our dataset
# splitting the data into train and test before training the model
X = encoded_df.drop('Price',axis=1) # defining features
y = encoded_df.Price                # defining target
# splitting the data 
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8,random_state=42) 
# checking shape of each 
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)
print('-------------------------------------------------------')
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

(1148, 44) (287, 44) (1148,) (287,)
-------------------------------------------------------


In [13]:
# we will first use linear regression model 
lr = LinearRegression()
lr.fit(X_train,y_train)
y_pred_lr = lr.predict(X_test)
print(f'linear_regression_r2_score: {r2_score(y_test,y_pred_lr)}')

linear_regression_r2_score: 0.8834214605626209


In [14]:
lassocv = LassoCV(cv=5, random_state=42) # model Lasso with Cross Validation of fold 5
lassocv.fit(X_train,y_train)
y_pred_lasso = lassocv.predict(X_test)
print(f'lasso_r2_score: {r2_score(y_test,y_pred_lasso)}')
print('--------------------------------------')
selected_features = pd.Series(lassocv.coef_, index=X.columns)
print("Selected Features (non-zero coefficients):")
print(selected_features[selected_features != 0].sort_values(ascending=False))

lasso_r2_score: 0.3062219429389753
--------------------------------------
Selected Features (non-zero coefficients):
CC    0.559458
KM   -0.056104
dtype: float64


In [15]:
# As we can see lasso regression deleted all columns except CC and KM, we will try with RidgeCV
ridge_cv = RidgeCV(cv=5)
ridge_cv.fit(X_train,y_train)
y_pred_ridge = ridge_cv.predict(X_test)
print(f'ridge_r2_score: {r2_score(y_pred_ridge,y_test)}')

ridge_r2_score: 0.8819446677043363


##### The value of r2 score of linear regression is 0.8834 and ridge is 0.8819. 
##### So we can use ridge or linear regression model to predict the prices of cars.