## Analysis Content
1. [Import the necessary libraries:](#1)
1. [Load the data into a pandas DataFrame:](#2)
1. [Explore & Clean the data:](#3)
1. [Encoding](#4)
1. [Train-Test Split](#5)
1. [Building a Model](#6)



<a id="1"></a>
## Import the necessary libraries:

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import r2_score,mean_squared_error

<a id="2"></a>
## Load the data into a pandas DataFrame:

In [3]:
# Read the car price dataset; the relative path is resolved against the
# current working directory.
df = pd.read_csv("CarPrice.csv")
# Preview the first rows to sanity-check what was loaded.
df.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


<a id="3"></a>
## Explore & Clean the data:

In [4]:
# List every column with its non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,car_ID,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,103.0,0.834146,98.756585,174.049268,65.907805,53.724878,2555.565854,126.907317,3.329756,3.255415,10.142537,104.117073,5125.121951,25.219512,30.75122,13276.710571
std,59.322565,1.245307,6.021776,12.337289,2.145204,2.443522,520.680204,41.642693,0.270844,0.313597,3.97204,39.544167,476.985643,6.542142,6.886443,7988.852332
min,1.0,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,2.54,2.07,7.0,48.0,4150.0,13.0,16.0,5118.0
25%,52.0,0.0,94.5,166.3,64.1,52.0,2145.0,97.0,3.15,3.11,8.6,70.0,4800.0,19.0,25.0,7788.0
50%,103.0,1.0,97.0,173.2,65.5,54.1,2414.0,120.0,3.31,3.29,9.0,95.0,5200.0,24.0,30.0,10295.0
75%,154.0,2.0,102.4,183.1,66.9,55.5,2935.0,141.0,3.58,3.41,9.4,116.0,5500.0,30.0,34.0,16503.0
max,205.0,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,3.94,4.17,23.0,288.0,6600.0,49.0,54.0,45400.0


 <a id="4"></a>
 ## Encoding

In [6]:
# Target: the price column.
y = df["price"]

# Features: every remaining column except the car name and the row id.
non_feature_cols = ["price", "CarName", "car_ID"]
x = df.drop(columns=non_feature_cols)

In [7]:
# Label encoder for turning category strings into integer codes.
le = LabelEncoder()

# Keep only the object-dtype (string-valued) feature columns.
x_cat = x.select_dtypes(include=["object"])
x_cat.head()

Unnamed: 0,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,enginetype,cylindernumber,fuelsystem
0,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,gas,std,four,sedan,4wd,front,ohc,five,mpfi


In [8]:
# Report how many distinct values each categorical column holds.
for col_name, n_unique in x_cat.nunique().items():
    print(f"The no.of unique values in {col_name} are: {n_unique}")

The no.of unique values in fueltype are: 2
The no.of unique values in aspiration are: 2
The no.of unique values in doornumber are: 2
The no.of unique values in carbody are: 5
The no.of unique values in drivewheel are: 3
The no.of unique values in enginelocation are: 2
The no.of unique values in enginetype are: 7
The no.of unique values in cylindernumber are: 7
The no.of unique values in fuelsystem are: 8


In [9]:
# Integer-encode every categorical column.
#
# Each column gets its own fitted LabelEncoder, kept in `encoders`, so the
# category <-> code mapping of every column stays available afterwards
# (`encoders[col].classes_`, `.inverse_transform(...)`, `.transform(new_data)`).
# Refitting one shared encoder per column would keep only the last column's
# mapping.
#
# NOTE(review): integer codes impose an arbitrary ordering on nominal
# categories, which a linear model will treat as meaningful — consider
# one-hot encoding instead.
encoders = {}
for col in x_cat.columns:
    encoders[col] = LabelEncoder()
    x_cat[col] = encoders[col].fit_transform(x_cat[col])
x_cat.head()

Unnamed: 0,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,enginetype,cylindernumber,fuelsystem
0,1,0,1,0,2,0,0,2,5
1,1,0,1,0,2,0,0,2,5
2,1,0,1,2,2,0,5,3,5
3,1,0,0,3,1,0,3,2,5
4,1,0,0,3,0,0,3,1,5


In [10]:
# Keep only the numeric feature columns.
x_num = x.select_dtypes(include="number")
x_num.head()

Unnamed: 0,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
0,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27
1,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27
2,1,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,5000,19,26
3,2,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,5500,24,30
4,2,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,5500,18,22


In [11]:
# Put the encoded categorical columns and the numeric columns side by side.
#
# NOTE(review): this rebinds `df`, which until now held the raw CSV data
# (including `price`). Re-running the earlier cell that reads df['price']
# after this one will fail; a distinct name would keep the notebook
# re-runnable, but the scaling cell below reads `df` and would have to change
# with it.
df = pd.concat(objs=[x_cat, x_num], axis="columns")
df.head()

Unnamed: 0,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,enginetype,cylindernumber,fuelsystem,symboling,...,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
0,1,0,1,0,2,0,0,2,5,3,...,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27
1,1,0,1,0,2,0,0,2,5,3,...,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27
2,1,0,1,2,2,0,5,3,5,1,...,52.4,2823,152,2.68,3.47,9.0,154,5000,19,26
3,1,0,0,3,1,0,3,2,5,2,...,54.3,2337,109,3.19,3.4,10.0,102,5500,24,30
4,1,0,0,3,0,0,3,1,5,2,...,54.3,2824,136,3.19,3.4,8.0,115,5500,18,22


In [12]:
# Standardise every feature column and rebuild a DataFrame with the same
# column names.
#
# NOTE(review): the scaler is fit on all rows, before the train/test split
# performed later in the notebook, so the test rows contribute to the
# mean/std used for scaling. Fitting on the training rows only would avoid
# that leakage.
ss = StandardScaler()
scaled_values = ss.fit_transform(df)
x = pd.DataFrame(scaled_values, columns=df.columns)
x.head()

Unnamed: 0,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,enginetype,cylindernumber,fuelsystem,symboling,...,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
0,0.328798,-0.469295,1.130388,-3.050975,1.21333,-0.121867,-2.865105,-0.147475,0.869568,1.74347,...,-2.020417,-0.014566,0.074449,0.519071,-1.839377,-0.288349,0.174483,-0.26296,-0.646553,-0.546059
1,0.328798,-0.469295,1.130388,-3.050975,1.21333,-0.121867,-2.865105,-0.147475,0.869568,1.74347,...,-2.020417,-0.014566,0.074449,0.519071,-1.839377,-0.288349,0.174483,-0.26296,-0.646553,-0.546059
2,0.328798,-0.469295,1.130388,-0.717207,1.21333,-0.121867,1.88689,1.11221,0.869568,0.133509,...,-0.543527,0.514882,0.604046,-2.40488,0.685946,-0.288349,1.264536,-0.26296,-0.953012,-0.691627
3,0.328798,-0.469295,-0.884652,0.449677,-0.589081,-0.121867,-0.013908,-0.147475,0.869568,0.93849,...,0.235942,-0.420797,-0.431076,-0.517266,0.462183,-0.035973,-0.053668,0.787855,-0.186865,-0.109354
4,0.328798,-0.469295,-0.884652,0.449677,-2.391492,-0.121867,-0.013908,-1.407161,0.869568,0.93849,...,0.235942,0.516807,0.218885,-0.517266,0.462183,-0.540725,0.275883,0.787855,-1.106241,-1.2739


<a id="5"></a>
## Train-Test Split

In [26]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.3, random_state=42
)

# Print the shape of each of the four pieces.
for label, part in (
    ("xtrain", xtrain),
    ("xtest", xtest),
    ("ytrain", ytrain),
    ("ytest", ytest),
):
    print(f"The shape of {label} is: ", part.shape)

The shape of xtrain is:  (143, 23)
The shape of xtest is:  (62, 23)
The shape of ytrain is:  (143,)
The shape of ytest is:  (62,)


<a id="6"></a>
## Building a Model


In [26]:
# Fit a linear regression on the training split.
lr = LinearRegression()
lr_model = lr.fit(xtrain, ytrain)

# Predict on both splits so train and test performance can be compared.
ypred_train, ypred_test = (
    lr_model.predict(features) for features in (xtrain, xtest)
)

In [14]:
# R² as a (train, test) pair.
tuple(
    r2_score(actual, predicted)
    for actual, predicted in ((ytrain, ypred_train), (ytest, ypred_test))
)


(0.9005689762361274, 0.8000166317722498)

In [15]:
# Root mean squared error as a (train, test) pair, in the units of the target.
tuple(
    np.sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in ((ytrain, ypred_train), (ytest, ypred_test))
)

(2461.764170912046, 3722.3276857700134)

In [16]:
# One fitted coefficient per feature, labelled with the training column names.
pd.DataFrame(
    {'Regression Coefficients': lr_model.coef_},
    index=xtrain.columns,
)


Unnamed: 0,Regression Coefficients
fueltype,2386.628635
aspiration,465.550515
doornumber,-862.167864
carbody,-965.349021
drivewheel,693.168881
enginelocation,1211.46469
enginetype,227.634668
cylindernumber,-345.326005
fuelsystem,-151.072757
symboling,380.911781


In [17]:
# Intercept term of the fitted linear model.
lr_model.intercept_


13162.057239986392