## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Read the car-price table into a DataFrame; the path is relative, so the
# CSV must sit in the notebook's working directory.
dataset = pd.read_csv('CarPrice_Assignment.csv')

In [3]:
# (rows, columns) of the frame as loaded.
dataset.shape

(205, 26)

In [4]:
# Preview the first rows to check column names and value formats.
dataset.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


## Removing columns that are not used as features (`car_ID`, `CarName`)

In [5]:
# Remove the row identifier and the free-text car name before modelling.
# The result is rebound (no inplace=True) and errors='ignore' is passed so
# that re-running this cell is a no-op instead of a KeyError on columns
# that an earlier execution already removed.
dataset = dataset.drop(columns=['car_ID', 'CarName'], errors='ignore')

In [6]:
# Shape after the drop: the column count should be two lower than before.
dataset.shape

(205, 24)

In [7]:
# Confirm that car_ID and CarName no longer appear among the columns.
dataset.head()

Unnamed: 0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,3,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,3,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,1,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,2,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,2,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


## The number of missing values in each column

In [8]:
# Per-column count of missing entries; all zeros means no imputation is needed.
dataset.isna().sum()

symboling           0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

## The number of unique values in each column

In [9]:
# Per-column count of distinct values.
dataset.nunique()

symboling             6
fueltype              2
aspiration            2
doornumber            2
carbody               5
drivewheel            3
enginelocation        2
wheelbase            53
carlength            75
carwidth             44
carheight            49
curbweight          171
enginetype            7
cylindernumber        7
enginesize           44
fuelsystem            8
boreratio            38
stroke               37
compressionratio     32
horsepower           59
peakrpm              23
citympg              29
highwaympg           30
price               189
dtype: int64

## The type of each column

In [10]:
# Per-column dtype; the `object` columns are the ones treated as
# categorical in the following cell.
dataset.dtypes

symboling             int64
fueltype             object
aspiration           object
doornumber           object
carbody              object
drivewheel           object
enginelocation       object
wheelbase           float64
carlength           float64
carwidth            float64
carheight           float64
curbweight            int64
enginetype           object
cylindernumber       object
enginesize            int64
fuelsystem           object
boreratio           float64
stroke              float64
compressionratio    float64
horsepower            int64
peakrpm               int64
citympg               int64
highwaympg            int64
price               float64
dtype: object

## The categorical values in each categorical column

In [11]:
# Record the positional index of every object-dtype column and print the
# distinct labels found in each one.
index_categorical_columns = []
for i, (column, dtype) in enumerate(dataset.dtypes.items()):
  if dtype.name != 'object':
    continue
  print(column, ": ", dataset[column].unique())
  index_categorical_columns.append(i)

fueltype :  ['gas' 'diesel']
aspiration :  ['std' 'turbo']
doornumber :  ['two' 'four']
carbody :  ['convertible' 'hatchback' 'sedan' 'wagon' 'hardtop']
drivewheel :  ['rwd' 'fwd' '4wd']
enginelocation :  ['front' 'rear']
enginetype :  ['dohc' 'ohcv' 'ohc' 'l' 'rotor' 'ohcf' 'dohcv']
cylindernumber :  ['four' 'six' 'five' 'three' 'twelve' 'two' 'eight']
fuelsystem :  ['mpfi' '2bbl' 'mfi' '1bbl' 'spfi' '4bbl' 'idi' 'spdi']


In [12]:
# Positions (within dataset.columns) of the object-dtype columns found above.
print(index_categorical_columns)

[1, 2, 3, 4, 5, 6, 12, 13, 15]


## The independent variables and the dependent variable

In [13]:
# Every column except the last one is a predictor; the last column is the target.
n_features = dataset.shape[1] - 1
X = dataset.iloc[:, :n_features].values
y = dataset.iloc[:, n_features].values

In [14]:
# Feature-matrix shape on the first line, target-vector shape on the second.
print(X.shape, y.shape, sep="\n")

(205, 23)
(205,)


In [15]:
# Raw feature matrix, still mixing string labels with numeric values.
print(X)

[[3 'gas' 'std' ... 5000 21 27]
 [3 'gas' 'std' ... 5000 21 27]
 [1 'gas' 'std' ... 5000 19 26]
 ...
 [-1 'gas' 'std' ... 5500 18 23]
 [-1 'diesel' 'turbo' ... 4800 26 27]
 [-1 'gas' 'turbo' ... 5400 19 25]]


## Encoding categorical columns

In [16]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode the object-dtype columns (by position) and pass every other
# column through unchanged. The encoded columns come first in the output.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), index_categorical_columns)],
    remainder="passthrough",
)

# Fit on the raw feature columns of `dataset` (everything except the last
# column) rather than on X itself. With `X = ct.fit_transform(X)`, a second
# execution of this cell would silently re-encode the already-encoded matrix
# using column positions that no longer refer to the categorical columns.
X = ct.fit_transform(dataset.iloc[:, :-1].values)

In [17]:
# The column count grows because each categorical column is replaced by one
# indicator column per category.
print(X.shape)

(205, 52)


In [18]:
# Encoded feature matrix: indicator columns first, passthrough columns after.
print(X)

[[0.0 1.0 1.0 ... 5000 21 27]
 [0.0 1.0 1.0 ... 5000 21 27]
 [0.0 1.0 1.0 ... 5000 19 26]
 ...
 [0.0 1.0 1.0 ... 5500 18 23]
 [1.0 0.0 0.0 ... 4800 26 27]
 [0.0 1.0 0.0 ... 5400 19 25]]


## Splitting the dataset into the training set and the test set

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [20]:
# One shape per line, in the order: X_train, X_test, y_train, y_test.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(164, 52)
(41, 52)
(164,)
(41,)


## Feature scaling

In [21]:
from sklearn.preprocessing import StandardScaler

# Separate scalers for features and target, so predictions can later be
# mapped back to the original target units through sc_y.
sc_X, sc_y = StandardScaler(), StandardScaler()

# Scaling statistics are learned from the training features only and then
# reused unchanged on the test features.
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# The 1-D target is reshaped to a single column before scaling. y_test is
# left in its original units.
# NOTE(review): y_train is overwritten with its scaled version, so executing
# this cell a second time would re-fit sc_y on already-scaled targets.
# Restart the kernel and run all cells instead of re-running this one.
y_train = sc_y.fit_transform(y_train.reshape(-1,1))

## Training the multiple linear regression model on the training set

In [22]:
from sklearn.linear_model import LinearRegression

# Fit a linear model on the standardized training features and the
# standardized training target.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

## Evaluating the model performance on the test set

In [23]:
from sklearn.metrics import r2_score

# The model was trained on a standardized target, so its predictions are in
# scaled units; map them back through sc_y before comparing with the
# unscaled y_test.
y_pred_scaled = regressor.predict(X_test)
y_pred = sc_y.inverse_transform(y_pred_scaled)

score = r2_score(y_test, y_pred)
print(score)

0.8493746738819721
