Import Important libraries

In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Read the data

In [49]:
df = pd.read_csv("car.csv")
df.head()

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats,selling_price
0,BMW,2014,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0,450000
1,Honda,2014,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0,370000
2,Toyota,2006,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0,158000
3,Ford,2010,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0,225000
4,Honda,2007,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0,130000


Split data to Features(x) and label(y)

In [50]:
x = df.iloc[:, :-1].values # Features: Extracting all rows for all columns except the last one (label) 
y = df.iloc[:, -1].values # Label: Extracting all rows for the last column (label)
print('col   null count')
pd.DataFrame(x).isnull().sum() # Display columns with missing data 

col   null count


0       0
1       0
2       0
3       0
4       0
5       0
6       0
7     221
8     221
9     216
10    221
dtype: int64

Take care of Missing data

In [51]:
from sklearn.impute import SimpleImputer

missing_cols = range(7, 11) 
most_freq_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
most_freq_imputer.fit(x[:, missing_cols])
x[:, missing_cols] = most_freq_imputer.transform(x[:, missing_cols])
print('col   null count')
pd.DataFrame(x).isnull().sum()

col   null count


0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
dtype: int64

Encoding the Independent Variable

In [52]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

category_cols = [0, 3, 4, 5, 6]
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), category_cols)], remainder='passthrough')
x = np.array(ct.fit_transform(x))
pd.DataFrame(x)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2014,145500,23.4,1248.0,74.0,5.0
1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,2014,120000,21.14,1498.0,103.52,5.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,2006,140000,17.7,1497.0,78.0,5.0
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2010,127000,23.0,1396.0,90.0,5.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,2007,120000,16.1,1298.0,88.2,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,2013,110000,18.5,1197.0,82.85,5.0
8124,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,2007,119000,16.8,1493.0,110.0,5.0
8125,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2009,120000,19.3,1248.0,73.9,5.0
8126,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2013,25000,23.57,1396.0,70.0,5.0


Splitting the dataset into training set and test set

In [53]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y ,test_size=0.2, random_state=1)

In [54]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)
training_acc = np.ceil(regressor.score(x_train, y_train) * 100)
print("The Training Accuracy = ", training_acc, "%")

testing_acc = np.ceil(regressor.score(x_test, y_test) * 100)
print("The Testing_acc Accuracy = ", testing_acc, "%")

The Training Accuracy =  70.0 %
The Testing_acc Accuracy =  64.0 %
