In [219]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

#  Read data

In [220]:
df = pd.read_csv('car.csv')

# Data Information

In [221]:
df.shape

(8128, 12)

In [222]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   km_driven      8128 non-null   int64  
 3   fuel           8128 non-null   object 
 4   seller_type    8128 non-null   object 
 5   transmission   8128 non-null   object 
 6   owner          8128 non-null   object 
 7   mileage        7907 non-null   float64
 8   engine         7907 non-null   float64
 9   max_power      7912 non-null   float64
 10  seats          7907 non-null   float64
 11  selling_price  8128 non-null   int64  
dtypes: float64(4), int64(3), object(5)
memory usage: 762.1+ KB


In [223]:
df.head()

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats,selling_price
0,BMW,2014,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0,450000
1,Honda,2014,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0,370000
2,Toyota,2006,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0,158000
3,Ford,2010,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0,225000
4,Honda,2007,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0,130000


# normalize dataframe

In [224]:

scaler = StandardScaler()
scaler.fit( df[['year', 'km_driven', 'mileage', 'engine', 'max_power', 'seats', 'selling_price']])
df[['year', 'km_driven', 'mileage', 'engine', 'max_power', 'seats', 'selling_price']] = scaler.transform( df[['year', 'km_driven', 'mileage', 'engine', 'max_power', 'seats', 'selling_price']])


In [225]:
df.head(50)

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats,selling_price
0,BMW,0.048464,1.338363,Diesel,Individual,Manual,First Owner,0.986209,-0.418003,-0.489051,-0.434297,-0.233529
1,Honda,0.048464,0.887411,Diesel,Individual,Manual,Second Owner,0.426372,0.078143,0.335064,-0.434297,-0.332759
2,Toyota,-1.929775,1.241098,Petrol,Individual,Manual,Third Owner,-0.425769,0.076158,-0.377382,-0.434297,-0.59572
3,Ford,-0.940656,1.011202,Diesel,Individual,Manual,First Owner,0.887123,-0.124284,-0.042376,-0.434297,-0.512615
4,Honda,-1.682495,0.887411,Petrol,Individual,Manual,First Owner,-0.822114,-0.318774,-0.092627,-0.434297,-0.630451
5,BMW,0.790304,-0.438918,Petrol,Individual,Manual,First Owner,0.178657,-0.519216,-0.269622,-0.434297,-0.245933
6,Nissan,-1.682495,1.860052,LPG,Individual,Manual,First Owner,-0.524855,-0.789119,-0.949684,-0.434297,-0.672624
7,Ford,-3.166175,-1.146293,Petrol,Individual,Manual,Second Owner,-0.822114,-1.315034,-1.521987,-1.476477,-0.735883
8,Ford,-0.693376,0.356879,Diesel,Individual,Manual,First Owner,1.033275,-0.187791,-0.681679,-0.434297,-0.357567
9,Honda,-0.198816,1.753945,Diesel,Individual,Manual,First Owner,0.143976,-0.118331,-0.653762,-0.434297,-0.543624


# Split dataframe to features and label

In [226]:
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values

In [227]:
X

array([['BMW', 0.04846418771422057, 1.3383625536597623, ...,
        -0.4180026220963684, -0.4890509248380697, -0.4342967239530142],
       ['Honda', 0.04846418771422057, 0.8874108553930357, ...,
        0.07814288617965935, 0.3350642748356089, -0.4342967239530142],
       ['Toyota', -1.9297751983057385, 1.2410984618767429, ...,
        0.07615830414655525, -0.3773821985950238, -0.4342967239530142],
       ...,
       ['BMW', -1.187935428548254, 0.8874108553930357, ...,
        -0.4180026220963684, -0.4918426429941457, -0.4342967239530142],
       ['Ford', -0.1988157355382743, -0.7926052754045734, ...,
        -0.12428448119695996, -0.6007196510811156, -0.4342967239530142],
       ['Toyota', -0.1988157355382743, -0.7926052754045734, ...,
        -0.12428448119695996, -0.6007196510811156, -0.4342967239530142]],
      dtype=object)

In [228]:
Y

array([-0.2335288 , -0.33275929, -0.5957201 , ..., -0.31787472,
       -0.43198978, -0.43198978])

# impute missing values

In [229]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 7:10])
X[:, 7:10] = imputer.transform(X[:, 7:10])

In [230]:
imputer2 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer2.fit(X[:, 10:11])
X[:, 10:11] = imputer2.transform(X[:, 10:11])

In [231]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,BMW,0.048464,1.338363,Diesel,Individual,Manual,First Owner,0.986209,-0.418003,-0.489051,-0.434297
1,Honda,0.048464,0.887411,Diesel,Individual,Manual,Second Owner,0.426372,0.078143,0.335064,-0.434297
2,Toyota,-1.929775,1.241098,Petrol,Individual,Manual,Third Owner,-0.425769,0.076158,-0.377382,-0.434297
3,Ford,-0.940656,1.011202,Diesel,Individual,Manual,First Owner,0.887123,-0.124284,-0.042376,-0.434297
4,Honda,-1.682495,0.887411,Petrol,Individual,Manual,First Owner,-0.822114,-0.318774,-0.092627,-0.434297
...,...,...,...,...,...,...,...,...,...,...,...
8123,Ford,-0.198816,0.710567,Petrol,Individual,Manual,First Owner,-0.227597,-0.519216,-0.241984,-0.434297
8124,Nissan,-1.682495,0.869726,Diesel,Individual,Manual,Fourth & Above Owner,-0.648713,0.06822,0.515968,-0.434297
8125,BMW,-1.187935,0.887411,Diesel,Individual,Manual,First Owner,-0.029424,-0.418003,-0.491843,-0.434297
8126,Ford,-0.198816,-0.792605,Diesel,Individual,Manual,First Owner,1.028321,-0.124284,-0.60072,-0.434297


In [232]:
pd.DataFrame(X).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       8128 non-null   object
 1   1       8128 non-null   object
 2   2       8128 non-null   object
 3   3       8128 non-null   object
 4   4       8128 non-null   object
 5   5       8128 non-null   object
 6   6       8128 non-null   object
 7   7       8128 non-null   object
 8   8       8128 non-null   object
 9   9       8128 non-null   object
 10  10      8128 non-null   object
dtypes: object(11)
memory usage: 698.6+ KB


# encode catagorical columns

In [233]:
df['name'].value_counts()

Toyota    1686
Ford      1666
Nissan    1620
Honda     1608
BMW       1548
Name: name, dtype: int64

In [234]:
df['fuel'].value_counts()

Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: fuel, dtype: int64

In [235]:
df['transmission'].value_counts()

Manual       7078
Automatic    1050
Name: transmission, dtype: int64

In [236]:
df['seller_type'].value_counts()

Individual          6766
Dealer              1126
Trustmark Dealer     236
Name: seller_type, dtype: int64

In [237]:
df['owner'].value_counts()

First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: owner, dtype: int64

In [238]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0, 3, 4,5, 6])], remainder='passthrough')
X = np.array((ct.fit_transform(X)))

In [239]:
# X = pd.DataFrame(X, columns=('BMW', 'Ford', 'Honda', 'Nissan', 'Toyota', 'CNG', 'Diesel', 'LPG', 'Petrol', 'Dealer', 'Individual', 'Trustmark_Dealer', 'First_Owner', 'Fourth_Above Owner', 'Second_Owner', 'Test_Drive_Car', 'Third_Owner', 'year', 'km_driven', 'transmission', 'mileage', 'engine', 'max_power', 'seats'))
# Y = pd.DataFrame(Y)

In [240]:
pd.DataFrame(X).loc[:, 19]

0       0.048464
1       0.048464
2      -1.929775
3      -0.940656
4      -1.682495
          ...   
8123   -0.198816
8124   -1.682495
8125   -1.187935
8126   -0.198816
8127   -0.198816
Name: 19, Length: 8128, dtype: object

# split data to train and test

In [241]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

In [242]:
X_train.shape

(6502, 25)

In [243]:
X_test.shape

(1626, 25)

# Training Data

In [244]:
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

LinearRegression()

In [245]:
training_acc = np.ceil(regressor.score(X_train, Y_train) * 100)
print("The Training Accuracy = ", training_acc, "%")

The Training Accuracy =  70.0 %


In [246]:
testing_acc = np.ceil(regressor.score(X_test, Y_test) * 100)
print("The Testing_acc Accuracy = ", testing_acc, "%")

The Testing_acc Accuracy =  65.0 %
