### Data description
| Attribute          | Description                                                             |
|--------------------|-------------------------------------------------------------------------|
| Name               | The brand and model of the car                                          |
| Location           | The location in which the car is being sold or is available for purchase|
| Year               | The year or edition of the model                                        |
| Kilometers_Driven  | The total kilometers driven in the car by the previous owner(s) in KM   |
| Fuel_Type          | The type of fuel used by the car                                        |
| Transmission       | The type of transmission used by the car                                |
| Owner_Type         | Whether the ownership is Firsthand, Second hand or other                |
| Mileage            | The standard mileage offered by the car company in kmpl or km/kg        |
| Engine             | The displacement volume of the engine in cc                             |
| Power              | The maximum power of the engine in bhp                                  |
| Seats              | The number of seats in the car                                          |
| New_Price          | Price of new model                                                      |
| Price              | The price of the used car in INR Lakhs                                  |


# Data Understanding

In [725]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline

In [726]:
# load dataset
df = pd.read_csv('train.csv')
df

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.50
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.50
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.00
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6014,Maruti Swift VDI,Delhi,2014,27365,Diesel,Manual,First,28.4 kmpl,1248 CC,74 bhp,5.0,7.88 Lakh,4.75
6015,Hyundai Xcent 1.1 CRDi S,Jaipur,2015,100000,Diesel,Manual,First,24.4 kmpl,1120 CC,71 bhp,5.0,,4.00
6016,Mahindra Xylo D4 BSIV,Jaipur,2012,55000,Diesel,Manual,Second,14.0 kmpl,2498 CC,112 bhp,8.0,,2.90
6017,Maruti Wagon R VXI,Kolkata,2013,46000,Petrol,Manual,First,18.9 kmpl,998 CC,67.1 bhp,5.0,,2.65


In [727]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6017 non-null   object 
 8   Engine             5983 non-null   object 
 9   Power              5983 non-null   object 
 10  Seats              5977 non-null   float64
 11  New_Price          824 non-null    object 
 12  Price              6019 non-null   float64
dtypes: float64(2), int64(2), object(9)
memory usage: 611.4+ KB


In [728]:
# Check numerical columns statistics
df.describe()

Unnamed: 0,Year,Kilometers_Driven,Seats,Price
count,6019.0,6019.0,5977.0,6019.0
mean,2013.358199,58738.38,5.278735,9.479468
std,3.269742,91268.84,0.80884,11.187917
min,1998.0,171.0,0.0,0.44
25%,2011.0,34000.0,5.0,3.5
50%,2014.0,53000.0,5.0,5.64
75%,2016.0,73000.0,5.0,9.95
max,2019.0,6500000.0,10.0,160.0


In [729]:
# assuming kmpl = 0.8 km/kg

df.Mileage

0       26.6 km/kg
1       19.67 kmpl
2        18.2 kmpl
3       20.77 kmpl
4        15.2 kmpl
           ...    
6014     28.4 kmpl
6015     24.4 kmpl
6016     14.0 kmpl
6017     18.9 kmpl
6018    25.44 kmpl
Name: Mileage, Length: 6019, dtype: object

In [730]:
for i in df.Mileage:
    print(i)

26.6 km/kg
19.67 kmpl
18.2 kmpl
20.77 kmpl
15.2 kmpl
21.1 km/kg
23.08 kmpl
11.36 kmpl
20.54 kmpl
22.3 kmpl
21.56 kmpl
16.8 kmpl
25.2 kmpl
12.7 kmpl
0.0 kmpl
13.5 kmpl
25.8 kmpl
28.4 kmpl
20.45 kmpl
14.84 kmpl
22.69 kmpl
23.65 kmpl
13.53 kmpl
18.5 kmpl
14.4 kmpl
16.8 kmpl
23.08 kmpl
20.92 kmpl
17.5 kmpl
12.8 kmpl
19.01 kmpl
14.53 kmpl
11.18 kmpl
12.4 kmpl
16.09 kmpl
14.0 kmpl
24.3 kmpl
18.15 kmpl
11.74 kmpl
22.07 kmpl
19.7 kmpl
25.4 kmpl
25.32 kmpl
18.5 kmpl
14.62 kmpl
14.28 kmpl
14.9 kmpl
11.25 kmpl
24.4 kmpl
16.55 kmpl
17.11 kmpl
22.9 kmpl
17.8 kmpl
28.4 kmpl
18.9 kmpl
15.04 kmpl
25.17 kmpl
20.36 kmpl
13.29 kmpl
18.2 kmpl
13.68 kmpl
20.0 kmpl
11.74 kmpl
15.8 kmpl
25.0 kmpl
16.55 kmpl
16.4 kmpl
0.0 kmpl
24.52 kmpl
22.1 kmpl
8.5 kmpl
15.1 kmpl
16.95 kmpl
19.64 kmpl
16.5 kmpl
18.53 kmpl
17.8 kmpl
12.4 kmpl
12.8 kmpl
0.0 kmpl
22.9 kmpl
17.57 kmpl
18.0 kmpl
20.0 kmpl
23.2 kmpl
17.8 kmpl
16.73 kmpl
20.36 kmpl
18.9 kmpl
17.0 kmpl
17.8 kmpl
13.0 kmpl
17.68 kmpl
22.7 kmpl
15.1 kmpl
25.8 kmpl
2

In [731]:
# Create function to convert from kmpl to km/kg and remove any string like 'kmpl' or 'km/kg'

def mileage_cleaning(x):

    x = str(x)
    if 'kmpl' in x:

        x = x.split()[0]
        x = float(x)
        x = x * 0.8
        return x
    
    elif 'km/kg' in x:

        x = str(x).split()[0]
        x = float(x)
        return x

# This code apply the function on the column just on the fly, but not saved in the dataframe
df.Mileage.apply(mileage_cleaning)

0       26.600
1       15.736
2       14.560
3       16.616
4       12.160
         ...  
6014    22.720
6015    19.520
6016    11.200
6017    15.120
6018    20.352
Name: Mileage, Length: 6019, dtype: float64

In [732]:
# This code saves the changes to the column in the dataframe
df.Mileage = df.Mileage.apply(mileage_cleaning)

In [733]:
# Create function for both columns `Engine` and `Power` to return only the numeric values

def col_cleaning(x):

    x = str(x)
    if 'null' not in x:
        x = x.split()[0]
        x = float(x)
        return x

    else:
        None

df.Engine = df.Engine.apply(col_cleaning)

df.Power = df.Power.apply(col_cleaning)

In [734]:
df

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.600,998.0,58.16,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,15.736,1582.0,126.20,5.0,,12.50
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,14.560,1199.0,88.70,5.0,8.61 Lakh,4.50
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,16.616,1248.0,88.76,7.0,,6.00
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,12.160,1968.0,140.80,5.0,,17.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6014,Maruti Swift VDI,Delhi,2014,27365,Diesel,Manual,First,22.720,1248.0,74.00,5.0,7.88 Lakh,4.75
6015,Hyundai Xcent 1.1 CRDi S,Jaipur,2015,100000,Diesel,Manual,First,19.520,1120.0,71.00,5.0,,4.00
6016,Mahindra Xylo D4 BSIV,Jaipur,2012,55000,Diesel,Manual,Second,11.200,2498.0,112.00,8.0,,2.90
6017,Maruti Wagon R VXI,Kolkata,2013,46000,Petrol,Manual,First,15.120,998.0,67.10,5.0,,2.65


In [735]:
# Check statistics for numerical features

df.describe()

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Price
count,6019.0,6019.0,6017.0,5983.0,5876.0,5977.0,6019.0
mean,2013.358199,58738.38,14.561725,1621.27645,113.25305,5.278735,9.479468
std,3.269742,91268.84,3.794378,601.355233,53.874957,0.80884,11.187917
min,1998.0,171.0,0.0,72.0,34.2,0.0,0.44
25%,2011.0,34000.0,12.208,1198.0,75.0,5.0,3.5
50%,2014.0,53000.0,14.528,1493.0,97.7,5.0,5.64
75%,2016.0,73000.0,16.88,1984.0,138.1,5.0,9.95
max,2019.0,6500000.0,33.54,5998.0,560.0,10.0,160.0


In [736]:
# Check statistics for categorical features

df.describe(include= 'object')

Unnamed: 0,Name,Location,Fuel_Type,Transmission,Owner_Type,New_Price
count,6019,6019,6019,6019,6019,824
unique,1876,11,5,2,4,540
top,Mahindra XUV500 W8 2WD,Mumbai,Diesel,Manual,First,95.13 Lakh
freq,49,790,3205,4299,4929,6


### Data Cleaning

In [737]:
for col in df.columns:
    fig = px.histogram(df, x= col)
    fig.show()

In [738]:
# Remove this outlier row from Kilometers_Driven column due to inconsistency
error_idx = df.Kilometers_Driven[df.Kilometers_Driven == 6.500000e+06].index[0]
error_idx

2328

In [739]:
# After geting the index of the row, we drop it and save the dataframe
df = df.drop(error_idx, axis= 0)
df

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.600,998.0,58.16,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,15.736,1582.0,126.20,5.0,,12.50
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,14.560,1199.0,88.70,5.0,8.61 Lakh,4.50
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,16.616,1248.0,88.76,7.0,,6.00
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,12.160,1968.0,140.80,5.0,,17.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6014,Maruti Swift VDI,Delhi,2014,27365,Diesel,Manual,First,22.720,1248.0,74.00,5.0,7.88 Lakh,4.75
6015,Hyundai Xcent 1.1 CRDi S,Jaipur,2015,100000,Diesel,Manual,First,19.520,1120.0,71.00,5.0,,4.00
6016,Mahindra Xylo D4 BSIV,Jaipur,2012,55000,Diesel,Manual,Second,11.200,2498.0,112.00,8.0,,2.90
6017,Maruti Wagon R VXI,Kolkata,2013,46000,Petrol,Manual,First,15.120,998.0,67.10,5.0,,2.65


### Check Missing values

In [740]:
# Check missing values as percentage
round(df.isna().mean() * 100, 2)

Name                  0.00
Location              0.00
Year                  0.00
Kilometers_Driven     0.00
Fuel_Type             0.00
Transmission          0.00
Owner_Type            0.00
Mileage               0.03
Engine                0.60
Power                 2.38
Seats                 0.70
New_Price            86.31
Price                 0.00
dtype: float64

In [741]:
# Drop New_Price column due to extreme percentage of Missing Values (If you filled 86% percent of the column by guessing, the data will be meaningless)
df = df.drop('New_Price', axis= 1)
df

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.600,998.0,58.16,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,15.736,1582.0,126.20,5.0,12.50
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,14.560,1199.0,88.70,5.0,4.50
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,16.616,1248.0,88.76,7.0,6.00
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,12.160,1968.0,140.80,5.0,17.74
...,...,...,...,...,...,...,...,...,...,...,...,...
6014,Maruti Swift VDI,Delhi,2014,27365,Diesel,Manual,First,22.720,1248.0,74.00,5.0,4.75
6015,Hyundai Xcent 1.1 CRDi S,Jaipur,2015,100000,Diesel,Manual,First,19.520,1120.0,71.00,5.0,4.00
6016,Mahindra Xylo D4 BSIV,Jaipur,2012,55000,Diesel,Manual,Second,11.200,2498.0,112.00,8.0,2.90
6017,Maruti Wagon R VXI,Kolkata,2013,46000,Petrol,Manual,First,15.120,998.0,67.10,5.0,2.65


In [742]:
df.Name.value_counts().shape[0] / df.shape[0] * 100

31.173147224991695

In [743]:
df.Location.value_counts(normalize= True) * 100

Location
Mumbai        13.127285
Hyderabad     12.329678
Kochi         10.817547
Coimbatore    10.568295
Pune          10.335660
Delhi          9.205716
Kolkata        8.889997
Chennai        8.192090
Jaipur         6.862745
Bangalore      5.948820
Ahmedabad      3.722167
Name: proportion, dtype: float64

In [744]:
df.Year.value_counts()

Year
2014    797
2015    744
2016    741
2013    649
2017    586
2012    580
2011    466
2010    342
2018    298
2009    198
2008    174
2007    125
2019    102
2006     78
2005     57
2004     31
2003     17
2002     15
2001      8
2000      4
1998      4
1999      2
Name: count, dtype: int64

In [745]:
df.Year = df.Year.apply(lambda x : 2005 if x <= 2005 else x)

In [746]:
px.histogram(df, x= 'Kilometers_Driven')

In [747]:
df.Fuel_Type.value_counts()

Fuel_Type
Diesel      3204
Petrol      2746
CNG           56
LPG           10
Electric       2
Name: count, dtype: int64

In [748]:
fuel_idx = df[(df.Fuel_Type == 'LPG') | (df.Fuel_Type == 'Electric')].index
fuel_idx

Index([5, 936, 987, 2278, 2385, 2436, 2941, 3595, 4446, 4904, 5506, 5997], dtype='int64')

In [749]:
df.drop(fuel_idx, axis= 0, inplace= True)

df.reset_index(inplace= True, drop= True)

In [750]:
df.Fuel_Type.value_counts()

Fuel_Type
Diesel    3204
Petrol    2746
CNG         56
Name: count, dtype: int64

In [751]:
df.Transmission.value_counts()

Transmission
Manual       4289
Automatic    1717
Name: count, dtype: int64

In [752]:
df.Owner_Type.value_counts(normalize= True) * 100

Owner_Type
First             81.884782
Second            16.083916
Third              1.881452
Fourth & Above     0.149850
Name: proportion, dtype: float64

In [753]:
def owner_func(x):

    if x == 'Fourth & Above' or x == 'Third':
        return 'Third & Above'
    else:
        return x
    
df.Owner_Type = df.Owner_Type.apply(owner_func)

In [754]:
df.Seats.value_counts()[df.Seats.value_counts() < 0.01 * df.shape[0]]

Seats
6.0     31
2.0     16
10.0     5
9.0      3
0.0      1
Name: count, dtype: int64

In [755]:
df[df.Seats == 0]

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
3990,Audi A4 3.2 FSI Tiptronic Quattro,Hyderabad,2012,125000,Petrol,Automatic,First,8.4,3197.0,,0.0,18.0


In [756]:
df.drop(3990, axis= 0, inplace= True)

In [757]:
seats_idx = df[(df.Seats == 6) | (df.Seats == 2) | (df.Seats == 10) | (df.Seats == 9)].index
seats_idx

Index([  41,   48,  133,  556,  596,  692,  797,  813,  914,  916,  925,  949,
       1075, 1285, 1344, 1539, 1904, 1906, 2050, 2092, 2100, 2264, 2301, 2302,
       2308, 2354, 2553, 2568, 2689, 2786, 3357, 3425, 3495, 3499, 3505, 3561,
       4099, 4111, 4191, 4320, 4448, 4559, 4681, 4712, 4801, 4883, 5283, 5507,
       5541, 5724, 5769, 5795, 5848, 5907, 5931],
      dtype='int64')

In [758]:
df.drop(seats_idx, axis= 0, inplace= True)

df.reset_index(inplace= True, drop= True)

In [759]:
df.Seats.value_counts()

Seats
5.0    5001
7.0     674
8.0     134
4.0      99
Name: count, dtype: int64

In [760]:
df

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.600,998.0,58.16,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,15.736,1582.0,126.20,5.0,12.50
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,14.560,1199.0,88.70,5.0,4.50
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,16.616,1248.0,88.76,7.0,6.00
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,12.160,1968.0,140.80,5.0,17.74
...,...,...,...,...,...,...,...,...,...,...,...,...
5945,Maruti Swift VDI,Delhi,2014,27365,Diesel,Manual,First,22.720,1248.0,74.00,5.0,4.75
5946,Hyundai Xcent 1.1 CRDi S,Jaipur,2015,100000,Diesel,Manual,First,19.520,1120.0,71.00,5.0,4.00
5947,Mahindra Xylo D4 BSIV,Jaipur,2012,55000,Diesel,Manual,Second,11.200,2498.0,112.00,8.0,2.90
5948,Maruti Wagon R VXI,Kolkata,2013,46000,Petrol,Manual,First,15.120,998.0,67.10,5.0,2.65


In [761]:
df.duplicated().sum()

0

In [762]:
df['Brand'] = df.Name.str.split().str[0]
df['Brand']

0          Maruti
1         Hyundai
2           Honda
3          Maruti
4            Audi
          ...    
5945       Maruti
5946      Hyundai
5947     Mahindra
5948       Maruti
5949    Chevrolet
Name: Brand, Length: 5950, dtype: object

In [763]:
df

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Brand
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.600,998.0,58.16,5.0,1.75,Maruti
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,15.736,1582.0,126.20,5.0,12.50,Hyundai
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,14.560,1199.0,88.70,5.0,4.50,Honda
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,16.616,1248.0,88.76,7.0,6.00,Maruti
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,12.160,1968.0,140.80,5.0,17.74,Audi
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5945,Maruti Swift VDI,Delhi,2014,27365,Diesel,Manual,First,22.720,1248.0,74.00,5.0,4.75,Maruti
5946,Hyundai Xcent 1.1 CRDi S,Jaipur,2015,100000,Diesel,Manual,First,19.520,1120.0,71.00,5.0,4.00,Hyundai
5947,Mahindra Xylo D4 BSIV,Jaipur,2012,55000,Diesel,Manual,Second,11.200,2498.0,112.00,8.0,2.90,Mahindra
5948,Maruti Wagon R VXI,Kolkata,2013,46000,Petrol,Manual,First,15.120,998.0,67.10,5.0,2.65,Maruti


In [764]:
df.Brand.value_counts()[df.Brand.value_counts() < 0.01 * df.shape[0]].index

Index(['Land', 'Jaguar', 'Fiat', 'Mini', 'Volvo', 'Porsche', 'Jeep',
       'Mitsubishi', 'Datsun', 'Force', 'ISUZU', 'Ambassador', 'Isuzu',
       'Bentley'],
      dtype='object', name='Brand')

In [765]:
def brand_func(x):

    if x in ['Land', 'Jaguar', 'Fiat', 'Mini', 'Volvo', 'Porsche', 'Jeep','Mitsubishi', 'Datsun', 'Force', 'ISUZU', 'Ambassador', 'Isuzu','Bentley']:
        return 'Other'
    else:
        return x
    
df.Brand.apply(brand_func).value_counts(normalize= True) * 100

Brand
Maruti           20.268908
Hyundai          18.521008
Honda            10.218487
Toyota            6.840336
Volkswagen        5.294118
Mercedes-Benz     5.243697
Ford              5.042017
BMW               4.436975
Mahindra          4.235294
Other             4.000000
Audi              3.915966
Tata              3.109244
Skoda             2.907563
Renault           2.436975
Chevrolet         2.000000
Nissan            1.529412
Name: proportion, dtype: float64

In [766]:
df.Brand = df.Brand.apply(brand_func)

In [767]:
df.drop('Name', axis= 1, inplace= True)

In [768]:
df.duplicated().sum()

2

In [769]:
df.drop_duplicates(inplace= True)

df.reset_index(inplace= True, drop= True)

In [770]:
df.duplicated().sum()

0

In [771]:
df.isna().sum()

Location               0
Year                   0
Kilometers_Driven      0
Fuel_Type              0
Transmission           0
Owner_Type             0
Mileage                0
Engine                36
Power                136
Seats                 42
Price                  0
Brand                  0
dtype: int64

In [772]:
df.isna().mean() * 100

Location             0.000000
Year                 0.000000
Kilometers_Driven    0.000000
Fuel_Type            0.000000
Transmission         0.000000
Owner_Type           0.000000
Mileage              0.000000
Engine               0.605245
Power                2.286483
Seats                0.706120
Price                0.000000
Brand                0.000000
dtype: float64

In [773]:
df.dropna().shape[0] / df.shape[0] * 100

97.67989240080699

In [774]:
df.dropna(inplace= True)

df.reset_index(inplace= True, drop= True)

In [775]:
df

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Brand
0,Mumbai,2010,72000,CNG,Manual,First,26.600,998.0,58.16,5.0,1.75,Maruti
1,Pune,2015,41000,Diesel,Manual,First,15.736,1582.0,126.20,5.0,12.50,Hyundai
2,Chennai,2011,46000,Petrol,Manual,First,14.560,1199.0,88.70,5.0,4.50,Honda
3,Chennai,2012,87000,Diesel,Manual,First,16.616,1248.0,88.76,7.0,6.00,Maruti
4,Coimbatore,2013,40670,Diesel,Automatic,Second,12.160,1968.0,140.80,5.0,17.74,Audi
...,...,...,...,...,...,...,...,...,...,...,...,...
5805,Delhi,2014,27365,Diesel,Manual,First,22.720,1248.0,74.00,5.0,4.75,Maruti
5806,Jaipur,2015,100000,Diesel,Manual,First,19.520,1120.0,71.00,5.0,4.00,Hyundai
5807,Jaipur,2012,55000,Diesel,Manual,Second,11.200,2498.0,112.00,8.0,2.90,Mahindra
5808,Kolkata,2013,46000,Petrol,Manual,First,15.120,998.0,67.10,5.0,2.65,Maruti


In [776]:
df.isna().sum()

Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
Price                0
Brand                0
dtype: int64

### Data Preprocessing

### Split Data into Input Features and Target Variable

In [777]:
x = df.drop('Price', axis= 1)
y = df['Price']

### Split Data into Train and Test

In [778]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state= 0)

In [779]:
x_train

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Brand
3095,Delhi,2015,62088,Petrol,Manual,First,14.880,1197.0,85.80,5.0,Maruti
4278,Chennai,2013,107000,Petrol,Manual,Second,13.560,1498.0,97.70,5.0,Nissan
1581,Bangalore,2011,94821,Diesel,Automatic,Second,16.000,1968.0,138.10,5.0,Skoda
3188,Pune,2017,40158,Diesel,Manual,First,15.736,1582.0,126.20,5.0,Hyundai
4612,Delhi,2013,66314,Diesel,Manual,First,15.896,1461.0,83.80,5.0,Renault
...,...,...,...,...,...,...,...,...,...,...,...
4931,Jaipur,2015,55697,Diesel,Manual,First,18.032,1396.0,88.73,5.0,Hyundai
3264,Mumbai,2014,58002,Diesel,Manual,First,15.712,1461.0,108.45,5.0,Renault
1653,Chennai,2010,160000,Diesel,Manual,Second,15.200,1248.0,93.00,5.0,Other
2607,Pune,2016,6000,Petrol,Manual,First,13.752,1197.0,81.86,5.0,Hyundai


In [780]:
x_test

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Brand
2772,Pune,2011,51482,Diesel,Manual,First,12.336,1405.0,70.00,8.0,Tata
5113,Coimbatore,2013,38995,Diesel,Manual,First,15.440,1248.0,73.90,5.0,Maruti
3791,Pune,2013,95500,Diesel,Manual,First,16.432,1598.0,103.60,5.0,Volkswagen
3669,Kolkata,2014,45000,Diesel,Manual,First,19.360,1498.0,98.60,7.0,Honda
3889,Kochi,2015,35314,Diesel,Manual,First,18.560,1248.0,73.94,5.0,Maruti
...,...,...,...,...,...,...,...,...,...,...,...
4265,Pune,2014,90000,Petrol,Manual,Second,16.288,1197.0,78.90,5.0,Hyundai
3482,Coimbatore,2018,20632,Petrol,Manual,First,14.880,1197.0,81.83,5.0,Hyundai
3705,Ahmedabad,2011,70000,Diesel,Manual,Second,9.640,2179.0,120.00,8.0,Mahindra
3638,Hyderabad,2008,90000,Diesel,Manual,Second,13.440,1493.0,110.00,5.0,Hyundai


In [781]:
y_train

3095     4.15
4278     2.90
1581     6.75
3188    13.75
4612     5.50
        ...  
4931     5.50
3264     5.75
1653     2.50
2607     7.25
2732     5.18
Name: Price, Length: 4648, dtype: float64

In [782]:
y_test

2772     2.50
5113     6.46
3791     5.00
3669     5.00
3889     4.27
        ...  
4265     2.80
3482     7.58
3705     7.45
3638     2.25
899     20.00
Name: Price, Length: 1162, dtype: float64

#### Preprocessing for Numerical Features

In [783]:
# Scaling
scaling_cols = x_train.select_dtypes(exclude= 'object').columns
scaling_cols

Index(['Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats'], dtype='object')

In [784]:
from sklearn.preprocessing import RobustScaler

rc = RobustScaler()

x_train[scaling_cols] = rc.fit_transform(x_train[scaling_cols])

x_test[scaling_cols] = rc.transform(x_test[scaling_cols])

In [785]:
x_train

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Brand
3095,Delhi,0.25,0.242041,Petrol,Manual,First,0.017241,-0.384416,-0.127183,0.0,Maruti
4278,Chennai,-0.25,1.377523,Petrol,Manual,Second,-0.267241,0.006494,0.064397,0.0,Nissan
1581,Bangalore,-0.75,1.069609,Diesel,Automatic,Second,0.258621,0.616883,0.714803,0.0,Skoda
3188,Pune,0.75,-0.312402,Diesel,Manual,First,0.201724,0.115584,0.523223,0.0,Hyundai
4612,Delhi,-0.25,0.348884,Diesel,Manual,First,0.236207,-0.041558,-0.159382,0.0,Renault
...,...,...,...,...,...,...,...,...,...,...,...
4931,Jaipur,0.25,0.080461,Diesel,Manual,First,0.696552,-0.125974,-0.080013,0.0,Hyundai
3264,Mumbai,0.00,0.138737,Diesel,Manual,First,0.196552,-0.041558,0.237463,0.0,Renault
1653,Chennai,-1.00,2.717488,Diesel,Manual,Second,0.086207,-0.318182,-0.011269,0.0,Other
2607,Pune,0.50,-1.175997,Petrol,Manual,First,-0.225862,-0.384416,-0.190614,0.0,Hyundai


### Handle Categorical Features

#### Handling Categorical (Nominal)

In [786]:
x_train.Location.value_counts()

Location
Mumbai        621
Hyderabad     564
Kochi         505
Coimbatore    491
Pune          476
Delhi         426
Kolkata       411
Chennai       375
Jaipur        320
Bangalore     275
Ahmedabad     184
Name: count, dtype: int64

In [787]:
x_train.Fuel_Type.value_counts()

Fuel_Type
Diesel    2473
Petrol    2132
CNG         43
Name: count, dtype: int64

In [788]:
x_train.Brand.value_counts()

Brand
Maruti           953
Hyundai          841
Honda            487
Toyota           319
Volkswagen       252
Mercedes-Benz    240
Ford             232
BMW              195
Mahindra         190
Audi             186
Other            182
Skoda            147
Tata             144
Renault          118
Chevrolet         86
Nissan            76
Name: count, dtype: int64

In [789]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(drop='first', sparse_output= False)

x_train_ohe = ohe.fit_transform(x_train[['Location', 'Fuel_Type', 'Transmission']])

x_test_ohe = ohe.transform(x_test[['Location', 'Fuel_Type', 'Transmission']])

In [790]:
x_train_ohe

array([[0., 0., 0., ..., 0., 1., 1.],
       [0., 1., 0., ..., 0., 1., 1.],
       [1., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 1., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [791]:
x_train

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Brand
3095,Delhi,0.25,0.242041,Petrol,Manual,First,0.017241,-0.384416,-0.127183,0.0,Maruti
4278,Chennai,-0.25,1.377523,Petrol,Manual,Second,-0.267241,0.006494,0.064397,0.0,Nissan
1581,Bangalore,-0.75,1.069609,Diesel,Automatic,Second,0.258621,0.616883,0.714803,0.0,Skoda
3188,Pune,0.75,-0.312402,Diesel,Manual,First,0.201724,0.115584,0.523223,0.0,Hyundai
4612,Delhi,-0.25,0.348884,Diesel,Manual,First,0.236207,-0.041558,-0.159382,0.0,Renault
...,...,...,...,...,...,...,...,...,...,...,...
4931,Jaipur,0.25,0.080461,Diesel,Manual,First,0.696552,-0.125974,-0.080013,0.0,Hyundai
3264,Mumbai,0.00,0.138737,Diesel,Manual,First,0.196552,-0.041558,0.237463,0.0,Renault
1653,Chennai,-1.00,2.717488,Diesel,Manual,Second,0.086207,-0.318182,-0.011269,0.0,Other
2607,Pune,0.50,-1.175997,Petrol,Manual,First,-0.225862,-0.384416,-0.190614,0.0,Hyundai


In [792]:
x_train_ohe = pd.DataFrame(x_train_ohe, columns= ohe.get_feature_names_out())

x_test_ohe = pd.DataFrame(x_test_ohe, columns= ohe.get_feature_names_out())

In [793]:
x_train

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Brand
3095,Delhi,0.25,0.242041,Petrol,Manual,First,0.017241,-0.384416,-0.127183,0.0,Maruti
4278,Chennai,-0.25,1.377523,Petrol,Manual,Second,-0.267241,0.006494,0.064397,0.0,Nissan
1581,Bangalore,-0.75,1.069609,Diesel,Automatic,Second,0.258621,0.616883,0.714803,0.0,Skoda
3188,Pune,0.75,-0.312402,Diesel,Manual,First,0.201724,0.115584,0.523223,0.0,Hyundai
4612,Delhi,-0.25,0.348884,Diesel,Manual,First,0.236207,-0.041558,-0.159382,0.0,Renault
...,...,...,...,...,...,...,...,...,...,...,...
4931,Jaipur,0.25,0.080461,Diesel,Manual,First,0.696552,-0.125974,-0.080013,0.0,Hyundai
3264,Mumbai,0.00,0.138737,Diesel,Manual,First,0.196552,-0.041558,0.237463,0.0,Renault
1653,Chennai,-1.00,2.717488,Diesel,Manual,Second,0.086207,-0.318182,-0.011269,0.0,Other
2607,Pune,0.50,-1.175997,Petrol,Manual,First,-0.225862,-0.384416,-0.190614,0.0,Hyundai


In [794]:
x_train_ohe

Unnamed: 0,Location_Bangalore,Location_Chennai,Location_Coimbatore,Location_Delhi,Location_Hyderabad,Location_Jaipur,Location_Kochi,Location_Kolkata,Location_Mumbai,Location_Pune,Fuel_Type_Diesel,Fuel_Type_Petrol,Transmission_Manual
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4643,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4644,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4645,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4646,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0


In [795]:
x_train.reset_index(drop= True, inplace= True)

x_test.reset_index(drop= True, inplace= True)

In [796]:
x_train

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Brand
0,Delhi,0.25,0.242041,Petrol,Manual,First,0.017241,-0.384416,-0.127183,0.0,Maruti
1,Chennai,-0.25,1.377523,Petrol,Manual,Second,-0.267241,0.006494,0.064397,0.0,Nissan
2,Bangalore,-0.75,1.069609,Diesel,Automatic,Second,0.258621,0.616883,0.714803,0.0,Skoda
3,Pune,0.75,-0.312402,Diesel,Manual,First,0.201724,0.115584,0.523223,0.0,Hyundai
4,Delhi,-0.25,0.348884,Diesel,Manual,First,0.236207,-0.041558,-0.159382,0.0,Renault
...,...,...,...,...,...,...,...,...,...,...,...
4643,Jaipur,0.25,0.080461,Diesel,Manual,First,0.696552,-0.125974,-0.080013,0.0,Hyundai
4644,Mumbai,0.00,0.138737,Diesel,Manual,First,0.196552,-0.041558,0.237463,0.0,Renault
4645,Chennai,-1.00,2.717488,Diesel,Manual,Second,0.086207,-0.318182,-0.011269,0.0,Other
4646,Pune,0.50,-1.175997,Petrol,Manual,First,-0.225862,-0.384416,-0.190614,0.0,Hyundai


In [797]:
x_train = pd.concat([x_train, x_train_ohe], axis= 1).drop(['Location', 'Fuel_Type', 'Transmission'], axis= 1)

x_test = pd.concat([x_test, x_test_ohe], axis= 1).drop(['Location', 'Fuel_Type', 'Transmission'], axis= 1)

In [798]:
x_train.duplicated().sum()

11

In [799]:
x_train

Unnamed: 0,Year,Kilometers_Driven,Owner_Type,Mileage,Engine,Power,Seats,Brand,Location_Bangalore,Location_Chennai,...,Location_Delhi,Location_Hyderabad,Location_Jaipur,Location_Kochi,Location_Kolkata,Location_Mumbai,Location_Pune,Fuel_Type_Diesel,Fuel_Type_Petrol,Transmission_Manual
0,0.25,0.242041,First,0.017241,-0.384416,-0.127183,0.0,Maruti,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,-0.25,1.377523,Second,-0.267241,0.006494,0.064397,0.0,Nissan,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,-0.75,1.069609,Second,0.258621,0.616883,0.714803,0.0,Skoda,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.75,-0.312402,First,0.201724,0.115584,0.523223,0.0,Hyundai,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
4,-0.25,0.348884,First,0.236207,-0.041558,-0.159382,0.0,Renault,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4643,0.25,0.080461,First,0.696552,-0.125974,-0.080013,0.0,Hyundai,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4644,0.00,0.138737,First,0.196552,-0.041558,0.237463,0.0,Renault,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4645,-1.00,2.717488,Second,0.086207,-0.318182,-0.011269,0.0,Other,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4646,0.50,-1.175997,First,-0.225862,-0.384416,-0.190614,0.0,Hyundai,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0


In [800]:
# ! pip install category_encoders

In [801]:
from category_encoders import BinaryEncoder

be = BinaryEncoder()

x_train_be = be.fit_transform(x_train[['Brand']])

x_test_be = be.transform(x_test[['Brand']])

In [802]:
x_train_be

Unnamed: 0,Brand_0,Brand_1,Brand_2,Brand_3,Brand_4
0,0,0,0,0,1
1,0,0,0,1,0
2,0,0,0,1,1
3,0,0,1,0,0
4,0,0,1,0,1
...,...,...,...,...,...
4643,0,0,1,0,0
4644,0,0,1,0,1
4645,0,1,1,0,1
4646,0,0,1,0,0


In [803]:
x_test_be

Unnamed: 0,Brand_0,Brand_1,Brand_2,Brand_3,Brand_4
0,1,0,0,0,0
1,0,0,0,0,1
2,0,1,1,1,0
3,0,0,1,1,0
4,0,0,0,0,1
...,...,...,...,...,...
1157,0,0,1,0,0
1158,0,0,1,0,0
1159,0,0,1,1,1
1160,0,0,1,0,0


In [804]:
x_train = pd.concat([x_train, x_train_be], axis= 1).drop('Brand', axis= 1)

x_test = pd.concat([x_test, x_test_be], axis= 1).drop('Brand', axis= 1)

In [805]:
x_train

Unnamed: 0,Year,Kilometers_Driven,Owner_Type,Mileage,Engine,Power,Seats,Location_Bangalore,Location_Chennai,Location_Coimbatore,...,Location_Mumbai,Location_Pune,Fuel_Type_Diesel,Fuel_Type_Petrol,Transmission_Manual,Brand_0,Brand_1,Brand_2,Brand_3,Brand_4
0,0.25,0.242041,First,0.017241,-0.384416,-0.127183,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0,0,0,0,1
1,-0.25,1.377523,Second,-0.267241,0.006494,0.064397,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0,0,0,1,0
2,-0.75,1.069609,Second,0.258621,0.616883,0.714803,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0,0,0,1,1
3,0.75,-0.312402,First,0.201724,0.115584,0.523223,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0,0,1,0,0
4,-0.25,0.348884,First,0.236207,-0.041558,-0.159382,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4643,0.25,0.080461,First,0.696552,-0.125974,-0.080013,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0,0,1,0,0
4644,0.00,0.138737,First,0.196552,-0.041558,0.237463,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0,0,1,0,1
4645,-1.00,2.717488,Second,0.086207,-0.318182,-0.011269,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0,1,1,0,1
4646,0.50,-1.175997,First,-0.225862,-0.384416,-0.190614,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0,0,1,0,0


In [806]:
from sklearn.preprocessing import OrdinalEncoder

ord = OrdinalEncoder(categories= [['Third & Above', 'Second', 'First']])

x_train_ord = ord.fit_transform(x_train[['Owner_Type']])

x_test_ord = ord.transform(x_test[['Owner_Type']])

In [807]:
x_train_ord = pd.DataFrame(x_train_ord, columns= ord.get_feature_names_out())

x_test_ord = pd.DataFrame(x_test_ord, columns= ord.get_feature_names_out())

In [808]:
x_train.drop('Owner_Type', axis= 1, inplace= True)

x_test.drop('Owner_Type', axis= 1, inplace= True)

In [809]:
x_train.duplicated().sum()

12

In [810]:
x_test.duplicated().sum()

2

In [811]:
dup_idx = x_train[x_train.duplicated()].index
dup_idx

Index([1507, 1983, 2476, 2497, 3261, 3512, 4131, 4213, 4421, 4521, 4530, 4586], dtype='int64')

In [812]:
x_train_be.drop(dup_idx, inplace= True)

x_train_be.reset_index(inplace= True, drop= True)

x_train_be

Unnamed: 0,Brand_0,Brand_1,Brand_2,Brand_3,Brand_4
0,0,0,0,0,1
1,0,0,0,1,0
2,0,0,0,1,1
3,0,0,1,0,0
4,0,0,1,0,1
...,...,...,...,...,...
4631,0,0,1,0,0
4632,0,0,1,0,1
4633,0,1,1,0,1
4634,0,0,1,0,0


In [813]:
x_train.drop_duplicates(inplace= True)

x_train.reset_index(inplace= True, drop= True)

In [814]:
x_train.duplicated().sum()

0

In [815]:
y_train.reset_index(drop= True, inplace= True)

y_train.drop(dup_idx, axis= 0, inplace= True)

y_train.reset_index(drop= True, inplace= True)

In [816]:
x_train

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Location_Bangalore,Location_Chennai,Location_Coimbatore,Location_Delhi,...,Location_Mumbai,Location_Pune,Fuel_Type_Diesel,Fuel_Type_Petrol,Transmission_Manual,Brand_0,Brand_1,Brand_2,Brand_3,Brand_4
0,0.25,0.242041,0.017241,-0.384416,-0.127183,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0,0,0,0,1
1,-0.25,1.377523,-0.267241,0.006494,0.064397,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0,0,0,1,0
2,-0.75,1.069609,0.258621,0.616883,0.714803,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0,0,0,1,1
3,0.75,-0.312402,0.201724,0.115584,0.523223,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0,0,1,0,0
4,-0.25,0.348884,0.236207,-0.041558,-0.159382,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4631,0.25,0.080461,0.696552,-0.125974,-0.080013,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0,0,1,0,0
4632,0.00,0.138737,0.196552,-0.041558,0.237463,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0,0,1,0,1
4633,-1.00,2.717488,0.086207,-0.318182,-0.011269,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0,1,1,0,1
4634,0.50,-1.175997,-0.225862,-0.384416,-0.190614,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0,0,1,0,0


In [817]:
dup_idx = x_test[x_test.duplicated()].index
dup_idx

Index([510, 1080], dtype='int64')

In [818]:
x_test_be.drop(dup_idx, inplace= True)

x_test_be.reset_index(inplace= True, drop= True)

x_test_be

Unnamed: 0,Brand_0,Brand_1,Brand_2,Brand_3,Brand_4
0,1,0,0,0,0
1,0,0,0,0,1
2,0,1,1,1,0
3,0,0,1,1,0
4,0,0,0,0,1
...,...,...,...,...,...
1155,0,0,1,0,0
1156,0,0,1,0,0
1157,0,0,1,1,1
1158,0,0,1,0,0


In [819]:
x_test.drop_duplicates(inplace= True)

x_test.reset_index(inplace= True, drop= True)

In [820]:
x_test.duplicated().sum()

0

In [821]:
y_test.reset_index(drop= True, inplace= True)

y_test.drop(dup_idx, axis= 0, inplace= True)

y_test.reset_index(drop= True, inplace= True)

In [822]:
x_test

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Location_Bangalore,Location_Chennai,Location_Coimbatore,Location_Delhi,...,Location_Mumbai,Location_Pune,Fuel_Type_Diesel,Fuel_Type_Petrol,Transmission_Manual,Brand_0,Brand_1,Brand_2,Brand_3,Brand_4
0,-0.75,-0.026104,-0.531034,-0.114286,-0.381550,3.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,1,0,0,0,0
1,-0.25,-0.341805,0.137931,-0.318182,-0.318764,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0,0,0,0,1
2,-0.25,1.086775,0.351724,0.136364,0.159382,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0,1,1,1,0
3,0.00,-0.189984,0.982759,0.006494,0.078886,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0,0,1,1,0
4,0.25,-0.434869,0.810345,-0.318182,-0.318120,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1155,0.00,0.947722,0.320690,-0.384416,-0.238268,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0,0,1,0,0
1156,1.00,-0.806065,0.017241,-0.384416,-0.191097,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0,0,1,0,0
1157,-0.75,0.442075,-1.112069,0.890909,0.423408,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0,0,1,1,1
1158,-1.50,0.947722,-0.293103,0.000000,0.262416,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0,0,1,0,0


In [823]:
y_test

0        2.50
1        6.46
2        5.00
3        5.00
4        4.27
        ...  
1155     2.80
1156     7.58
1157     7.45
1158     2.25
1159    20.00
Name: Price, Length: 1160, dtype: float64

In [824]:
x_train = pd.concat([x_train, x_train_be], axis= 1)

x_test = pd.concat([x_test, x_test_be], axis= 1)

In [825]:
x_train

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Location_Bangalore,Location_Chennai,Location_Coimbatore,Location_Delhi,...,Brand_0,Brand_1,Brand_2,Brand_3,Brand_4,Brand_0.1,Brand_1.1,Brand_2.1,Brand_3.1,Brand_4.1
0,0.25,0.242041,0.017241,-0.384416,-0.127183,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,1,0,0,0,0,1
1,-0.25,1.377523,-0.267241,0.006494,0.064397,0.0,0.0,1.0,0.0,0.0,...,0,0,0,1,0,0,0,0,1,0
2,-0.75,1.069609,0.258621,0.616883,0.714803,0.0,1.0,0.0,0.0,0.0,...,0,0,0,1,1,0,0,0,1,1
3,0.75,-0.312402,0.201724,0.115584,0.523223,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,1,0,0
4,-0.25,0.348884,0.236207,-0.041558,-0.159382,0.0,0.0,0.0,0.0,1.0,...,0,0,1,0,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4631,0.25,0.080461,0.696552,-0.125974,-0.080013,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,1,0,0
4632,0.00,0.138737,0.196552,-0.041558,0.237463,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,1,0,0,1,0,1
4633,-1.00,2.717488,0.086207,-0.318182,-0.011269,0.0,0.0,1.0,0.0,0.0,...,0,1,1,0,1,0,1,1,0,1
4634,0.50,-1.175997,-0.225862,-0.384416,-0.190614,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,1,0,0


### Machine Learning

In [847]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error

le = LinearRegression()

le.fit(x_train, np.log(y_train))

print('Training Score R2 : ', round(le.score(x_train, np.log(y_train)) * 100, 2))   # In this step, it calculates y_predict, then compare it with actual y_train

print('Test Score R2 : ', round(le.score(x_test, np.log(y_test)) * 100, 2))

print('Training Score MSE : ', mean_squared_error(y_train, le.predict(x_train)))   # In this step, it calculates y_predict, then compare it with actual y_train

print('Test Score MSE : ', mean_squared_error(y_test, le.predict(x_test)))

Training Score R2 :  90.0
Test Score R2 :  90.19
Training Score MSE :  157.82178419805754
Test Score MSE :  186.03186154282005


In [848]:
le.predict([x_train.loc[0]])


X does not have valid feature names, but LinearRegression was fitted with feature names



array([1.50377918])

In [827]:
157 ** 0.5

12.529964086141668

In [724]:
y_train.describe()

count    4636.000000
mean        9.383827
std        10.703318
min         0.440000
25%         3.500000
50%         5.750000
75%         9.862500
max        97.070000
Name: Price, dtype: float64

In [718]:
px.histogram(np.log(y_train))

### Lasso

In [838]:
from sklearn.linear_model import Lasso

la = Lasso(0.0001)

la.fit(x_train, np.log(y_train))

print('Training Score R2 : ', round(la.score(x_train, np.log(y_train)) * 100, 2))   # In this step, it calculates y_predict, then compare it with actual y_train

print('Test Score R2 : ', round(la.score(x_test, np.log(y_test)) * 100, 2))

Training Score R2 :  90.0
Test Score R2 :  90.19


In [839]:
from sklearn.linear_model import Ridge

rd = Ridge()

rd.fit(x_train, np.log(y_train))

print('Training Score R2 : ', round(rd.score(x_train, np.log(y_train)) * 100, 2))

print('Test Score R2 : ', round(rd.score(x_test, np.log(y_test)) * 100, 2))

Training Score R2 :  90.0
Test Score R2 :  90.19


In [843]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

poly = PolynomialFeatures(degree= 2)

x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)

le = LinearRegression()

le.fit(x_train_poly, np.log(y_train))

print('Training Score R2 : ', round(le.score(x_train_poly, np.log(y_train)) * 100, 2))

print('Test Score R2 : ', round(le.score(x_test_poly, np.log(y_test)) * 100, 2))

Training Score R2 :  94.16
Test Score R2 :  93.04


In [845]:
x_train.loc[0]

Year                   0.250000
Kilometers_Driven      0.242041
Mileage                0.017241
Engine                -0.384416
Power                 -0.127183
Seats                  0.000000
Location_Bangalore     0.000000
Location_Chennai       0.000000
Location_Coimbatore    0.000000
Location_Delhi         1.000000
Location_Hyderabad     0.000000
Location_Jaipur        0.000000
Location_Kochi         0.000000
Location_Kolkata       0.000000
Location_Mumbai        0.000000
Location_Pune          0.000000
Fuel_Type_Diesel       0.000000
Fuel_Type_Petrol       1.000000
Transmission_Manual    1.000000
Brand_0                0.000000
Brand_1                0.000000
Brand_2                0.000000
Brand_3                0.000000
Brand_4                1.000000
Brand_0                0.000000
Brand_1                0.000000
Brand_2                0.000000
Brand_3                0.000000
Brand_4                1.000000
Name: 0, dtype: float64