**Business Problem**\
How can revenue from car sales be increased by accurately predicting a car's price from its specifications?

**Objective**\
Build a regression model that predicts a car's price from its specifications.

In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [17]:
# Load the car price dataset from the CSV file (path is relative to the
# notebook's working directory), then print a per-column summary of
# non-null counts and dtypes.
df = pd.read_csv("CarPrice_Assignment.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

## Data Cleansing

In [18]:
# Check Missing Value
# Count the missing (NaN) entries in each column; a column showing 0 has no
# missing values.
df.isna().sum()

car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

In [19]:
# Check Outlier
# Apply the 1.5 * IQR rule to every int64/float64 column: a value counts as an
# outlier when it lies more than 1.5 * IQR below the first quartile or above
# the third quartile.
df_num = df.select_dtypes(["int64", "float64"])
col_num = df_num.columns

for col in col_num:
    q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    iqr = q3 - q1
    print(f"IQR for {col} = {iqr}")

    fence_low = q1 - 1.5 * iqr
    fence_high = q3 + 1.5 * iqr

    # Rows whose value falls strictly outside either fence.
    below_fence = df[col] < fence_low
    above_fence = df[col] > fence_high
    outliers = df[above_fence | below_fence]

    # The outliers are only reported, not dropped: since we don't have much
    # data, df is deliberately left unfiltered.
    print(outliers.shape)

IQR for car_ID = 102.0
(0, 26)
IQR for symboling = 2.0
(0, 26)
IQR for wheelbase = 7.900000000000006
(3, 26)
IQR for carlength = 16.799999999999983
(1, 26)
IQR for carwidth = 2.8000000000000114
(8, 26)
IQR for carheight = 3.5
(0, 26)
IQR for curbweight = 790.0
(0, 26)
IQR for enginesize = 44.0
(10, 26)
IQR for boreratio = 0.43000000000000016
(0, 26)
IQR for stroke = 0.30000000000000027
(20, 26)
IQR for compressionratio = 0.8000000000000007
(28, 26)
IQR for horsepower = 46.0
(6, 26)
IQR for peakrpm = 700.0
(2, 26)
IQR for citympg = 11.0
(2, 26)
IQR for highwaympg = 9.0
(3, 26)
IQR for price = 8715.0
(15, 26)


In [20]:
# Re-inspect the frame after the outlier check. That step only reported the
# outliers and did not filter df, so the summary should match the one printed
# right after loading.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

In [22]:
# Check Duplicate
# Count the rows that are exact copies (across all columns) of an earlier row;
# 0 means there are no fully duplicated records.
df.duplicated().sum()

0