In [42]:
# Importing required libraries
import seaborn as snL
import pandas as pd
import numpy as np
from datetime import datetime

In [43]:
# Reading data source
df_csv = pd.read_csv('./train.csv')

In [44]:
# Data Cleaning
# - total count
len(df_csv)

5847

In [45]:
df_csv.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
1,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,13 km/kg,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
2,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
3,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74
4,6,Nissan Micra Diesel XV,Jaipur,2013,86999,Diesel,Manual,First,23.08 kmpl,1461 CC,63.1 bhp,5.0,,3.5


In [46]:
df_csv.isnull().sum()

Unnamed: 0              0
Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 36
Power                  36
Seats                  38
New_Price            5032
Price                   0
dtype: int64

a. Since the median is less impacted by extreme values (outliers) in the dataset, it is a better option than the mean or mode in this situation. Car prices can fluctuate widely, and the mean might be distorted by outliers, which lessens its representativeness of the data's core trend. In contrast, the median represents the middle value after sorting the data and provides a more reliable indicator of central tendency when outliers are present.

In order replace null values in 'New_Price' column, firstly it has be all numeric valued. 



In [47]:
# filtering out 'New_Price' column
df_csv['New_Price'] = df_csv['New_Price'].astype(str)
df_csv['New_Price'] = df_csv['New_Price'].str.extract(r'([\d.]+)').astype(float)

In [48]:
# Calcuating the median of the 'New_Price' column
median_new_price = df_csv['New_Price'].median()
# Replaciong null values
df_csv['New_Price'].fillna(median_new_price, inplace=True)
print(df_csv)

      Unnamed: 0                              Name    Location  Year  \
0              1  Hyundai Creta 1.6 CRDi SX Option        Pune  2015   
1              2                      Honda Jazz V     Chennai  2011   
2              3                 Maruti Ertiga VDI     Chennai  2012   
3              4   Audi A4 New 2.0 TDI Multitronic  Coimbatore  2013   
4              6            Nissan Micra Diesel XV      Jaipur  2013   
...          ...                               ...         ...   ...   
5842        6014                  Maruti Swift VDI       Delhi  2014   
5843        6015          Hyundai Xcent 1.1 CRDi S      Jaipur  2015   
5844        6016             Mahindra Xylo D4 BSIV      Jaipur  2012   
5845        6017                Maruti Wagon R VXI     Kolkata  2013   
5846        6018             Chevrolet Beat Diesel   Hyderabad  2011   

      Kilometers_Driven Fuel_Type Transmission Owner_Type     Mileage  \
0                 41000    Diesel       Manual      First  19.

b. Converting 'Power','Engine','Mileage' columns to numeric values by filtering out the 'bhp','CC','kmpl' respectivitly

In [50]:
# function 
df_csv['Power'].astype(str)
print(df_csv)

def filterOut(value):
    if pd.isnull(value):
        return None
    index = value.find(" ")
    if index != -1:
        numeric = value[:index]
    else:
        numeric = value
    return(numeric)

      Unnamed: 0                              Name    Location  Year  \
0              1  Hyundai Creta 1.6 CRDi SX Option        Pune  2015   
1              2                      Honda Jazz V     Chennai  2011   
2              3                 Maruti Ertiga VDI     Chennai  2012   
3              4   Audi A4 New 2.0 TDI Multitronic  Coimbatore  2013   
4              6            Nissan Micra Diesel XV      Jaipur  2013   
...          ...                               ...         ...   ...   
5842        6014                  Maruti Swift VDI       Delhi  2014   
5843        6015          Hyundai Xcent 1.1 CRDi S      Jaipur  2015   
5844        6016             Mahindra Xylo D4 BSIV      Jaipur  2012   
5845        6017                Maruti Wagon R VXI     Kolkata  2013   
5846        6018             Chevrolet Beat Diesel   Hyderabad  2011   

      Kilometers_Driven Fuel_Type Transmission Owner_Type     Mileage  \
0                 41000    Diesel       Manual      First  19.

In [51]:
df_csv['Mileage'] = df_csv['Mileage'].apply(filterOut)
df_csv['Power'] = df_csv['Power'].apply(filterOut)
df_csv['Engine'] = df_csv['Engine'].apply(filterOut)

In [52]:
print(df_csv)

      Unnamed: 0                              Name    Location  Year  \
0              1  Hyundai Creta 1.6 CRDi SX Option        Pune  2015   
1              2                      Honda Jazz V     Chennai  2011   
2              3                 Maruti Ertiga VDI     Chennai  2012   
3              4   Audi A4 New 2.0 TDI Multitronic  Coimbatore  2013   
4              6            Nissan Micra Diesel XV      Jaipur  2013   
...          ...                               ...         ...   ...   
5842        6014                  Maruti Swift VDI       Delhi  2014   
5843        6015          Hyundai Xcent 1.1 CRDi S      Jaipur  2015   
5844        6016             Mahindra Xylo D4 BSIV      Jaipur  2012   
5845        6017                Maruti Wagon R VXI     Kolkata  2013   
5846        6018             Chevrolet Beat Diesel   Hyderabad  2011   

      Kilometers_Driven Fuel_Type Transmission Owner_Type Mileage Engine  \
0                 41000    Diesel       Manual      First  

c. Converting the 'Fuel_Type' and 'Transmission' into  categorical variables

In [53]:
df_csv=pd.get_dummies(dataframe,columns=['Fuel_Type','Transmission'])
print(df_csv)

      Unnamed: 0                              Name    Location  Year  \
0              1  Hyundai Creta 1.6 CRDi SX Option        Pune  2015   
1              2                      Honda Jazz V     Chennai  2011   
2              3                 Maruti Ertiga VDI     Chennai  2012   
3              4   Audi A4 New 2.0 TDI Multitronic  Coimbatore  2013   
4              6            Nissan Micra Diesel XV      Jaipur  2013   
...          ...                               ...         ...   ...   
5842        6014                  Maruti Swift VDI       Delhi  2014   
5843        6015          Hyundai Xcent 1.1 CRDi S      Jaipur  2015   
5844        6016             Mahindra Xylo D4 BSIV      Jaipur  2012   
5845        6017                Maruti Wagon R VXI     Kolkata  2013   
5846        6018             Chevrolet Beat Diesel   Hyderabad  2011   

      Kilometers_Driven Owner_Type     Mileage   Engine      Power  Seats  \
0                 41000      First  19.67 kmpl  1582 CC  1

d. Adding new feature to the dataset to calculate the current age of the car

In [55]:
current_year = datetime.now().year
df_csv['Car_Age'] = current_year - df_csv['Year']
print(df_csv)

      Unnamed: 0                              Name    Location  Year  \
0              1  Hyundai Creta 1.6 CRDi SX Option        Pune  2015   
1              2                      Honda Jazz V     Chennai  2011   
2              3                 Maruti Ertiga VDI     Chennai  2012   
3              4   Audi A4 New 2.0 TDI Multitronic  Coimbatore  2013   
4              6            Nissan Micra Diesel XV      Jaipur  2013   
...          ...                               ...         ...   ...   
5842        6014                  Maruti Swift VDI       Delhi  2014   
5843        6015          Hyundai Xcent 1.1 CRDi S      Jaipur  2015   
5844        6016             Mahindra Xylo D4 BSIV      Jaipur  2012   
5845        6017                Maruti Wagon R VXI     Kolkata  2013   
5846        6018             Chevrolet Beat Diesel   Hyderabad  2011   

      Kilometers_Driven Owner_Type     Mileage   Engine      Power  Seats  \
0                 41000      First  19.67 kmpl  1582 CC  1

In [57]:
# exporting the result dataset
df_csv.to_csv('result.csv')