In [1]:
# Importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Report the versions of the core libraries used here (numpy, pandas, seaborn, matplotlib)
import matplotlib
tuple(lib.__version__ for lib in (np, pd, sns, matplotlib))

('2.0.2', '2.2.3', '0.13.2', '3.9.4')

### Task 1: Data Preparation and Modeling  

#### Data Loading

In [28]:
# Read the car listings CSV (relative path) into the DataFrame used by the rest of the notebook

cars_csv_path = "data/Cars.csv"
df = pd.read_csv(cars_csv_path)

In [15]:
# Preview the first five rows of the data
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [16]:
# Shape of the data as (rows, columns): rows are samples, columns are features
df.shape

(8128, 13)

In [17]:
# .describe() summarises the numeric columns: count, mean, std, min, 25%, 50%, 75%, max
df.describe()

Unnamed: 0,year,selling_price,km_driven,seats
count,8128.0,8128.0,8128.0,7907.0
mean,2013.804011,638271.8,69819.51,5.416719
std,4.044249,806253.4,56550.55,0.959588
min,1983.0,29999.0,1.0,2.0
25%,2011.0,254999.0,35000.0,5.0
50%,2015.0,450000.0,60000.0,5.0
75%,2017.0,675000.0,98000.0,5.0
max,2020.0,10000000.0,2360457.0,14.0


In [18]:
# .info() lists each column's dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   selling_price  8128 non-null   int64  
 3   km_driven      8128 non-null   int64  
 4   fuel           8128 non-null   object 
 5   seller_type    8128 non-null   object 
 6   transmission   8128 non-null   object 
 7   owner          8128 non-null   object 
 8   mileage        7907 non-null   object 
 9   engine         7907 non-null   object 
 10  max_power      7913 non-null   object 
 11  torque         7906 non-null   object 
 12  seats          7907 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 825.6+ KB


#### Data Cleaning and Preprocessing 

##### For the Feature of Owner

In [9]:
# Inspect the distinct values present in the 'owner' column
df['owner'].unique()

array(['First Owner', 'Second Owner', 'Third Owner',
       'Fourth & Above Owner', 'Test Drive Car'], dtype=object)

In [30]:
# Encode the 'owner' categories as integers (1 = First Owner ... 5 = Test Drive Car).
owner_map = {
    'First Owner': 1,
    'Second Owner': 2,
    'Third Owner': 3,
    'Fourth & Above Owner': 4,
    'Test Drive Car': 5
}  # keys are the raw text labels, values are the corresponding numeric codes

# Only encode while the column still holds text labels. Calling .map() again on the
# already-encoded integers finds no matching keys and would turn every value into NaN,
# so this guard keeps the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(df['owner']):
    # Fail loudly on labels missing from owner_map instead of silently producing NaN.
    unmapped = set(df['owner'].dropna().unique()) - set(owner_map)
    if unmapped:
        raise ValueError(f"Unmapped owner categories: {sorted(unmapped)}")
    df['owner'] = df['owner'].map(owner_map)  # .map() replaces each label with its dictionary value

df['owner'].unique()  # Checking the mapping again

array([1, 2, 3, 4, 5])

##### For the feature of Fuel

In [11]:
# Inspect the distinct values present in the 'fuel' column
df['fuel'].unique()

array(['Diesel', 'Petrol', 'LPG', 'CNG'], dtype=object)

In [31]:
# CNG and LPG report mileage in km/kg rather than the kmpl used for Diesel and Petrol,
# so rows with those fuels are removed.
gas_fuels = ['LPG', 'CNG']
is_gas = df['fuel'].isin(gas_fuels)        # True for rows whose fuel is LPG or CNG
gas_row_labels = df.loc[is_gas].index      # index labels of the rows to remove
df.drop(index=gas_row_labels, inplace=True)  # inplace=True modifies the original dataframe

df['fuel'].unique()  # checking the values in the fuel column after removal

array(['Diesel', 'Petrol'], dtype=object)

##### For the feature of Mileage

In [36]:
# Remove the "kmpl" unit and convert the column to float.
# .str.split() breaks e.g. "18.6 kmpl" into ['18.6', 'kmpl'], .str[0] keeps '18.6',
# and .astype(float) converts it to a number.
# The dtype guard keeps the cell safe to re-run: once the column is numeric, the .str
# accessor would raise AttributeError, so the conversion is skipped.
if not pd.api.types.is_numeric_dtype(df['mileage']):
    df['mileage'] = df['mileage'].str.split().str[0].astype(float)
# Alternative: df['mileage'].str.replace(' kmpl', '', regex=False).astype(float)
#   .str.split():   useful if there are multiple parts of the string you need to process.
#   .str.replace(): more direct for this specific task.

df['mileage'].head() # checking

0    23.40
1    21.14
2    17.70
3    23.00
4    16.10
Name: mileage, dtype: float64

##### For the feature Engine

In [32]:
# Remove the "CC" unit and convert the column to float:
# keep the first whitespace-separated token (e.g. '1248' from "1248 CC") and cast it.
# The dtype guard keeps the cell safe to re-run: once the column is numeric, the .str
# accessor would raise AttributeError, so the conversion is skipped.
if not pd.api.types.is_numeric_dtype(df['engine']):
    df['engine'] = df['engine'].str.split().str[0].astype(float)

df['engine'].head() # checking


0    1248.0
1    1498.0
2    1497.0
3    1396.0
4    1298.0
Name: engine, dtype: float64

##### For the feature of Max power

In [34]:
# Remove the "bhp" unit and convert the column to float:
# keep the first whitespace-separated token (e.g. '103.52' from "103.52 bhp") and cast it.
# The dtype guard keeps the cell safe to re-run: once the column is numeric, the .str
# accessor would raise AttributeError, so the conversion is skipped.

if not pd.api.types.is_numeric_dtype(df['max_power']):
    df['max_power'] = df['max_power'].str.split().str[0].astype(float)

df['max_power'].head() # checking

0     74.00
1    103.52
2     78.00
3     90.00
4     88.20
Name: max_power, dtype: float64

In [37]:
# Preview the first rows after converting owner, mileage, engine and max_power to numeric values
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,1,23.4,1248.0,74.0,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,2,21.14,1498.0,103.52,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,3,17.7,1497.0,78.0,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,1,23.0,1396.0,90.0,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,1,16.1,1298.0,88.2,"11.5@ 4,500(kgm@ rpm)",5.0


##### For the feature brand ~ name in the dataset

In [38]:
# Reduce each car name to its brand: keep the first whitespace-separated word, discard the rest
name_words = df['name'].str.split()
df['name'] = name_words.str.get(0)

df['name'].head()

0     Maruti
1      Skoda
2      Honda
3    Hyundai
4     Maruti
Name: name, dtype: object

In [39]:
# Inspect the distinct values present in the 'name' column (now holding only the first word of each name)
df['name'].unique()

array(['Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault',
       'Mahindra', 'Tata', 'Chevrolet', 'Fiat', 'Datsun', 'Jeep',
       'Mercedes-Benz', 'Mitsubishi', 'Audi', 'Volkswagen', 'BMW',
       'Nissan', 'Lexus', 'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo',
       'Kia', 'Force', 'Ambassador', 'Ashok', 'Isuzu', 'Opel', 'Peugeot'],
      dtype=object)

##### Drop the feature torque

In [40]:
# Drop the 'torque' column.
# columns= names the axis explicitly (equivalent to axis=1). errors='ignore' keeps the
# cell safe to re-run: without it, a second run raises KeyError because the column is
# already gone. inplace=True modifies the original DataFrame directly.
df.drop(columns='torque', errors='ignore', inplace=True)

##### Drop the sample data where owners = test drive cars

In [41]:
# Remove the 'Test Drive Car' rows, i.e. those whose encoded owner value is 5
test_drive_code = 5
test_drive_rows = df.loc[df['owner'] == test_drive_code].index  # index labels of the rows to remove
df.drop(index=test_drive_rows, inplace=True)