In [1]:
# Importing required libraries.
import pandas as pd
import numpy as np
import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation
%matplotlib inline 
sns.set(color_codes=True)

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/fenago/MLEssentials2/main/datasets/car_features_and_msrp.csv')

In [3]:
df.sample(5)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
11477,Suzuki,X-90,1996,regular unleaded,95.0,4.0,MANUAL,rear wheel drive,2.0,,Compact,2dr SUV,26,22,481,2000
277,Nissan,350Z,2008,regular unleaded,306.0,6.0,AUTOMATIC,rear wheel drive,2.0,High-Performance,Compact,Coupe,24,17,2009,37740
8259,Land Rover,Range Rover Evoque,2016,premium unleaded (recommended),240.0,4.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Luxury",Compact,4dr SUV,30,21,258,41475
8434,Acura,RDX,2017,premium unleaded (recommended),279.0,6.0,AUTOMATIC,front wheel drive,4.0,"Crossover,Luxury",Midsize,4dr SUV,28,20,204,42220
7853,Infiniti,Q70,2014,premium unleaded (required),420.0,8.0,AUTOMATIC,all wheel drive,4.0,"Luxury,Performance",Large,Sedan,23,16,190,64600


In [4]:
# Check the types of Data
df.dtypes

Make                  object
Model                 object
Year                   int64
Engine Fuel Type      object
Engine HP            float64
Engine Cylinders     float64
Transmission Type     object
Driven_Wheels         object
Number of Doors      float64
Market Category       object
Vehicle Size          object
Vehicle Style         object
highway MPG            int64
city mpg               int64
Popularity             int64
MSRP                   int64
dtype: object

In [5]:
# Drop irrelevant columns
df = df.drop(['Engine Fuel Type', 'Market Category', 'Vehicle Style', 'Popularity', 'Number of Doors', 'Vehicle Size'], axis=1)
df.head(5)

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,highway MPG,city mpg,MSRP
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,rear wheel drive,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,rear wheel drive,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,rear wheel drive,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,rear wheel drive,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,rear wheel drive,28,18,34500


In [6]:
df = df.rename(columns={"Engine HP": "HP","Engine Cylinders":"Cylinders","Tranmission Type": "Transmission","Driven_Wheels":"Drive Mode","highway MPG":"MPG-H","city mpg":"MPG-C","MSRP":"Price"})

In [7]:
df.sample(3)

Unnamed: 0,Make,Model,Year,HP,Cylinders,Transmission Type,Drive Mode,MPG-H,MPG-C,Price
3865,Hyundai,Elantra,2016,166.0,4.0,AUTOMATIC,front wheel drive,35,24,21250
4262,Lotus,Evora,2013,345.0,6.0,MANUAL,rear wheel drive,26,17,77100
6413,Subaru,Legacy,2017,175.0,4.0,AUTOMATIC,all wheel drive,34,25,28840


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               11914 non-null  object 
 1   Model              11914 non-null  object 
 2   Year               11914 non-null  int64  
 3   HP                 11845 non-null  float64
 4   Cylinders          11884 non-null  float64
 5   Transmission Type  11914 non-null  object 
 6   Drive Mode         11914 non-null  object 
 7   MPG-H              11914 non-null  int64  
 8   MPG-C              11914 non-null  int64  
 9   Price              11914 non-null  int64  
dtypes: float64(2), int64(4), object(4)
memory usage: 930.9+ KB


In [9]:
df.shape

(11914, 10)

In [17]:
# Rows with duplicate data
duplicate_rows_df = df[df.duplicated()]
print("number of duplicate rows: ", duplicate_rows_df.shape)

number of duplicate rows:  (0, 10)


In [12]:
df.count()

Make                 11914
Model                11914
Year                 11914
HP                   11845
Cylinders            11884
Transmission Type    11914
Drive Mode           11914
MPG-H                11914
MPG-C                11914
Price                11914
dtype: int64

In [15]:
df = df.drop_duplicates()
df.sample(5)

Unnamed: 0,Make,Model,Year,HP,Cylinders,Transmission Type,Drive Mode,MPG-H,MPG-C,Price
11596,Volvo,XC60,2016,302.0,4.0,AUTOMATIC,all wheel drive,28,20,43350
8350,Ford,Ranger,2010,207.0,6.0,MANUAL,four wheel drive,19,15,25800
5126,Infiniti,G20,2001,145.0,4.0,AUTOMATIC,front wheel drive,28,20,24895
5202,Genesis,G80,2017,311.0,6.0,AUTOMATIC,rear wheel drive,28,18,41400
5494,Dodge,Grand Caravan,2016,283.0,6.0,AUTOMATIC,front wheel drive,25,17,22595


In [16]:
df.count()

Make                 10925
Model                10925
Year                 10925
HP                   10856
Cylinders            10895
Transmission Type    10925
Drive Mode           10925
MPG-H                10925
MPG-C                10925
Price                10925
dtype: int64

In [18]:
# Find the missing values
print(df.isnull().sum())

Make                  0
Model                 0
Year                  0
HP                   69
Cylinders            30
Transmission Type     0
Drive Mode            0
MPG-H                 0
MPG-C                 0
Price                 0
dtype: int64


In [None]:
df = df.dropna()
df.count()

In [None]:
df.isnull().sum()

In [None]:
# Outliers
sns.boxplot(x=df['Price'])

In [None]:
sns.boxplot(x=df['Engine HP'])

In [None]:
sns.boxplot(x=df['Cylinders'])

In [None]:
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)

In [None]:
IQR = Q3 - Q1

In [None]:
print(IQR)

In [None]:
df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]
df.shape

In [None]:
sns.boxplot(x=df['Price'])

In [None]:
sns.boxplot(x=df['Engine HP'])

In [None]:
sns.boxplot(x=df['Cylinders'])

In [None]:
# Plotting a Histogram
df.Make.value_counts().nlargest(40).plot(kind='bar', figsize=(10,5))
plt.title("Number of cars by make")
plt.ylabel('Number of cars')
plt.xlabel('Make');

In [None]:
# Finding the relations between the variables.
plt.figure(figsize=(20,10))
c= df.corr()
sns.heatmap(c,cmap='BrBG',annot=True)
c

In [None]:
# Plotting a scatter plot
fig, ax = plt.subplots(figsize=(10,6))
ax.scatter(df['Engine HP'], df['Price'])
ax.set_xlabel('HP')
ax.set_ylabel('Price')
plt.show()