## Import Libraries

In [2]:
## for loading and preprocessing 
import pandas as pd
import numpy as np 

## for data visualization 
import matplotlib.pyplot as plt
import seaborn as sns

## Loading Data & Creating Copy

In [3]:
## load the raw car-price CSV (path is relative to the notebook's working directory)
data = pd.read_csv(filepath_or_buffer="datasets/car_price_prediction.csv")

In [4]:
## work on an independent deep copy so the loaded `data` frame is left untouched
df = data.copy(deep=True)

## Data Preview And Understanding

In [5]:
## preview the first five rows of the working copy
df.head(n=5)

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4


In [6]:
## preview the last five rows of the working copy
df.tail(n=5)

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
19232,45798355,8467,-,MERCEDES-BENZ,CLK 200,1999,Coupe,Yes,CNG,2.0 Turbo,300000 km,4.0,Manual,Rear,02-Mar,Left wheel,Silver,5
19233,45778856,15681,831,HYUNDAI,Sonata,2011,Sedan,Yes,Petrol,2.4,161600 km,4.0,Tiptronic,Front,04-May,Left wheel,Red,8
19234,45804997,26108,836,HYUNDAI,Tucson,2010,Jeep,Yes,Diesel,2,116365 km,4.0,Automatic,Front,04-May,Left wheel,Grey,4
19235,45793526,5331,1288,CHEVROLET,Captiva,2007,Jeep,Yes,Diesel,2,51258 km,4.0,Automatic,Front,04-May,Left wheel,Black,4
19236,45813273,470,753,HYUNDAI,Sonata,2012,Sedan,Yes,Hybrid,2.4,186923 km,4.0,Automatic,Front,04-May,Left wheel,White,12


In [None]:
## report the dataset dimensions; df.shape is a (rows, columns) tuple
row_count, column_count = df.shape
print(f'Total number of rows: {row_count} -> Total number of columns: {column_count}')

In [9]:
## print a concise schema summary: index range, per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                19237 non-null  int64  
 1   Price             19237 non-null  int64  
 2   Levy              19237 non-null  object 
 3   Manufacturer      19237 non-null  object 
 4   Model             19237 non-null  object 
 5   Prod. year        19237 non-null  int64  
 6   Category          19237 non-null  object 
 7   Leather interior  19237 non-null  object 
 8   Fuel type         19237 non-null  object 
 9   Engine volume     19237 non-null  object 
 10  Mileage           19237 non-null  object 
 11  Cylinders         19237 non-null  float64
 12  Gear box type     19237 non-null  object 
 13  Drive wheels      19237 non-null  object 
 14  Doors             19237 non-null  object 
 15  Wheel             19237 non-null  object 
 16  Color             19237 non-null  object

In [None]:
## count NaN/None entries per column
## note: placeholder strings (e.g. the '-' seen in Levy) are ordinary values and are NOT counted here
df.isna().sum()

In [None]:
## count rows that repeat an earlier row across all columns (the first occurrence is not counted)
df.duplicated(keep='first').sum()

In [8]:
## inspect the dtype of every column
## numeric-looking columns such as Levy, Engine volume and Mileage are loaded as object (string) dtype
df.dtypes

ID                    int64
Price                 int64
Levy                 object
Manufacturer         object
Model                object
Prod. year            int64
Category             object
Leather interior     object
Fuel type            object
Engine volume        object
Mileage              object
Cylinders           float64
Gear box type        object
Drive wheels         object
Doors                object
Wheel                object
Color                object
Airbags               int64
dtype: object

In [10]:
## list the distinct values of the Levy column (np.unique returns them sorted)
## the values are strings here, so the ordering is lexicographic and includes the '-' placeholder
unique_levy = np.unique(df['Levy'])
print(unique_levy)

['-' '1011' '1016' '1017' '1018' '1024' '1028' '1030' '1031' '1032' '1036'
 '1039' '1045' '1047' '1051' '1053' '1054' '1055' '1058' '1062' '1064'
 '1076' '1077' '1079' '1080' '1083' '1086' '1090' '1091' '1094' '1095'
 '1099' '1103' '1104' '1107' '1108' '1109' '1110' '1111' '1113' '1118'
 '1132' '1138' '1141' '1148' '115' '1156' '1163' '11706' '1171' '11714'
 '1172' '1174' '1176' '1178' '1185' '1187' '1190' '1194' '1195' '1197'
 '1198' '1202' '1203' '1205' '1211' '1228' '1234' '1236' '1246' '1249'
 '1252' '1266' '1267' '1268' '1272' '1273' '1275' '1277' '1279' '1282'
 '1285' '1286' '1288' '1292' '1296' '1297' '1301' '1304' '1307' '1308'
 '1317' '1323' '1324' '1325' '1327' '1341' '1342' '1345' '1346' '1347'
 '1356' '1357' '1360' '1361' '1363' '1365' '1366' '1368' '1369' '1375'
 '1384' '1387' '1391' '1399' '1405' '1408' '1411' '1413' '1426' '1436'
 '1437' '1438' '1440' '1441' '1442' '1451' '1452' '1466' '1468' '1470'
 '1473' '1474' '1481' '1482' '1486' '1488' '1493' '1502' '1503' '1505'
 

In [16]:
## list the distinct values of the Model column (np.unique returns them sorted)
## NOTE(review): entries such as '09-Mar' look like date-mangled model names -- confirm against the source data
unique_model = np.unique(df['Model'])
print(unique_model)

['09-Mar' '100' '100 NX' ... 'macan S' 'tC' 'xD']


## Data Preprocessing

In [13]:
## re-read the CSV, this time parsing the '-' placeholder as a missing value (NaN)
## NOTE(review): this rebinds `data` only; the earlier working copy `df` still holds the
## first load with literal '-' strings -- confirm which frame the following cells use
data = pd.read_csv(
    filepath_or_buffer="datasets/car_price_prediction.csv",
    na_values=['-'],
)

In [15]:
#print(np.unique(data['Levy']))

In [None]:
## replacing missing values 