## Group : Alexandre Brasileiro Fernandes, Jorge Cordeiro Beleza da Silva, Mateus Abdallah Fonseca

## Import libraries

In [158]:
# Importing python libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Reading the dataset, getting some information, and handling missing rows

In [159]:
# Load the dataset and take a peek at the first rows
df = pd.read_csv('unclean_data.csv')
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuel type,engine size,mileage2,fuel type2,engine size2,reference
0,C Class,2020.0,"£30,495",Automatic,,Diesel,2.0,1200,,,/ad/25017331
1,C Class,2020.0,"£29,989",Automatic,,Petrol,1.5,1000,,,/ad/25043746
2,C Class,2020.0,"£37,899",Automatic,,Diesel,2.0,500,,,/ad/25142894
3,C Class,2019.0,"£30,399",Automatic,,Diesel,2.0,5000,,,/ad/24942816
4,C Class,2019.0,"£29,899",Automatic,,Diesel,2.0,4500,,,/ad/24913660


In [160]:
# Summary of the columns: non-null counts and data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4006 entries, 0 to 4005
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         3907 non-null   object 
 1   year          3904 non-null   float64
 2   price         3907 non-null   object 
 3   transmission  3907 non-null   object 
 4   mileage       3808 non-null   object 
 5   fuel type     1329 non-null   object 
 6   engine size   3842 non-null   object 
 7   mileage2      3890 non-null   object 
 8   fuel type2    3808 non-null   object 
 9   engine size2  3808 non-null   object 
 10  reference     3907 non-null   object 
dtypes: float64(1), object(10)
memory usage: 344.4+ KB


In [161]:
# Count the missing values in each column
df.isna().sum()

model             99
year             102
price             99
transmission      99
mileage          198
fuel type       2677
engine size      164
mileage2         116
fuel type2       198
engine size2     198
reference         99
dtype: int64

In [162]:
# Drop the rows in which every column is null, then rebuild a contiguous index
df = df.dropna(how='all').reset_index(drop=True)
df.head(10)

Unnamed: 0,model,year,price,transmission,mileage,fuel type,engine size,mileage2,fuel type2,engine size2,reference
0,C Class,2020.0,"£30,495",Automatic,,Diesel,2.0,1200,,,/ad/25017331
1,C Class,2020.0,"£29,989",Automatic,,Petrol,1.5,1000,,,/ad/25043746
2,C Class,2020.0,"£37,899",Automatic,,Diesel,2.0,500,,,/ad/25142894
3,C Class,2019.0,"£30,399",Automatic,,Diesel,2.0,5000,,,/ad/24942816
4,C Class,2019.0,"£29,899",Automatic,,Diesel,2.0,4500,,,/ad/24913660
5,C Class,2020.0,"£30,999",Automatic,,Diesel,2.0,1000,,,/ad/25059312
6,C Class,2020.0,"£35,999",Automatic,,Diesel,2.0,500,,,/ad/25418851
7,C Class,2019.0,"£37,990",Automatic,,Petrol,3.0,1412,,,/ad/25449314
8,C Class,2019.0,"£28,990",Automatic,,Diesel,2.0,3569,,,/ad/25046820
9,C Class,2019.0,"£28,990",Automatic,,Diesel,2.0,3635,,,/ad/25046821


## Handling the currency symbols and comma format in price column



In [163]:
# Strip the currency symbol and the thousands separators, then convert price to a number.
# The '£' is replaced with an empty string (not a space) so no stray whitespace is left
# in the value. Prices that cannot be parsed become NaN, the same policy the notebook
# uses when parsing mileage2 and engine_size.
# astype(str) keeps this cell re-runnable once price is already numeric.
price_text = (
    df['price']
    .astype(str)
    .str.replace('£', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
)
df['price'] = pd.to_numeric(price_text, errors='coerce')
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuel type,engine size,mileage2,fuel type2,engine size2,reference
0,C Class,2020.0,30495,Automatic,,Diesel,2.0,1200,,,/ad/25017331
1,C Class,2020.0,29989,Automatic,,Petrol,1.5,1000,,,/ad/25043746
2,C Class,2020.0,37899,Automatic,,Diesel,2.0,500,,,/ad/25142894
3,C Class,2019.0,30399,Automatic,,Diesel,2.0,5000,,,/ad/24942816
4,C Class,2019.0,29899,Automatic,,Diesel,2.0,4500,,,/ad/24913660


## Handling non-valid column names

In [164]:
# Replace the spaces in the column names with underscores
column_renames = {
    'fuel type': 'fuel_type',
    'engine size': 'engine_size',
    'fuel type2': 'fuel_type2',
    'engine size2': 'engine_size2',
}
df = df.rename(columns=column_renames)
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuel_type,engine_size,mileage2,fuel_type2,engine_size2,reference
0,C Class,2020.0,30495,Automatic,,Diesel,2.0,1200,,,/ad/25017331
1,C Class,2020.0,29989,Automatic,,Petrol,1.5,1000,,,/ad/25043746
2,C Class,2020.0,37899,Automatic,,Diesel,2.0,500,,,/ad/25142894
3,C Class,2019.0,30399,Automatic,,Diesel,2.0,5000,,,/ad/24942816
4,C Class,2019.0,29899,Automatic,,Diesel,2.0,4500,,,/ad/24913660


## Handling missing values in column: fuel_type

In [165]:
# Number of missing values in the fuel_type column
df['fuel_type'].isna().sum()

2578

In [166]:
# Number of missing values in the fuel_type2 column
df['fuel_type2'].isna().sum()

99

In [167]:
# Fill the missing values in fuel_type2 with the values from fuel_type.
# The result is assigned back to the column: calling fillna(inplace=True) on the Series
# returned by df.fuel_type2 is chained assignment, which pandas warns about and which
# does not update df when Copy-on-Write is enabled.
df['fuel_type2'] = df['fuel_type2'].fillna(df['fuel_type'])

In [168]:
# Confirm that fuel_type2 has no missing values left
df['fuel_type2'].isna().sum()

0

In [169]:
# The fuel_type column is no longer needed once fuel_type2 has been filled from it
df = df.drop(columns='fuel_type')

In [170]:
# fuel_type2 takes over the fuel_type name
df = df.rename(columns={'fuel_type2': 'fuel_type'})

## Handling missing values in column: mileage

In [171]:
# Number of missing values in the mileage column
df['mileage'].isna().sum()

99

In [172]:
# Number of missing values in the mileage2 column
df['mileage2'].isna().sum()

17

In [173]:
# Fill the missing values in mileage2 with the values from mileage.
# The result is assigned back to the column: calling fillna(inplace=True) on the Series
# returned by df.mileage2 is chained assignment, which pandas warns about and which
# does not update df when Copy-on-Write is enabled.
# NOTE(review): the final frame displayed at the end of the notebook contains mileage
# values such as 70.6 and 64.2 - confirm that mileage2 holds odometer readings for
# every row before giving it priority over mileage.
df['mileage2'] = df['mileage2'].fillna(df['mileage'])

In [174]:
# Strip the thousands separator from mileage2 (literal match, no regex needed)
df['mileage2'] = df['mileage2'].str.replace(',', '', regex=False)

In [175]:
# Convert mileage2 to a numeric dtype; values that cannot be parsed become NaN
df['mileage2'] = pd.to_numeric(df['mileage2'], errors='coerce')

In [176]:
# The original mileage column is no longer needed once mileage2 has been filled from it
df = df.drop(columns='mileage')

In [177]:
# mileage2 takes over the mileage name
df = df.rename(columns={'mileage2': 'mileage'})

## Handling missing values and inconsistent value formats in column: engine_size

In [178]:
# Number of missing values in the engine_size column
df['engine_size'].isna().sum()

65

In [179]:
# Number of missing values in the engine_size2 column
df['engine_size2'].isna().sum()

99

In [180]:
# Fill the missing values in engine_size with the values from engine_size2.
# The result is assigned back to the column: calling fillna(inplace=True) on the Series
# returned by df.engine_size is chained assignment, which pandas warns about and which
# does not update df when Copy-on-Write is enabled.
# NOTE(review): the final frame displayed at the end of the notebook contains engine_size
# values such as 150.0, 20.0 and 145.0 - confirm that engine_size holds the engine size
# for every row; engine_size2 is only consulted where engine_size is null.
df['engine_size'] = df['engine_size'].fillna(df['engine_size2'])

In [181]:
# Strip the '£' sign from engine_size (literal match, no regex needed)
df['engine_size'] = df['engine_size'].str.replace('£', '', regex=False)

In [182]:
# Convert engine_size to a numeric dtype; values that cannot be parsed become NaN
engine_size_numeric = pd.to_numeric(df['engine_size'], errors='coerce')
df = df.assign(engine_size=engine_size_numeric)

In [183]:
# Standardizing the values into engine size
def standard_size(x, scale_from=1000):
    """Return the engine size rounded to one decimal, dividing large values by 1000.

    Values greater than or equal to ``scale_from`` are divided by 1000 before
    rounding; smaller values are only rounded. The comparison is inclusive so that
    a value of exactly 1000 becomes 1.0 instead of being left as 1000.
    NaN compares False against the threshold and is returned as NaN.
    """
    if x >= scale_from:
        return round(x / 1000, 1)
    return round(x, 1)
# Run standard_size over every engine_size value, one element at a time
df['engine_size'] = df['engine_size'].map(standard_size)

In [184]:
# engine_size2 is no longer needed once engine_size has been filled from it
df = df.drop(columns='engine_size2')

## Handling wrong data type: year

In [185]:
# Drop every row that still has a null in any column
df = df.dropna()

In [186]:
# Store year as an integer instead of a float
df['year'] = df['year'].astype('int64')

## Handling unneeded columns

In [189]:
# Remove the columns that are not used further on
unneeded_columns = ['model', 'reference']
df = df.drop(columns=unneeded_columns)

## Handling categorical data

In [216]:
# One-hot encode the transmission column and keep the indicator columns on df.
# DataFrame.join returns a new frame, so the result has to be assigned back; otherwise
# the encoded columns are only displayed and then discarded.
dummy = pd.get_dummies(df['transmission'])
df = df.join(dummy)
# TODO: fuel_type is still not encoded. If its labels can collide with the transmission
# labels, pass prefix= to get_dummies so the join does not fail on overlapping names.
df


Unnamed: 0,year,price,transmission,engine_size,mileage,fuel_type,Automatic,Manual,Other,Semi-Auto
0,2020,30495,Automatic,2.0,1200.0,Diesel,True,False,False,False
1,2020,29989,Automatic,1.5,1000.0,Petrol,True,False,False,False
2,2020,37899,Automatic,2.0,500.0,Diesel,True,False,False,False
3,2019,30399,Automatic,2.0,5000.0,Diesel,True,False,False,False
4,2019,29899,Automatic,2.0,4500.0,Diesel,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...
3902,2017,14700,Manual,150.0,70.6,Diesel,False,True,False,False
3903,2018,18500,Automatic,150.0,64.2,Diesel,True,False,False,False
3904,2014,11900,Manual,20.0,65.7,Diesel,False,True,False,False
3905,2014,11300,Automatic,145.0,56.5,Diesel,True,False,False,False
