## Group : Alexandre Fernandes, Jorge Beleza da Silva, Mateus Abdallah Fonseca

## Import libraries

In [197]:
# Importing python libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Reading the dataset, getting some information and handling missing rows

In [198]:
# Read the raw CSV into a DataFrame and take a peek at the first five rows
df = pd.read_csv(filepath_or_buffer='unclean_data.csv')
df.head(5)

Unnamed: 0,model,year,price,transmission,mileage,fuel type,engine size,mileage2,fuel type2,engine size2,reference
0,C Class,2020.0,"£30,495",Automatic,,Diesel,2.0,1200,,,/ad/25017331
1,C Class,2020.0,"£29,989",Automatic,,Petrol,1.5,1000,,,/ad/25043746
2,C Class,2020.0,"£37,899",Automatic,,Diesel,2.0,500,,,/ad/25142894
3,C Class,2019.0,"£30,399",Automatic,,Diesel,2.0,5000,,,/ad/24942816
4,C Class,2019.0,"£29,899",Automatic,,Diesel,2.0,4500,,,/ad/24913660


In [199]:
# Summary of the frame: number of entries, column names, non-null counts,
# dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4006 entries, 0 to 4005
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         3907 non-null   object 
 1   year          3904 non-null   float64
 2   price         3907 non-null   object 
 3   transmission  3907 non-null   object 
 4   mileage       3808 non-null   object 
 5   fuel type     1329 non-null   object 
 6   engine size   3842 non-null   object 
 7   mileage2      3890 non-null   object 
 8   fuel type2    3808 non-null   object 
 9   engine size2  3808 non-null   object 
 10  reference     3907 non-null   object 
dtypes: float64(1), object(10)
memory usage: 344.4+ KB


In [200]:
# Count the missing (NaN) entries per column
df.isna().sum()

model             99
year             102
price             99
transmission      99
mileage          198
fuel type       2677
engine size      164
mileage2         116
fuel type2       198
engine size2     198
reference         99
dtype: int64

In [201]:
# Discard only the rows in which every column is missing (how='all'),
# then renumber the index from 0 without keeping the old one
df = df.dropna(how='all').reset_index(drop=True)
df.head(10)

Unnamed: 0,model,year,price,transmission,mileage,fuel type,engine size,mileage2,fuel type2,engine size2,reference
0,C Class,2020.0,"£30,495",Automatic,,Diesel,2.0,1200,,,/ad/25017331
1,C Class,2020.0,"£29,989",Automatic,,Petrol,1.5,1000,,,/ad/25043746
2,C Class,2020.0,"£37,899",Automatic,,Diesel,2.0,500,,,/ad/25142894
3,C Class,2019.0,"£30,399",Automatic,,Diesel,2.0,5000,,,/ad/24942816
4,C Class,2019.0,"£29,899",Automatic,,Diesel,2.0,4500,,,/ad/24913660
5,C Class,2020.0,"£30,999",Automatic,,Diesel,2.0,1000,,,/ad/25059312
6,C Class,2020.0,"£35,999",Automatic,,Diesel,2.0,500,,,/ad/25418851
7,C Class,2019.0,"£37,990",Automatic,,Petrol,3.0,1412,,,/ad/25449314
8,C Class,2019.0,"£28,990",Automatic,,Diesel,2.0,3569,,,/ad/25046820
9,C Class,2019.0,"£28,990",Automatic,,Diesel,2.0,3635,,,/ad/25046821


## Handling the currency symbols and comma format in price column



In [202]:
# Strip the pound sign and the thousands separators from the price column.
# The symbol is replaced with an empty string (not a space) so that no
# leading blank is left in front of the digits. regex=False makes both
# patterns plain literals. The values are still strings after this cell;
# no numeric conversion happens here.
df['price'] = (
    df['price']
    .str.replace('£', '', regex=False)
    .str.replace(',', '', regex=False)
)
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuel type,engine size,mileage2,fuel type2,engine size2,reference
0,C Class,2020.0,30495,Automatic,,Diesel,2.0,1200,,,/ad/25017331
1,C Class,2020.0,29989,Automatic,,Petrol,1.5,1000,,,/ad/25043746
2,C Class,2020.0,37899,Automatic,,Diesel,2.0,500,,,/ad/25142894
3,C Class,2019.0,30399,Automatic,,Diesel,2.0,5000,,,/ad/24942816
4,C Class,2019.0,29899,Automatic,,Diesel,2.0,4500,,,/ad/24913660


## Handling non-valid column names

In [203]:
# Swap the space in these four column labels for an underscore so the
# columns can be reached through attribute access (e.g. df.fuel_type)
df = df.rename(
    columns={
        'fuel type': 'fuel_type',
        'fuel type2': 'fuel_type2',
        'engine size': 'engine_size',
        'engine size2': 'engine_size2',
    }
)
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuel_type,engine_size,mileage2,fuel_type2,engine_size2,reference
0,C Class,2020.0,30495,Automatic,,Diesel,2.0,1200,,,/ad/25017331
1,C Class,2020.0,29989,Automatic,,Petrol,1.5,1000,,,/ad/25043746
2,C Class,2020.0,37899,Automatic,,Diesel,2.0,500,,,/ad/25142894
3,C Class,2019.0,30399,Automatic,,Diesel,2.0,5000,,,/ad/24942816
4,C Class,2019.0,29899,Automatic,,Diesel,2.0,4500,,,/ad/24913660


## Handling missing values in column: fuel_type

In [204]:
# Number of missing entries in the fuel_type column
df['fuel_type'].isna().sum()

2578

In [205]:
# Number of missing entries in the fuel_type2 column
df['fuel_type2'].isna().sum()

99

In [206]:
# Fill the gaps in fuel_type2 with the fuel_type value from the same row.
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on df.fuel_type2: an in-place call on a Series
# pulled out of the frame is chained assignment, which pandas 2.x warns
# about and which no longer updates df when Copy-on-Write is enabled.
df['fuel_type2'] = df['fuel_type2'].fillna(df['fuel_type'])

In [207]:
# Confirm how many missing entries are left in fuel_type2 after the fill
df['fuel_type2'].isna().sum()

0

In [208]:
# Remove the original fuel_type column; its values were used above to
# fill the gaps in fuel_type2
df = df.drop(columns='fuel_type')

In [209]:
# Give the fuel_type2 column the name fuel_type
df = df.rename(columns={'fuel_type2': 'fuel_type'})

## Handling missing values in column: mileage

In [210]:
# Number of missing entries in the mileage column
df['mileage'].isna().sum()

99

In [211]:
# Number of missing entries in the mileage2 column
df['mileage2'].isna().sum()

17

In [212]:
# Fill the gaps in mileage2 with the mileage value from the same row.
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on df.mileage2: an in-place call on a Series
# pulled out of the frame is chained assignment, which pandas 2.x warns
# about and which no longer updates df when Copy-on-Write is enabled.
df['mileage2'] = df['mileage2'].fillna(df['mileage'])

In [213]:
# Preview how mileage2 parses as numbers: errors='coerce' turns every entry
# that cannot be parsed into NaN instead of raising.
# NOTE: the result is only displayed -- it is not assigned back, so
# df['mileage2'] itself is left unchanged by this cell.
pd.to_numeric(df['mileage2'], errors='coerce')

0         NaN
1         NaN
2       500.0
3         NaN
4         NaN
        ...  
3902     70.6
3903     64.2
3904     65.7
3905     56.5
3906     64.2
Name: mileage2, Length: 3907, dtype: float64

In [214]:
# Delete every comma from the mileage2 strings (literal match, no regex);
# the column still holds strings afterwards
df['mileage2'] = df['mileage2'].str.replace(',', '', regex=False)

In [215]:
# Remove the mileage column; its values were used above to fill the gaps
# in mileage2
df = df.drop(columns='mileage')

## Handling missing values and inconsistent value formats in column: engine_size

In [216]:
# Number of missing entries in the engine_size column
df['engine_size'].isna().sum()

65

In [217]:
# Number of missing entries in the engine_size2 column
df['engine_size2'].isna().sum()

99

In [218]:
# Fill the gaps in engine_size with the engine_size2 value from the same row.
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on df.engine_size: an in-place call on a Series
# pulled out of the frame is chained assignment, which pandas 2.x warns
# about and which no longer updates df when Copy-on-Write is enabled.
df['engine_size'] = df['engine_size'].fillna(df['engine_size2'])

In [219]:
# Preview how engine_size parses as numbers: errors='coerce' turns every
# entry that cannot be parsed into NaN instead of raising.
# NOTE: the result is only displayed -- it is not assigned back, so
# df['engine_size'] itself is left unchanged by this cell.
pd.to_numeric(df['engine_size'], errors='coerce')

0       2.0
1       1.5
2       2.0
3       2.0
4       2.0
       ... 
3902    NaN
3903    NaN
3904    NaN
3905    NaN
3906    NaN
Name: engine_size, Length: 3907, dtype: float64

In [220]:
# Delete the pound sign from the engine_size strings (literal match, no regex)
df['engine_size'] = df['engine_size'].str.replace('£', '', regex=False)