In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error, accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("adverts.csv")

In [3]:
df.head()

Unnamed: 0,public_reference,mileage,reg_code,standard_colour,standard_make,standard_model,vehicle_condition,year_of_registration,price,body_type,crossover_car_and_van,fuel_type
0,202006039777689,0.0,,Grey,Volvo,XC90,NEW,,73970,SUV,False,Petrol Plug-in Hybrid
1,202007020778260,108230.0,61.0,Blue,Jaguar,XF,USED,2011.0,7000,Saloon,False,Diesel
2,202007020778474,7800.0,17.0,Grey,SKODA,Yeti,USED,2017.0,14000,SUV,False,Petrol
3,202007080986776,45000.0,16.0,Brown,Vauxhall,Mokka,USED,2016.0,7995,Hatchback,False,Diesel
4,202007161321269,64000.0,64.0,Grey,Land Rover,Range Rover Sport,USED,2015.0,26995,SUV,False,Diesel


In [4]:
df.tail()

Unnamed: 0,public_reference,mileage,reg_code,standard_colour,standard_make,standard_model,vehicle_condition,year_of_registration,price,body_type,crossover_car_and_van,fuel_type
402000,202010315652942,5179.0,69,Grey,Peugeot,208,USED,2019.0,10595,Hatchback,False,Petrol
402001,202010315657341,110000.0,59,Red,Peugeot,107,USED,2009.0,2000,Hatchback,False,Petrol
402002,202010315659271,52760.0,62,White,Nissan,Qashqai,USED,2012.0,7250,SUV,False,Petrol
402003,202011015662436,10250.0,65,Red,Abarth,595,USED,2015.0,11490,Hatchback,False,Petrol
402004,201512149444029,14000.0,14,Silver,Audi,A4 Avant,USED,2014.0,20520,Estate,False,Diesel


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402005 entries, 0 to 402004
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   public_reference       402005 non-null  int64  
 1   mileage                401878 non-null  float64
 2   reg_code               370148 non-null  object 
 3   standard_colour        396627 non-null  object 
 4   standard_make          402005 non-null  object 
 5   standard_model         402005 non-null  object 
 6   vehicle_condition      402005 non-null  object 
 7   year_of_registration   368694 non-null  float64
 8   price                  402005 non-null  int64  
 9   body_type              401168 non-null  object 
 10  crossover_car_and_van  402005 non-null  bool   
 11  fuel_type              401404 non-null  object 
dtypes: bool(1), float64(2), int64(2), object(7)
memory usage: 34.1+ MB


In [6]:
df.describe()

Unnamed: 0,public_reference,mileage,year_of_registration,price
count,402005.0,401878.0,368694.0,402005.0
mean,202007100000000.0,37743.595656,2015.006206,17341.97
std,16916620000.0,34831.724018,7.962667,46437.46
min,201307200000000.0,0.0,999.0,120.0
25%,202009000000000.0,10481.0,2013.0,7495.0
50%,202009300000000.0,28629.5,2016.0,12600.0
75%,202010200000000.0,56875.75,2018.0,20000.0
max,202011000000000.0,999999.0,2020.0,9999999.0


In [7]:
df.shape

(402005, 12)

In [8]:
# check the columns in data
df.columns

Index(['public_reference', 'mileage', 'reg_code', 'standard_colour',
       'standard_make', 'standard_model', 'vehicle_condition',
       'year_of_registration', 'price', 'body_type', 'crossover_car_and_van',
       'fuel_type'],
      dtype='object')

In [9]:
# check the datatypes of columns
df.dtypes

public_reference           int64
mileage                  float64
reg_code                  object
standard_colour           object
standard_make             object
standard_model            object
vehicle_condition         object
year_of_registration     float64
price                      int64
body_type                 object
crossover_car_and_van       bool
fuel_type                 object
dtype: object

In [10]:
df.count()

public_reference         402005
mileage                  401878
reg_code                 370148
standard_colour          396627
standard_make            402005
standard_model           402005
vehicle_condition        402005
year_of_registration     368694
price                    402005
body_type                401168
crossover_car_and_van    402005
fuel_type                401404
dtype: int64

In [11]:
# check for some null or missing values in our dataset
df.isna().sum().to_numpy()

array([    0,   127, 31857,  5378,     0,     0,     0, 33311,     0,
         837,     0,   601], dtype=int64)

In [12]:
# Fill missing values with mode for categorical columns and median for numerical column
df['standard_colour'].fillna(df['standard_colour'].mode()[0], inplace=True)
df['reg_code'].fillna(df['reg_code'].mode()[0], inplace=True)
df['body_type'].fillna(df['body_type'].mode()[0], inplace=True)
df['fuel_type'].fillna(df['fuel_type'].mode()[0], inplace=True)
df['year_of_registration'].fillna(df['year_of_registration'].median(), inplace=True)

# Check again for missing values
df.isna().sum().to_numpy()

array([  0, 127,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
      dtype=int64)

In [13]:
df.shape

(402005, 12)

In [14]:
# Verify the changes
df['price'].value_counts()

8995     2432
7995     2377
6995     2369
9995     2340
5995     2249
         ... 
16253       1
44150       1
50340       1
76989       1
58987       1
Name: price, Length: 30578, dtype: int64

In [20]:
# Label Encoding for categorical columns
label_encoder = LabelEncoder()
categorical_columns = ['reg_code', 'standard_colour', 'standard_make', 'standard_model', 'vehicle_condition', 'body_type',  'fuel_type']
for column in categorical_columns:
    df[column] = label_encoder.fit_transform(df[column])

# Min-Max Scaling for numerical columns
scaler = MinMaxScaler()
numerical_columns = ['mileage', 'year_of_registration', 'price']
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# Verify the changes
df.head()


Unnamed: 0,public_reference,mileage,reg_code,standard_colour,standard_make,standard_model,vehicle_condition,year_of_registration,price,body_type,crossover_car_and_van,fuel_type
0,202006039777689,0.0,15,8,106,1107,0,0.996082,0.007385,13,0,8
1,202007020778260,0.10823,31,2,47,1110,1,0.991185,0.000688,14,0,1
2,202007020778474,0.0078,15,8,91,1130,1,0.997062,0.001388,13,0,6
3,202007080986776,0.045,14,4,104,702,1,0.996082,0.000788,7,0,1
4,202007161321269,0.064,34,8,54,833,1,0.995103,0.002688,13,0,1


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402005 entries, 0 to 402004
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   public_reference       402005 non-null  int64  
 1   mileage                401878 non-null  float64
 2   reg_code               402005 non-null  int64  
 3   standard_colour        402005 non-null  int64  
 4   standard_make          402005 non-null  int64  
 5   standard_model         402005 non-null  int64  
 6   vehicle_condition      402005 non-null  int64  
 7   year_of_registration   402005 non-null  float64
 8   price                  402005 non-null  float64
 9   body_type              402005 non-null  int64  
 10  crossover_car_and_van  402005 non-null  int64  
 11  fuel_type              402005 non-null  int64  
dtypes: float64(3), int64(9)
memory usage: 36.8 MB


In [22]:
# Separate features (X) and target variable (y)
X = df.drop(columns=['price'])  # Features
y = df['price']  # Target variable

# Split the dataset into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of train and test sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (321604, 11)
Shape of X_test: (80401, 11)
Shape of y_train: (321604,)
Shape of y_test: (80401,)


In [23]:
from sklearn.ensemble import VotingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Create instances of the regressors
dtr = DecisionTreeRegressor()
rfr = RandomForestRegressor()

# Combine regressors using VotingRegressor
regressors = [
    ('dtr', dtr),
    ('rfr', rfr),
]

voting_regressor = VotingRegressor(regressors)

# Train the voting regressor on the training data
voting_regressor.fit(X_train, y_train)

In [25]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predictions on training set
y_train_pred = voting_regressor.predict(X_train)

# Predictions on test set
y_test_pred = voting_regressor.predict(X_test)

# Calculate metrics for training set
train_mse = mean_squared_error(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

# Calculate metrics for test set
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

# Print the results
print("Training Set Metrics:")
print(f"Mean Squared Error (MSE): {train_mse:.4f}")
print(f"Mean Absolute Error (MAE): {train_mae:.4f}")
print(f"R^2 Score: {train_r2:.4f}")
print("\nTest Set Metrics:")
print(f"Mean Squared Error (MSE): {test_mse:.4f}")
print(f"Mean Absolute Error (MAE): {test_mae:.4f}")
print(f"R^2 Score: {test_r2:.4f}")

Training Set Metrics:
Mean Squared Error (MSE): 0.0000
Mean Absolute Error (MAE): 0.0003
R^2 Score: 0.8317

Test Set Metrics:
Mean Squared Error (MSE): 0.0000
Mean Absolute Error (MAE): 0.0003
R^2 Score: 0.1537
