# Cars 4 You: Expediting Car Evaluations with ML

## 1. Import the needed libraries

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.feature_selection import mutual_info_regression
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil
import numpy as np
import os
from sklearn.metrics import make_scorer 
from sklearn.model_selection import PredefinedSplit

## 2. Data importation and integration

In [None]:
# Load the training and test CSV files from the local project_data folder
train_data = pd.read_csv('project_data/train.csv')
test_data = pd.read_csv('project_data/test.csv')

In [None]:
# Work on a copy so the loaded train_data frame is left untouched
data= train_data.copy()

## 3. Data exploration and understanding

### Metadata
- *carID*: An attribute that contains an identifier for each car.
- *Brand*: The car’s main brand (e.g. Ford, Toyota).
- *model*: The car model.
- *year*: The year of Registration of the Car.
- *transmission*: The car’s transmission type (e.g. Manual, Automatic, Semi-Auto).
- *mileage*: The total reported distance travelled by the car (in
 miles).
- *tax*: The amount of road tax (in £) that, in 2020, was
 applicable to the car in question.
- *fuelType*: Type of Fuel used by the car (Diesel, Petrol, Hybrid,
 Electric).
- *mpg*: Average Miles per Gallon.
- *engineSize*: Size of Engine in liters (Cubic Decimeters).
- *paintQuality%*:  The mechanic’s assessment of the cars’ overall paint
 quality and hull integrity (filled by the mechanic
 during evaluation). 
- *previousOwners*: Number of previous registered owners of the vehicle.
- *hasDamage*:  Boolean marker filled by the seller at the time of
 registration stating whether the car is damaged or
 not.
- *price*: The car’s price when purchased by Cars 4 You (in £).

### 3.1. Data Overview

In [None]:
# Overview of the dataset: columns, dtypes and non-null counts
data.info()

In [None]:
# First 20 rows of the dataset
data.head(20)

In [None]:
# Last 20 rows of the dataset
data.tail(20)

In [None]:
# Descriptive statistics for numerical data (transposed: one row per variable)
data.describe().T

In [None]:
# Descriptive statistics for categorical data ('O' selects the object-dtype columns)
data.describe(include = ['O'])

In [None]:
# Column groups reused throughout the notebook

# Numerical features
metric_features = [
    'year',
    'mileage',
    'tax',
    'mpg',
    'engineSize',
    'paintQuality%',
    'previousOwners',
    'hasDamage',
]

# Categorical features
non_metric_features = [
    'Brand',
    'model',
    'transmission',
    'fuelType',
]

# Row identifier and prediction target
identifier = 'carID'
target = 'price'

In [None]:
# Print the distinct raw values found in each categorical column
for col, col_values in data[non_metric_features].items():
    print(f"\nColumn: {col}")
    print(col_values.unique())

### 3.2. Checking Duplicates

In [None]:
# Number of fully duplicated rows
data.duplicated().sum()

### 3.3. Checking Missing Values

In [None]:
# Number of missing values in each column
data.isna().sum()

In [None]:
# Share of missing values per column, as a percentage
# (the mean of the boolean mask equals the missing count divided by the row count)
data.isna().mean() * 100

### 3.4. Checking Outliers

In [None]:
# Checking outliers of numerical variables through the visualization of boxplots

def plot_multiple_boxplots(data, feats, title="Numeric Variables' Box Plots"):
    """Draw one box plot per feature on a shared grid with 4 rows.

    Parameters
    ----------
    data : object indexable by column name (``data[feat]``), e.g. a DataFrame
    feats : iterable of column names to plot
    title : str, figure-level title

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    feats = list(feats)
    if not feats:
        # Nothing to plot; plt.subplots cannot build a grid with zero columns
        return

    # Prepare figure. Create individual axes where each box plot will be placed
    n_rows = 4
    n_cols = ceil(len(feats) / n_rows)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(40, 30), squeeze=False)
    flat_axes = axes.flatten()

    # Plot data: associate each feature with one axes object
    for ax, feat in zip(flat_axes, feats):
        sns.boxplot(x=data[feat], ax=ax, color="#5dade2")
        ax.set_title(feat)

    # Remove the grid cells that did not receive a feature
    for spare_ax in flat_axes[len(feats):]:
        fig.delaxes(spare_ax)

    # Layout
    plt.suptitle(title)
    plt.show()

In [None]:
# Box plots of every metric feature to inspect outliers visually
plot_multiple_boxplots(data, metric_features)

### 3.5. Checking Distributions

In [None]:
# Frame holding only the metric features, used by the distribution plots below
df= pd.DataFrame(data[metric_features])

In [None]:
# Creating histograms to see the distribution of numerical variables

num_cols = df.select_dtypes(include=['number']).columns
n = len(num_cols)

# Three plots per row. ceil() avoids a spare empty row when n is a multiple of 3,
# and max(..., 1) keeps the grid valid even when there is no numeric column.
fig, axes = plt.subplots(nrows=max(ceil(n / 3), 1), ncols=3, figsize=(15, 10))
axes = axes.flatten()

for hist_ax, col in zip(axes, num_cols):
    hist_ax.hist(df[col].dropna(), bins=20, color='#5dade2', edgecolor='black')
    hist_ax.set_title(col)
    hist_ax.set_xlabel('')
    hist_ax.set_ylabel('frequency')

# Remove empty axes (sliced by n, so it does not depend on a leaked loop index)
for spare_ax in axes[n:]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()

### 3.5. Checking Categorical Variables

In [None]:
# Creating barplots to understand the categorical data

sns.set_style('white')
sns.set_palette(['#5dade2'])

# Create 4 subplots stacked vertically
fig, ax = plt.subplots(nrows=4, ncols=1, dpi=300, figsize=(20, 40))
fig.patch.set_facecolor('white')

# Plot each variable in its own row.
# The exploration in this section is done on `data` (the training copy), so the
# counts are taken from it rather than from test_data.
for row_ax, column in zip(ax, ['Brand', 'model', 'transmission', 'fuelType']):
    sns.countplot(data=data, x=column, ax=row_ax)

# Improve spacing between plots
plt.tight_layout()
plt.show()

### 3.6. Checking Correlation between Variables

In [None]:
# Check correlation between the metric features.
# Spearman (rank) correlation is used because the variables do not follow a normal distribution.
cor_spearman = data[metric_features].corr(method ='spearman')
cor_spearman

In [None]:
# Create correlation matrix to facilitate interpretation

def cor_heatmap(cor, title="Spearman Correlation Matrix", figsize=(12, 10)):
    """Plot a correlation matrix as an annotated heatmap.

    The upper triangle, including the diagonal, is masked so that each pair of
    variables is shown only once.

    Parameters
    ----------
    cor : square correlation matrix (e.g. the result of ``DataFrame.corr``)
    title : str, plot title; defaults to the Spearman title used in this notebook
    figsize : tuple, figure size passed to ``plt.figure``

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=figsize)

    # Create a mask for the upper triangle (k=0, so the diagonal is hidden too)
    mask = np.triu(np.ones_like(cor, dtype=bool))

    # Plot heatmap
    sns.heatmap(
        data=cor,
        mask=mask,
        annot=True,
        cmap='YlGnBu',
        fmt='.2f',
        square=True,
        linewidths=0.5,
        cbar_kws={"shrink": 0.8},
    )

    plt.title(title, fontsize=14)
    plt.show()

In [None]:
# Heatmap of the Spearman correlations computed above
cor_heatmap(cor_spearman)

In [None]:
# Pairwise Relationship of Numerical Variables
sns.set()

# Setting pairplot and use histograms in the diagonal
sns.pairplot(df[metric_features], diag_kind="hist")

# Layout
plt.subplots_adjust(top=0.95)
plt.suptitle("Pairwise Relationship of Numerical Variables", fontsize=20)

# Create the eda directory if it is missing; exist_ok=True replaces the
# separate existence check and is safe when the folder already exists
eda_dir = os.path.join('..', 'figures', 'eda')
os.makedirs(eda_dir, exist_ok=True)

plt.savefig(os.path.join(eda_dir, 'pairwise_numeric_scatterplots.png'), dpi=200)
plt.show()

## 4. Data preparation

### 4.1. Set index

In [None]:
# Each car has its own unique carID, so use it as the row index of both datasets
for _frame in (data, test_data):
    _frame.set_index('carID', inplace=True)

### 4.2. Split the data into train and validation datasets

#### The Hold Out Method

In [None]:
# y is the target variable
y = data['price']
# X keeps every other column (the target is removed)
X = data.drop(columns='price')

In [None]:
# Hold-out split of the features and target into training and validation parts

X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.3,  # 30% will be used for validation
                                                  random_state = 0,      # Ensures the split is always the same every time the code runs
                                                  shuffle = True)        # Shuffles the data before splitting to avoid bias

### 4.3. Changing datatypes

In [None]:
# Cast the float columns to nullable integer dtypes in every split

for _frame in (X_train, X_val, test_data):
    # year and previousOwners: round the floats, then store them as Int32
    for _col in ('year', 'previousOwners'):
        _frame[_col] = _frame[_col].round().astype('Int32')

    # hasDamage: stored as an Int8 flag
    _frame['hasDamage'] = _frame['hasDamage'].astype('Int8')

In [None]:
# Confirm the new dtypes on the training split
X_train.info()

### 4.4. Handling Incoherencies

#### 4.4.1. Categorical Variables 

##### 4.4.1.1. Correcting Spelling Mistakes

In [None]:
##correcting spelling mistakes of 'Brand' for X_train, X_val and test_data

# Canonical brand name -> misspelled / truncated / wrongly-cased variants seen in the data
correct_brand = {
    'VW': ['V', 'vw', 'v', 'W', 'w'],
    'Toyota': ['Toyot', 'TOYOTA', 'oyota', 'toyota', 'OYOTA', 'TOYOT', 'toyot', 'oyot'],
    'Audi': ['udi', 'AUDI', 'audi', 'Aud', 'aud', 'UDI', 'AUD'],
    'Ford': ['FOR', 'ord', 'For', 'FORD', 'ford', 'for', 'or', 'ORD'],
    'BMW': ['MW', 'bmw', 'BM', 'mw', 'M', 'bm'],
    'Skoda': ['koda', 'skoda', 'SKODA', 'Skod', 'kod', 'SKOD', 'KODA', 'skod'],
    'Opel': ['Ope', 'opel', 'pel', 'pe', 'OPEL', 'PEL', 'OPE', 'ope'],
    'Mercedes': ['mercedes', 'Mercede', 'MERCEDES', 'ercedes', 'mercede', 'ERCEDES', 'ercede', 'MERCEDE'],
    'Hyundai': ['yundai', 'Hyunda', 'hyundai', 'HYUNDAI', 'yunda', 'HYUNDA', 'ud', 'hyunda', 'YUNDAI'],
}

# Reverse lookup: every variant points to its canonical brand
replacement_dict = {}
for _canonical, _variants in correct_brand.items():
    replacement_dict.update(dict.fromkeys(_variants, _canonical))

# Apply the corrections to every split
for _frame in (X_train, X_val, test_data):
    _frame["Brand"] = _frame["Brand"].replace(replacement_dict)

# Verify the cleaning
for _frame in (X_train, X_val, test_data):
    print(_frame["Brand"].unique())

In [None]:
##correcting spelling mistakes of 'model' for X_train, X_val and test_data

# Canonical model name -> raw variants seen in the data (leading spaces,
# truncations, wrong casing). Each key appears only once and each list holds
# every variant only once, so no entry is silently overridden.
correct_model = {
    'Golf': [' GOLF', ' Gol', ' golf', 'golf', ' Golf', ' gol', ' GOL', 'Gol', 'GOLF'],
    'Yaris': [' Yaris', ' YARIS', ' Yari', ' yaris', ' yari', 'Yari', ' YARI', 'yaris', 'YARIS'],
    'Q2': [' q2', ' Q2'],
    '2 Series': [' 2 series', ' 2 serie', '2 Series', ' 2 SERIES', ' 2 Serie', '2 Serie', ' 2 Series'],
    '3 Series': [' 3 Series', ' 3 Serie', ' 3 series', ' 3 SERIES', ' 3 serie', '3 Serie'],
    'A3': [' A3', ' a3'],
    'Octavia': [' Octavi', ' OCTAVIA', ' Octavia', ' octavia', 'Octavi', 'octavia', ' octavi', ' OCTAVI'],
    'Passat': [' PASSAT', ' passat', ' Passa', 'Passat', ' Passat', 'PASSAT', ' PASSA'],
    'Insignia': [' Insigni', ' INSIGNIA', ' insignia', ' Insignia', ' INSIGNI', 'Insigni'],
    'Fabia': [' Fabia', ' fabia', ' FABIA', ' Fabi', 'FABIA'],
    'A Class': [' A Clas', ' A Class', ' a class', ' A CLASS', 'a class', 'A CLASS', ' a clas'],
    'Ka+': [' Ka+', ' KA+', ' ka+', 'ka+'],
    'GLC Class': [' GLC Class', ' GLC CLASS', ' GLC Clas', ' glc class', ' glc clas'],
    'I30': [' i30', ' I30'],
    'C Class': [' C Clas', ' C CLASS', ' c class', 'C Clas', ' C CLAS', 'c class', ' c clas', ' C Class', 'C CLASS'],
    'Polo': [' POLO', ' Polo', ' polo', ' Pol', ' POL', 'Pol', 'POLO'],
    'E Class': [' E Class', ' E Clas', ' E CLASS', ' e class', 'E CLASS', 'e class'],
    'Q5': [' Q5', ' q5', 'q5'],
    'Up': ['U', ' up', ' UP', ' Up', ' U', 'UP'],
    'Fiesta': [' FIESTA', ' fiesta', ' Fiest', ' Fiesta', 'fiesta', 'Fiest', 'FIESTA', ' fiest'],
    'C-HR': [' C-H', ' c-hr', ' C-HR', ' c-h'],
    'Mokka X': [' mokka x', ' MOKKA X', ' Mokka X'],
    'Corsa': [' Corsa', ' corsa', ' Cors', ' CORSA', ' cors', ' CORS', 'corsa'],
    'Astra': [' ASTRA', ' Astr', ' Astra', ' astra', 'ASTRA', 'astra'],
    'TT': [' tt', ' TT', ' T'],
    '5 Series': [' 5 Series', ' 5 Serie', ' 5 SERIES', ' 5 series', '5 SERIES', ' 5 SERIE'],
    'Aygo': [' aygo', ' ayg', ' AYGO', ' Ayg', ' Aygo', 'aygo', ' AYG'],
    '4 Series': [' 4 SERIES', ' 4 Serie', ' 4 serie', '4 series', '4 Series', ' 4 Series', ' 4 series'],
    'SLK': [' slk', ' SLK'],
    'Viva': [' viva', ' Viva', ' VIVA', ' Viv', 'viva'],
    'Focus': [' Focus', ' Focu', ' FOCUS', ' focus', ' FOCU', 'focus', 'Focu', ' focu', 'FOCUS'],
    'EcoSport': [' EcoSpor', ' ECOSPORT', ' ecosport', ' EcoSport'],
    'X-CLASS': [' x-clas', ' X-CLAS', ' x-class', ' X-CLASS'],
    'CL Class': [' cl class', ' CL Clas', ' CL CLASS', ' CL Class'],
    'IX20': [' ix20', ' IX20'],
    'Rapid': [' Rapi', ' rapid', ' Rapid'],
    'Auris': [' Auris', ' AURIS', ' auris', ' Auri'],
    'I20': [' i20', ' I20'],
    'X3': [' x3', ' X3'],
    'A8': [' A8', 'a8'],
    'GLS Class': [' GLS Clas', ' GLS CLASS', ' gls class', ' GLS Class'],
    'B-MAX': [' B-MA', ' B-MAX', 'B-MA', ' b-max'],
    'A4': [' A4', ' a4'],
    'Kona': [' KONA', ' Kon', ' Kona', ' KON', ' kona'],
    'I10': [' i10', ' I10'],
    'A1': [' A1', ' a1'],
    'Mokka': [' Mokka ', ' Mokk', ' Mokka', ' mokka ', ' mokka', ' MOKKA', 'Mokka ', 'Mokk'],
    'S-MAX': [' S-MA', ' s-max', ' S-MAX', ' s-ma'],
    'X2': [' x2', ' X2'],
    'Crossland X': [' crossland x', ' CROSSLAND X', ' Crossland X'],
    'Tiguan': [' Tiguan', ' tiguan', ' Tigua', ' TIGUAN', ' TIGUA', 'Tigua', 'TIGUAN', 'tiguan', ' tigua'],
    'A5': [' A5', ' a5', 'a5'],
    'GLE Class': [' GLE Clas', ' GLE Class', ' gle class', ' GLE CLASS'],
    'Zafira': [' Zafira', ' Zafir', ' ZAFIRA', ' zafira', 'Zafir', ' ZAFIR'],
    'Ioniq': [' Ioni', ' Ioniq', ' IONIQ', 'IONIQ', ' ioniq'],
    'A6': [' A6', ' a6'],
    'Yeti Outdoor': [' yeti outdoor', ' Yeti Outdoor', ' YETI OUTDOOR', ' Yeti Outdoo', 'yeti outdoor', ' yeti outdoo'],
    'X1': [' x1', 'x1', ' X1'],
    'Scala': [' SCALA', ' Scala', ' scala', ' Scal', ' scal'],
    'S Class': [' S Class', ' S Clas', ' s class', ' S CLASS'],
    '1 Series': [' 1 Series', ' 1 SERIES', ' 1 Serie', ' 1 series', '1 SERIES', ' 1 SERIE', '1 series', ' 1 serie'],
    'Kamiq': [' KAMIQ', ' KAMI', ' kamiq', ' Kamiq'],
    'Kuga': [' Kug', ' KUGA', ' kuga', 'Kuga', ' Kuga', 'kuga'],
    'Tourneo Connect': [' tourneo connect', ' Tourneo Connect'],
    'Q7': [' q7', ' Q7'],
    'GLA Class': [' GLA Class', ' GLA CLASS', ' GLA Clas', ' gla class'],
    # Single 'Arteon' entry: it used to be declared twice, the later one winning
    'Arteon': [' Arteo', 'Arteo', ' ARTEON', ' arteon', ' Arteon'],
    'SL CLASS': [' SL CLAS', ' SL CLASS', ' sl class', ' SL'],
    'Tucson': [' Tucson', ' TUCSON', ' Tucso', ' tucson', ' TUCSO', 'Tucso'],
    'Santa Fe': [' Santa F', ' santa fe', ' SANTA FE', ' Santa Fe'],
    'Grandland X': [' Grandland X', ' grandland x', ' GRANDLAND X'],
    'RAV4': [' rav4', 'RAV4', 'RAV', ' RAV', 'rav4', ' rav', ' RAV4'],
    'Touran': [' Touran', 'Toura', ' TOURAN', ' touran', ' Toura', ' TOURA'],
    'Citigo': [' Citig', ' citigo', ' Citigo', ' CITIGO', 'CITIGO'],
    'Roomster': [' Roomste', ' Roomster'],
    'Prius': [' PRIUS', ' Prius', 'Prius', ' prius'],
    'Corolla': [' corolla', ' COROLLA', ' Coroll', ' Corolla', 'corolla'],
    'B Class': [' b class', ' B Clas', ' B Class', ' B CLASS', 'b class'],
    'Sharan': [' sharan', ' Shara', ' Sharan', ' SHARAN'],
    'Kodiaq': [' Kodia', ' kodiaq', 'kodiaq', ' KODIAQ', ' Kodiaq'],
    'V Class': [' V Clas', ' V CLASS', ' V Class', ' v class'],
    'Caddy Maxi Life': [' Caddy Maxi Lif', ' Caddy Maxi Life'],
    'Superb': [' Superb', ' Super', ' SUPERB', ' superb', ' super'],
    'T-Roc': [' T-Roc', ' T-RO', ' t-roc', ' T-Ro', ' T-ROC'],
    'Combo Life': [' COMBO LIFE', ' combo life', ' Combo Lif', 'COMBO LIFE', ' Combo Life'],
    'Beetle': [' Beetl', ' Beetle', ' beetle'],
    'Galaxy': [' GALAXY', ' Galax', ' galaxy', ' Galaxy'],
    'M3': [' M3', ' m3'],
    'Gtc': [' gtc', ' GTC', 'gtc', 'GTC'],
    'X4': [' X4', ' x4'],
    'KA': [' Ka', ' ka', ' K', ' KA'],
    'IX35': [' ix35', ' IX35'],
    'Grand Tourneo Connect': [' Grand Tourneo Connec', ' Grand Tourneo Connect'],
    'M4': [' m4', ' M4'],
    'Tourneo Custom': [' tourneo custom', ' Tourneo Custo', ' Tourneo Custom'],
    'Z4': [' Z4', ' z4'],
    'X5': [' X5', ' x5'],
    'Meriva': [' Meriva', ' MERIVA', ' Meriv', ' meriva'],
    'RS6': [' RS6'],
    'Verso': [' VERSO', ' verso', ' Verso', ' Vers'],
    'Touareg': [' Touareg', ' TOUAREG', ' touareg', ' Touare'],
    'Mondeo': [' MONDEO', ' Mondeo', ' mondeo', ' MONDE', ' Monde'],
    'Shuttle': [' shuttle', ' Shuttle', ' SHUTTLE'],
    'CLS Class': [' CLS Class', ' cls class', ' CLS Clas', ' CLS CLASS'],
    'C-MAX': [' C-MAX', ' c-max', ' C-MA'],
    'Puma': [' puma', ' PUMA', ' Puma', 'Pum', ' Pum'],
    'CLA Class': [' CLA Class', ' CLA CLASS', ' cla class', ' CLA Clas'],
    'I40': [' I40', ' i40'],
    'Q3': [' q3', ' Q3'],
    'Tiguan Allspace': [' TIGUAN ALLSPACE', ' tiguan allspace', ' Tiguan Allspac', ' Tiguan Allspace'],
    '6 Series': [' 6 SERIES', ' 6 series', ' 6 Series', ' 6 Serie'],
    'Caravelle': [' caravelle', ' Caravell', ' Caravelle'],
    'Karoq': [' Karoq', ' karoq', ' KAROQ', ' Karo'],
    'I3': [' i3', 'i3', ' I3'],
    'Grand C-MAX': [' GRAND C-MAX', ' grand c-max', ' Grand C-MA', ' Grand C-MAX'],
    'T-Cross': [' T-Cros', ' T-CROSS', ' T-Cross', ' t-cros', ' t-cross'],
    'A7': [' a7', ' A7'],
    'Golf SV': [' Golf SV', ' golf sv', ' GOLF SV'],
    'A': [' a', ' A'],
    'GT86': [' gt86', ' GT86'],
    'Yeti': [' yeti', ' Yet', ' Yeti', ' YETI'],
    'X': [' x', ' X'],
    'Land Cruiser': [' Land Cruise', ' Land Cruiser', ' land cruiser'],
    'EDGE': [' Edge', ' edge', ' Edg', ' EDGE'],
    'X6': [' X6'],
    'Fusion': [' Fusion', ' fusion'],
    'GL Class': [' GL CLASS', ' gl class', ' GL Class', ' GL Clas'],
    'Scirocco': [' scirocco', ' SCIROCCO', ' Scirocc', ' Scirocco'],
    'Z3': [' Z3'],
    'Hilux': [' hilux', ' Hilux', ' Hilu', ' HILU'],
    'Amarok': [' amarok', ' Amarok', ' Amaro'],
    'CC': [' cc', ' CC'],
    '7 Series': [' 7 Serie', ' 7 SERIES', ' 7 series', ' 7 Series'],
    'Avensis': [' AVENSIS', ' avensis', ' Avensis'],
    'M Class': [' m class', ' M CLASS', ' M Class', ' M Clas', ' M CLAS'],
    'Grandland': [' grandland ', ' Grandland '],
    'Zafira Tourer': [' Zafira Toure', ' ZAFIRA TOURER', ' Zafira Tourer', ' zafira tourer'],
    'R8': [' R8', ' r8'],
    'Mustang': [' mustang', ' Mustang'],
    'Q8': [' Q8'],
    'CLK': [' CLK'],
    'RS3': [' RS3'],
    'Jetta': [' JETTA', ' Jetta', ' jetta', 'Jetta'],
    'Supra': [' Supra'],
    'X7': [' X7'],
    'SQ7': [' SQ7', ' sq7'],
    'S3': [' s3', ' S3'],
    'GLB Class': [' glb class', ' GLB Class'],
    'Adam': [' Ada', ' adam', ' ADAM', ' Adam'],
    'M5': [' M5', ' m5'],
    'Golf S': [' golf s', ' Golf S'],
    'Vectra': ['Vectra', ' Vectra', ' VECTRA', 'VECTRA'],
    '8 Series': [' 8 SERIES', ' 8 Serie', ' 8 Series', ' 8 series'],
    'Urban Cruiser': [' Urban Cruise', ' Urban Cruiser'],
    'Fox': ['fox', ' fox', ' Fox'],
    'Q': [' Q'],
    'M2': [' M2'],
    'RS4': [' RS4'],
    'Veloster': [' Veloster', ' Veloste'],
    'IQ': [' IQ'],
    'Agila': [' AGILA', ' Agila'],
    'I2': [' I2'],
    'Antara': [' Antara', ' antara'],
    'G Class': [' G Class', ' G CLAS'],
    'Caddy Life': [' Caddy Life', ' Caddy'],
    'R Class': [' R Class'],
    'I800': [' I800'],
    'Amica': [' Amica'],
    'Crossland': [' Crossland '],
    'Proace Verso': [' proace verso', ' PROACE VERSO', 'PROACE VERSO'],
    'Camry': [' Camry', 'Camry', ' Camr'],
    'Tigra': [' Tigra'],
    'Eos': [' Eos'],
    'M': [' M'],
    'California': [' Californi', ' California'],
    'Ampera': [' Ampera'],
    'I1': [' I1'],
    'S5': [' S5'],
    'CLC Class': [' CLC Class'],
    'Shara': [' SHARA'],
    'I8': [' i8', 'i8'],
    'RS7': [' RS7'],
    'Transit Tourneo': [' Transit Tourneo'],
    'I4': [' I4'],
    'S4': [' S4'],
    'Terracan': [' Terracan'],
    'Cascada': [' Cascada'],
    'S8': [' S8'],
    'A2': [' A2'],
    'Vivaro': [' Vivaro'],
    'RS5': [' RS5'],
    'SQ5': [' SQ5'],
    'Getz': [' Getz'],
    'M6': [' M6'],
    'Caddy Maxi': [' Caddy Maxi'],
    'Z': [' Z'],
    'Verso-S': [' Verso-S'],
    'Kadjar': [' Kadjar'],
    'I80': [' I80'],
    'Streetka': [' Streetka'],
    'RS': [' RS'],
    'I': [' i'],
    'Ranger': [' Ranger'],
    'IX2': [' IX2'],
    'Escort': [' Escort'],
    'Accent': [' Accent'],
}


# Reverse lookup: every raw variant points to its canonical model name
replacement_dict = {}
for _canonical, _variants in correct_model.items():
    replacement_dict.update(dict.fromkeys(_variants, _canonical))

# Replace incorrect model names with the correct ones in every split
for _frame in (X_train, X_val, test_data):
    _frame["model"] = _frame["model"].replace(replacement_dict)

# Verify the cleaning
for _frame in (X_train, X_val, test_data):
    print(_frame["model"].unique())

In [None]:
##correcting spelling mistakes of 'transmission' for X_train, X_val and test_data

# Canonical transmission label -> raw variants seen in the data
correct_transmission = {
    'Semi-Auto': ['Semi-Aut', 'semi-auto', 'emi-Auto', 'SEMI-AUTO', 'SEMI-AUT', 'EMI-AUTO', 'emi-Aut', 'emi-auto', 'semi-aut'],
    'Manual': ['anual', 'manual', 'Manua', 'MANUAL', ' Manual ', 'ANUAL', 'manua', 'anua', 'MANUA', ' manual ', ' MANUAL ', ' Manual', 'Manual ', 'manual '],
    'Automatic': ['AUTOMATIC', 'automatic', 'Automati', 'utomatic', 'UTOMATIC', 'automati', 'AUTOMATI', 'utomati'],
}

# Reverse lookup: every variant points to its canonical label
replacement_dict = {}
for _canonical, _variants in correct_transmission.items():
    replacement_dict.update(dict.fromkeys(_variants, _canonical))

# Labels carrying no information; they are turned into missing values
_uninformative_transmissions = ['unknow','UNKNOWN','nknown','nknow', 'unknown', 'Other']

# Fix the spelling first, then blank out the 'unknown' / 'Other' labels
for _frame in (X_train, X_val, test_data):
    _cleaned = _frame["transmission"].replace(replacement_dict)
    _frame["transmission"] = _cleaned.replace(_uninformative_transmissions, np.nan)

# Verify the cleaning
for _frame in (X_train, X_val, test_data):
    print(_frame["transmission"].unique())

In [None]:
##correcting spelling mistakes of 'fuelType' for X_train, X_val and test_data

# Canonical fuel type -> raw variants seen in the data
correct_fuelType = {
    'Petrol': ['etrol', 'petrol', 'PETROL', 'Petro', 'petro', 'ETROL', 'PETRO', 'etro', 'ETRO'],
    'Diesel': ['diesel','iesel','Diese','DIESEL','DIESE','IESEL','iese','diese','IESE'],
    'Hybrid': ['HYBRID','ybri','Hybri','ybrid','hybrid','YBRID','HYBRI', 'hybri'],
    'Other': ['ther','Othe','OTHER','other'],
}

# Reverse lookup: every variant points to its canonical fuel type
replacement_dict = {}
for _canonical, _variants in correct_fuelType.items():
    replacement_dict.update(dict.fromkeys(_variants, _canonical))

# 'Other' (in any spelling) carries no information; it becomes a missing value
_other_fuel_labels = ['ther','Othe','OTHER','other', 'Other']

# Fix the spelling first, then blank out the 'Other' labels
for _frame in (X_train, X_val, test_data):
    _cleaned = _frame["fuelType"].replace(replacement_dict)
    _frame["fuelType"] = _cleaned.replace(_other_fuel_labels, np.nan)

# Verify the cleaning
for _frame in (X_train, X_val, test_data):
    print(_frame["fuelType"].unique())

##### 4.4.1.2. Check if the Models correspond to the Brand

**Audi**

In [None]:
# Audi models present in X_train (brand matched case-insensitively)
audi = X_train.loc[X_train['Brand'].str.lower().eq('audi')]
unique_audi_models = sorted(set(audi['model'].dropna()))
unique_audi_models

In [None]:
# Audi models present in X_val (brand matched case-insensitively)
audi = X_val.loc[X_val['Brand'].str.lower().eq('audi')]
unique_audi_models_v = sorted(set(audi['model'].dropna()))
unique_audi_models_v

In [None]:
# Audi models present in test_data (brand matched case-insensitively)
audi = test_data.loc[test_data['Brand'].str.lower().eq('audi')]
unique_audi_models = sorted(set(audi['model'].dropna()))
unique_audi_models

- All models seem to correspond to Audi cars, except for models 'A' and 'Q'. Audi has no models named just 'A' or 'Q'; they are normally followed by a number, for example 'A1' or 'Q3', as seen in the dataset.
- Since this is probably a data collection problem in which the number after the letter was lost, we have decided to replace these values with missing values.

In [None]:
# Incomplete model code 'A' becomes a missing value in every split
for _frame in (X_train, X_val, test_data):
    _frame.loc[_frame["model"].eq('A'), "model"] = np.nan

In [None]:
# Incomplete model code 'Q' becomes a missing value in every split
for _frame in (X_train, X_val, test_data):
    _frame.loc[_frame["model"].eq('Q'), "model"] = np.nan

**BMW**

In [None]:
# BMW models present in X_train (brand matched case-insensitively)
BMW = X_train.loc[X_train['Brand'].str.lower().eq('bmw')]
unique_bmw_models = sorted(set(BMW['model'].dropna()))
unique_bmw_models

In [None]:
# BMW models present in X_val (brand matched case-insensitively)
BMW = X_val.loc[X_val['Brand'].str.lower().eq('bmw')]
unique_bmw_models_v = sorted(set(BMW['model'].dropna()))
unique_bmw_models_v

In [None]:
# BMW models present in test_data (brand matched case-insensitively)
BMW = test_data.loc[test_data['Brand'].str.lower().eq('bmw')]
unique_bmw_models = sorted(set(BMW['model'].dropna()))
unique_bmw_models

- Similarly to 'Audi', 'BMW' also has model names that should be followed by a number: 'X', 'Z', 'I' and 'M'.
- Therefore, for the same reason, we are going to replace them with missing values.

In [None]:
# Incomplete model code 'X' becomes a missing value in every split
for _frame in (X_train, X_val, test_data):
    _frame.loc[_frame["model"].eq('X'), "model"] = np.nan

In [None]:
# Incomplete model code 'Z' becomes a missing value.
# Applied to every split (as done for 'A', 'Q', 'X' and 'M'), so validation and
# test rows are cleaned the same way; it is a no-op where the value is absent.
for _frame in (X_train, X_val, test_data):
    _frame.loc[_frame["model"] == 'Z', "model"] = np.nan

In [None]:
# Incomplete model code 'I' becomes a missing value.
# Applied to every split (as done for 'A', 'Q', 'X' and 'M'), so validation and
# test rows are cleaned the same way; it is a no-op where the value is absent.
for _frame in (X_train, X_val, test_data):
    _frame.loc[_frame["model"] == 'I', "model"] = np.nan

In [None]:
# Incomplete model code 'M' becomes a missing value in every split
for _frame in (X_train, X_val, test_data):
    _frame.loc[_frame["model"].eq('M'), "model"] = np.nan

**Ford**

In [None]:
# Ford models present in X_train (brand matched case-insensitively)
Ford = X_train.loc[X_train['Brand'].str.lower().eq('ford')]
unique_ford_models = sorted(set(Ford['model'].dropna()))
unique_ford_models

In [None]:
# Ford models present in X_val (brand matched case-insensitively)
Ford = X_val.loc[X_val['Brand'].str.lower().eq('ford')]
unique_ford_models_v = sorted(set(Ford['model'].dropna()))
unique_ford_models_v

In [None]:
# Ford models present in test_data (brand matched case-insensitively)
Ford = test_data.loc[test_data['Brand'].str.lower().eq('ford')]
unique_ford_models_v = sorted(set(Ford['model'].dropna()))
unique_ford_models_v

- All the models from the lists correspond to Ford models. 

**Hyundai**

In [None]:
# Hyundai models present in X_train (brand matched case-insensitively)
Hyundai = X_train.loc[X_train['Brand'].str.lower().eq('hyundai')]
unique_hyundai_models = sorted(set(Hyundai['model'].dropna()))
unique_hyundai_models

In [None]:
# Hyundai models present in X_val (brand matched case-insensitively)
Hyundai = X_val.loc[X_val['Brand'].str.lower().eq('hyundai')]
unique_hyundai_models_v = sorted(set(Hyundai['model'].dropna()))
unique_hyundai_models_v

In [None]:
# Hyundai models present in test_data (brand matched case-insensitively)
Hyundai = test_data.loc[test_data['Brand'].str.lower().eq('hyundai')]
unique_hyundai_models = sorted(set(Hyundai['model'].dropna()))
unique_hyundai_models

- Models 'Q2', 'Q3', 'Q5', 'Q7' and 'A5' are 'Audi' models, not 'Hyundai' models; Hyundai has no such models.
- Since the number of observations where the brand is 'Hyundai' and the model is one of these is low, it was assumed that the brand name was a mistake, so the brand was changed from 'Hyundai' to 'Audi'.

In [None]:
# Rows labelled 'Hyundai' but carrying an Audi model code are treated as a brand
# typo: the brand is changed to 'Audi'. The model value is already correct, so it
# is left untouched.
# The same list of codes is applied to every split, so a code that shows up in
# another split is fixed as well; it is a no-op where the combination is absent.
audi_models_under_hyundai = ['Q2', 'Q3', 'Q5', 'Q7', 'A5']

# X_train
mask_train = (X_train["Brand"] == "Hyundai") & X_train["model"].isin(audi_models_under_hyundai)
X_train.loc[mask_train, "Brand"] = "Audi"

# X_val
mask_val = (X_val["Brand"] == "Hyundai") & X_val["model"].isin(audi_models_under_hyundai)
X_val.loc[mask_val, "Brand"] = "Audi"

# test_data
mask_test = (test_data["Brand"] == "Hyundai") & test_data["model"].isin(audi_models_under_hyundai)
test_data.loc[mask_test, "Brand"] = "Audi"


**Mercedes**

In [None]:
# Mercedes models present in X_train (brand matched case-insensitively)
Mercedes = X_train.loc[X_train['Brand'].str.lower().eq('mercedes')]
unique_mercedes_models = sorted(set(Mercedes['model'].dropna()))
unique_mercedes_models

In [None]:
# Mercedes models present in X_val (brand matched case-insensitively)
Mercedes = X_val.loc[X_val['Brand'].str.lower().eq('mercedes')]
unique_mercedes_models_v = sorted(set(Mercedes['model'].dropna()))
unique_mercedes_models_v

In [None]:
# Mercedes models present in test_data (brand matched case-insensitively)
Mercedes = test_data.loc[test_data['Brand'].str.lower().eq('mercedes')]
unique_mercedes_models = sorted(set(Mercedes['model'].dropna()))
unique_mercedes_models

- In Mercedes, '200', '220', '230' and '180' are not complete model names: they refer only to the engine/variant, not to the class or body style.
- Therefore, we have decided to replace them with missing values.

In [None]:
# Engine/variant codes that are not complete model names become missing values.
# Every code is checked in every split, so a code that shows up in another split
# is cleaned as well; it is a no-op where the value is absent.
engine_only_codes = ['200', '220', '230', '180']

for _frame in (X_train, X_val, test_data):
    _frame.loc[_frame["model"].isin(engine_only_codes), "model"] = np.nan

**Opel**

In [None]:
# Opel models present in X_train (brand matched case-insensitively)
Opel = X_train.loc[X_train['Brand'].str.lower().eq('opel')]
unique_opel_models = sorted(set(Opel['model'].dropna()))
unique_opel_models

In [None]:
# Opel models present in X_val (brand matched case-insensitively)
Opel = X_val.loc[X_val['Brand'].str.lower().eq('opel')]
unique_opel_models_v = sorted(set(Opel['model'].dropna()))
unique_opel_models_v

In [None]:
# Opel models present in test_data (brand matched case-insensitively)
Opel = test_data.loc[test_data['Brand'].str.lower().eq('opel')]
unique_opel_models = sorted(set(Opel['model'].dropna()))
unique_opel_models

- 'Kadjar' is a 'Renault' model, not an 'Opel' model.
- Since the brand 'Renault' is not in our dataset, we have decided to replace this model with missing values.

In [None]:
# 'Kadjar' does not belong to any brand in the dataset, so it becomes a missing value.
# test_data is included as well so the three splits are cleaned consistently;
# it is a no-op where the value is absent.
for _frame in (X_train, X_val, test_data):
    _frame.loc[_frame["model"] == 'Kadjar', "model"] = np.nan

**Skoda**

In [None]:
# Skoda models present in X_train (brand matched case-insensitively)
Skoda = X_train.loc[X_train['Brand'].str.lower().eq('skoda')]
unique_skoda_models = sorted(set(Skoda['model'].dropna()))
unique_skoda_models

In [None]:
# Skoda models present in X_val (brand matched case-insensitively)
Skoda = X_val.loc[X_val['Brand'].str.lower().eq('skoda')]
unique_skoda_models_v = sorted(set(Skoda['model'].dropna()))
unique_skoda_models_v

In [None]:
#Check the models of 'Skoda' for test_data
Skoda = test_data[test_data['Brand'].str.lower() == 'skoda']
unique_skoda_models = sorted(Skoda['model'].dropna().unique())
unique_skoda_models

- All the models showed seem to correspond to the brand 'Skoda'.

**VW**

In [None]:
#Distinct 'VW' models present in X_train
vw_mask = X_train['Brand'].str.lower() == 'vw'
VW = X_train.loc[vw_mask]
unique_vw_models = sorted(VW['model'].dropna().unique())
unique_vw_models

In [None]:
#Distinct 'VW' models present in X_val
vw_mask = X_val['Brand'].str.lower() == 'vw'
VW = X_val.loc[vw_mask]
unique_vw_models_v = sorted(VW['model'].dropna().unique())
unique_vw_models_v

In [None]:
#Distinct 'VW' models present in test_data
vw_mask = test_data['Brand'].str.lower() == 'vw'
VW = test_data.loc[vw_mask]
unique_vw_models = sorted(VW['model'].dropna().unique())
unique_vw_models

In [None]:
#fixing the truncated VW model name 'Shara' -> 'Sharan' in test_data
is_shara = test_data["model"] == 'Shara'
test_data.loc[is_shara, "model"] = 'Sharan'

- In X_train and X_val, all the models shown seem to correspond to the brand 'VW'.
- In test_data, *Shara* is not the right VW model, the correct would be 'Sharan'. 

#### 4.4.2. Numerical Variables 

##### 4.4.2.1. Correcting variables' incoherent values

**Previous Owners**

In [None]:
#negative 'previousOwners' values in X_train
X_train.loc[X_train['previousOwners'] < 0, 'previousOwners']

In [None]:
#negative 'previousOwners' values in X_val
X_val.loc[X_val['previousOwners'] < 0, 'previousOwners']

In [None]:
#negative 'previousOwners' values in test_data
test_data.loc[test_data['previousOwners'] < 0, 'previousOwners']

- The training dataset has 265 cars with negative owners and the validation set has 106, all equal to -2, which is not possible.
- Since the mean value of *previousOwners* is 1.994580 and the median is 2, we assumed that these negative values are typing mistakes made during data collection, which added the '-'. Therefore, we decided to change all of the negative values to positive.

In [None]:
#replacing the erroneous value -2 by its absolute value (2) in every dataset
for frame in (X_train, X_val, test_data):
    frame['previousOwners'] = frame['previousOwners'].replace(-2, 2)

**Mileage**

In [None]:
#rows of X_train with a negative mileage
X_train.loc[X_train['mileage'] < 0]

In [None]:
#rows of X_val with a negative mileage
X_val.loc[X_val['mileage'] < 0]

In [None]:
#rows of test_data with a negative mileage
test_data.loc[test_data['mileage'] < 0]

- Here we have 247 negative values in the training set and 122 in the validation. 
- Since it appears that these negative numbers do not follow any kind of pattern, we decided to convert them into missing values to fill them in later.

In [None]:
#negative mileages become missing values in every dataset
for frame in (X_train, X_val, test_data):
    frame.loc[frame["mileage"] < 0, "mileage"] = np.nan

**Mpg**

In [None]:
#rows of X_train with a negative mpg
X_train.loc[X_train['mpg'] < 0]

In [None]:
#rows of X_val with a negative mpg
X_val.loc[X_val['mpg'] < 0]

In [None]:
#rows of test_data with a negative mpg
test_data.loc[test_data['mpg'] < 0]

- We can observe that the negative values are all the same. In the context of the variable 'mpg', it is not normal to have dozens of cars with the exact same number of miles per gallon, since a car's consumption depends on many factors such as the driver, the age of the car and the way the car is used. We have also noticed that all of the cars with negative values are BMWs, so it was probably an error related to the brand.
- Therefore we decided to replace them with missing values to fill them afterwards.

In [None]:
#negative mpg values become missing values in every dataset
for frame in (X_train, X_val, test_data):
    frame.loc[frame["mpg"] < 0, "mpg"] = np.nan

In [None]:
#electric cars in X_train
X_train.loc[X_train['fuelType'] == 'Electric']

In [None]:
#electric cars in X_val
X_val.loc[X_val['fuelType'] == 'Electric']

In [None]:
#electric cars in test_data
test_data.loc[test_data['fuelType'] == 'Electric']

- Also it does not make sense for an electric car to have the variable *mpg* since electric cars do not use petrol, so “miles per gallon” is meaningless.
- Therefore, we have decided to replace *mpg* of electric cars by 0.

In [None]:
#changing the mpg of electric cars to 0
#the rule is applied to X_train, X_val and test_data so that any electric car is treated the same way
#(it is a no-op for a dataset that contains no electric cars)
X_train.loc[X_train['fuelType'] == 'Electric', 'mpg'] = 0
X_val.loc[X_val['fuelType'] == 'Electric', 'mpg'] = 0
test_data.loc[test_data['fuelType'] == 'Electric', 'mpg'] = 0

**Engine Size**

In [None]:
#rows of X_train where engineSize is lower than 0.5
X_train.loc[X_train['engineSize'] < 0.5]

In [None]:
#rows of X_val where engineSize is lower than 0.5
X_val.loc[X_val['engineSize'] < 0.5]

In [None]:
#rows of test_data where engineSize is lower than 0.5
test_data.loc[test_data['engineSize'] < 0.5]

- The negative values of EngineSize are all equal and they all belong to a ford. However, the model is different. It is not common for these models to have the same engine size, therefore, we have decided to replace by missing values. 
- In addition, we have rows where the value is less than 0.5. In those cases, the numbers are changed to missing values as it is highly unlikely to have cars with engine size smaller than 0.5.

In [None]:
#engine sizes below 0.5 (which includes the negative ones) become missing values in every dataset
for frame in (X_train, X_val, test_data):
    frame.loc[frame["engineSize"] < 0.5, "engineSize"] = np.nan

**Tax**

In [None]:
#rows of X_train with a negative tax
X_train.loc[X_train['tax'] < 0]

In [None]:
#rows of X_val with a negative tax
X_val.loc[X_val['tax'] < 0]

In [None]:
#rows of test_data with a negative tax
test_data.loc[test_data['tax'] < 0]

- Even though 'tax' has different values for different cars, it can never be negative, because that would mean the government is paying the owner to own the car, which is impossible.
- We have decided to replace these values by missing values. 

In [None]:
#negative taxes become missing values in every dataset
for frame in (X_train, X_val, test_data):
    frame.loc[frame["tax"] < 0, "tax"] = np.nan

**Paint Quality (%)**

- According to project guidelines, we should be able to "create a predictive model capable of evaluating the price of a car based on the user’s input without needing the car to be taken to a mechanic."
- As *paintQuality%* is " The mechanic’s assessment of the cars’ overall paint quality and hull integrity (filled by the mechanic during evaluation).", this variable is not considered valid and accurate to predict the model, so it would not be considered from now on. 

In [None]:
#'paintQuality%' is filled in by the mechanic, so it is removed from every dataset
for frame in (X_train, X_val, test_data):
    frame.drop(columns='paintQuality%', inplace=True)

X_train.head(3)

**Year**

In [None]:
#rows of X_train registered after 2020
X_train.loc[X_train['year'] > 2020]

In [None]:
#rows of X_val registered after 2020
X_val.loc[X_val['year'] > 2020]

In [None]:
#rows of test_data registered after 2020
test_data.loc[test_data['year'] > 2020]

- Since the dataset we are analysing is from 2020, it does not make sense to have years after 2020. Noting that the only years that appear in the dataset after 2020 are 2023 and 2024.
- Therefore, we have decided to replace these years by missing values, assuming there was an error in the system. These years could be 2013 and 2014 instead.

In [None]:
#years after 2020 become missing values in every dataset
for frame in (X_train, X_val, test_data):
    frame.loc[frame["year"] > 2020, "year"] = np.nan

### 4.5. Removing Duplicates

In [None]:
#number of fully duplicated rows in X_train after handling incoherencies
train_duplicate_flags = X_train.duplicated()
int(train_duplicate_flags.sum())

In [None]:
#number of fully duplicated rows in X_val after handling incoherencies
val_duplicate_flags = X_val.duplicated()
int(val_duplicate_flags.sum())

In [None]:
#keeping only the first occurrence of each duplicated row in X_train and X_val
X_train = X_train.loc[~X_train.duplicated()]
X_val = X_val.loc[~X_val.duplicated()]

### 4.6. Treating Outliers

In [None]:
#redefining metric features after not considering 'paintQuality%'
metric_features = ['year', 'mileage', 'tax', 'mpg',
                    'engineSize', 'previousOwners', 'hasDamage']

In [None]:
#checking outliers again after handling incoherencies on X_train
plot_multiple_boxplots(X_train, metric_features, title="Train Set: Numeric Variables' Box Plots")

In [None]:
#checking outliers again after handling incoherencies on X_val
#(the title now names the validation set; it previously said "Train Set")
plot_multiple_boxplots(X_val, metric_features, title="Validation Set: Numeric Variables' Box Plots")

In [None]:
#checking outliers again after handling incoherencies on test_data
#(the title now names the test set; it previously said "Train Set")
plot_multiple_boxplots(test_data, metric_features, title="Test Set: Numeric Variables' Box Plots")

**Year**

In [None]:
#rows of X_train registered before 1995
X_train.loc[X_train['year'] < 1995]

In [None]:
#rows of X_val registered before 1995
X_val.loc[X_val['year'] < 1995]

In [None]:
#rows of test_data registered before 1995
test_data.loc[test_data['year'] < 1995]

In [None]:
#replacing years before 1995 by missing values
#X_val is now treated too, so the three datasets follow the same outlier rule
#(no-op if X_val has no such rows)
X_train.loc[X_train["year"] < 1995, "year"] = np.nan
X_val.loc[X_val["year"] < 1995, "year"] = np.nan
test_data.loc[test_data["year"] < 1995, "year"] = np.nan

**Mileage**

In [None]:
#rows of X_train with more than 150000 miles
X_train.loc[X_train['mileage'] > 150000]

In [None]:
#rows of X_val with more than 150000 miles
X_val.loc[X_val['mileage'] > 150000]

In [None]:
#rows of test_data with more than 150000 miles
test_data.loc[test_data['mileage'] > 150000]

In [None]:
#mileages higher than 150000 become missing values in every dataset
for frame in (X_train, X_val, test_data):
    frame.loc[frame["mileage"] > 150000, "mileage"] = np.nan

**Tax**

In [None]:
#rows of X_train with a tax below 50
X_train.loc[X_train['tax'] < 50]

- Since the number of cars with *tax* less than 50 is large, these values will not be changed.

In [None]:
#rows of X_train with a tax above 350
#(original note: "the same as 400" - presumably a threshold of 400 selects the same rows; TODO confirm)
X_train.loc[X_train['tax'] > 350]

In [None]:
#rows of X_val with a tax above 350
X_val.loc[X_val['tax'] > 350]

In [None]:
#rows of test_data with a tax above 350
test_data.loc[test_data['tax'] > 350]

In [None]:
#taxes higher than 350 become missing values in every dataset
for frame in (X_train, X_val, test_data):
    frame.loc[frame["tax"] > 350, "tax"] = np.nan

**Mpg**

In [None]:
#rows of X_train with mpg above 300
X_train.loc[X_train['mpg'] > 300]

In [None]:
#rows of X_val with mpg above 300
X_val.loc[X_val['mpg'] > 300]

In [None]:
#rows of test_data with mpg above 300
test_data.loc[test_data['mpg'] > 300]

- All mpg > 300 are from BMW I3 and they all have the same mpg.

In [None]:

#mpg values higher than 300 become missing values in every dataset
for frame in (X_train, X_val, test_data):
    frame.loc[frame["mpg"] > 300, "mpg"] = np.nan

**EngineSize**

In [None]:
#rows of X_train with an engine size above 5
X_train.loc[X_train['engineSize'] > 5]

In [None]:
#rows of X_val with an engine size above 5
X_val.loc[X_val['engineSize'] > 5]

In [None]:
#rows of test_data with an engine size above 5
test_data.loc[test_data['engineSize'] > 5]

In [None]:
#engine sizes higher than 5 become missing values in every dataset
for frame in (X_train, X_val, test_data):
    frame.loc[frame["engineSize"] > 5, "engineSize"] = np.nan

### 4.7. Treating Missing Values

#### 4.7.1. Numerical Variables 

**KNN Imputer**

In [None]:
#missing values per numeric feature in X_train (largest first)
X_train[metric_features].isnull().sum().sort_values(ascending=False)

In [None]:
#missing values per numeric feature in X_val (largest first)
X_val[metric_features].isnull().sum().sort_values(ascending=False)

In [None]:
#missing values per numeric feature in test_data (largest first)
test_data[metric_features].isnull().sum().sort_values(ascending=False)

- Most of our variables are MAR (missing at random), which means that missingness depends on known variables. For example, we can infer that mileage depends on brand, year and engine size; mpg depends on fuel type and model; tax depends on fuel type and year, etc.
- This means that the "correct" value for a missing record would not rely on the global mean, but on similar cars. Therefore, the KNN imputer is a good solution for this type of problem, since it is capable of dealing with MAR while capturing non-linear relationships.

In [None]:
#frequency encoding of 'Brand' (frequencies learned on X_train only)
freq_encoding_brand = X_train['Brand'].value_counts()
for frame in (X_train, X_val, test_data):
    frame['brand_encoded'] = frame['Brand'].map(freq_encoding_brand)

In [None]:
#frequency encoding of 'model' (frequencies learned on X_train only)
freq_encoding_model = X_train['model'].value_counts()
for frame in (X_train, X_val, test_data):
    frame['model_encoded'] = frame['model'].map(freq_encoding_model)

In [None]:
#frequency encoding of 'transmission' (frequencies learned on X_train only)
freq_encoding_transmission = X_train['transmission'].value_counts()
for frame in (X_train, X_val, test_data):
    frame['transmission_encoded'] = frame['transmission'].map(freq_encoding_transmission)

In [None]:
#fuel type distribution in each dataset
for frame in (X_train, X_val, test_data):
    print(frame["fuelType"].value_counts())

In [None]:
#grouping the rare fuel types into 'Other' before encoding
rare_fuels = ['Electric', 'Other']
for frame in (X_train, X_val, test_data):
    frame['fuelType'] = frame['fuelType'].replace(rare_fuels, 'Other')

#frequency encoding of 'fuelType' (frequencies learned on X_train only)
freq_encoding_fuel = X_train['fuelType'].value_counts()
for frame in (X_train, X_val, test_data):
    frame['fuelType_encoded'] = frame['fuelType'].map(freq_encoding_fuel)

In [None]:
# The raw categorical columns ('Brand', 'model', 'transmission', 'fuelType') are intentionally NOT
# dropped at this point. Section 4.8.2 reads X_train['Brand'], X_train['model'], X_train['transmission']
# and X_train['fuelType'] again and drops them itself afterwards, so dropping them here makes a
# top-to-bottom run fail with a KeyError. The '*_encoded' columns created above are only auxiliary
# numeric features for the KNN imputer, which selects its inputs through `numeric_cols`.
# NOTE(review): section 4.7.2 presumably needs these columns too (via `non_metric_features`) - confirm.

In [None]:
# keep in the targets only the rows that are still present in the feature sets
train_index = X_train.index
val_index = X_val.index
y_train = y_train.loc[train_index]
y_val = y_val.loc[val_index]

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

In [None]:
#missing values per column in test_data (largest first)
test_data.isnull().sum().sort_values(ascending=False)

In [None]:
#scaling the datasets first, since the KNN imputer relies on distances

#numerical variables whose missing values need to be filled
numeric_cols = ['tax', 'mpg', 'mileage', 'engineSize', 'hasDamage', 'previousOwners', 'year', 'model_encoded', 'brand_encoded', 'fuelType_encoded', 'transmission_encoded']
scaler = MinMaxScaler()

# the scaler is fitted on the training dataset only (to avoid leakage) ...
scaler.fit(X_train[numeric_cols])

# ... and the same scaling is then applied to every dataset
for frame in (X_train, X_val, test_data):
    frame[numeric_cols] = scaler.transform(frame[numeric_cols])


In [None]:
# KNN imputer fitted on the (scaled) training data and applied to every dataset
knn = KNNImputer(n_neighbors=5, weights='distance')
knn.fit(X_train[numeric_cols])

for frame in (X_train, X_val, test_data):
    frame[numeric_cols] = knn.transform(frame[numeric_cols])

In [None]:
#remaining missing values in X_train
X_train.isnull().sum().sort_values(ascending=False)

In [None]:
#remaining missing values in X_val
X_val.isnull().sum().sort_values(ascending=False)

In [None]:
#remaining missing values in test_data
test_data.isnull().sum().sort_values(ascending=False)

#### 4.7.2. Categorical Variables

**Filling with the mode**

In [None]:
#missing values per categorical feature before imputation (X_train)
X_train[non_metric_features].isnull().sum().sort_values(ascending=False)

In [None]:
#missing values per categorical feature before imputation (X_val)
X_val[non_metric_features].isnull().sum().sort_values(ascending=False)

In [None]:
#missing values per categorical feature before imputation (test_data)
test_data[non_metric_features].isnull().sum().sort_values(ascending=False)

In [None]:
# the mode of each categorical feature is learned on X_train and used to fill every dataset
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_imputer.fit(X_train[non_metric_features])

for frame in (X_train, X_val, test_data):
    frame.loc[:, non_metric_features] = cat_imputer.transform(frame[non_metric_features])

In [None]:
#missing values per categorical feature after imputation (X_train)
X_train[non_metric_features].isnull().sum().sort_values(ascending=False)

In [None]:
#missing values per categorical feature after imputation (X_val)
X_val[non_metric_features].isnull().sum().sort_values(ascending=False)

In [None]:
#missing values per categorical feature after imputation (test_data)
test_data[non_metric_features].isnull().sum().sort_values(ascending=False)

### 4.8. Feature Engineering

#### 4.8.1. Creating New Variables

In [None]:
# new feature: mpg per unit of engine size
# NOTE(review): 'mpg' and 'engineSize' were already min-max scaled before the KNN imputation, so the
# ratio is computed on scaled values and engineSize == 0 produces inf/NaN (handled in the scaling
# section) - confirm this is intended.
for frame in (X_train, X_val, test_data):
    frame['engineEfficiency'] = frame['mpg'] / frame['engineSize']

#### 4.8.2. Encoding Categorical Features

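- Frequency encoding is used to transform the variables *Brand*, *model*, *transmission* and *fuelType* into numerical features, with the frequencies learned on the training set only. Categories of the validation and test sets that do not appear in the training set receive the mean training frequency.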

In [None]:
#frequency encoding of 'Brand'; categories unseen in X_train get the mean training frequency
freq_encoding_brand = X_train['Brand'].value_counts()
mean_freq_brand = freq_encoding_brand.mean()
X_train['brand_encoded'] = X_train['Brand'].map(freq_encoding_brand)
for frame in (X_val, test_data):
    frame['brand_encoded'] = frame['Brand'].map(freq_encoding_brand).fillna(mean_freq_brand)


In [None]:
#frequency encoding of 'model'; categories unseen in X_train get the mean training frequency
freq_encoding_model = X_train['model'].value_counts()
mean_freq_model = freq_encoding_model.mean()
X_train['model_encoded'] = X_train['model'].map(freq_encoding_model)
for frame in (X_val, test_data):
    frame['model_encoded'] = frame['model'].map(freq_encoding_model).fillna(mean_freq_model)

In [None]:
#frequency encoding of 'transmission'; categories unseen in X_train get the mean training frequency
freq_encoding_transmission = X_train['transmission'].value_counts()
mean_freq_transmission = freq_encoding_transmission.mean()
X_train['transmission_encoded'] = X_train['transmission'].map(freq_encoding_transmission)
for frame in (X_val, test_data):
    frame['transmission_encoded'] = frame['transmission'].map(freq_encoding_transmission).fillna(mean_freq_transmission)

- We observed that the fuelType variable contained, only 4 Electric cars, which is an extremely low representation for a category, and 46 rows defined as "other". To prevent overfitting and unstable coefficients, these categories were grouped together into the Other class.

In [None]:
#fuel type distribution in each dataset
for frame in (X_train, X_val, test_data):
    print(frame["fuelType"].value_counts())

In [None]:
#grouping the rare fuel types into 'Other' before encoding
rare_fuels = ['Electric', 'Other']
for frame in (X_train, X_val, test_data):
    frame['fuelType'] = frame['fuelType'].replace(rare_fuels, 'Other')

#frequency encoding of 'fuelType'; categories unseen in X_train get the mean training frequency
freq_encoding_fuel = X_train['fuelType'].value_counts()
mean_freq_fuel = freq_encoding_fuel.mean()
X_train['fuelType_encoded'] = X_train['fuelType'].map(freq_encoding_fuel)
for frame in (X_val, test_data):
    frame['fuelType_encoded'] = frame['fuelType'].map(freq_encoding_fuel).fillna(mean_freq_fuel)

In [None]:
#column types and non-null counts of X_train after encoding
X_train.info()

In [None]:
#missing values per column in X_val (largest first)
X_val.isnull().sum().sort_values(ascending=False)

In [None]:
#missing values per column in test_data (largest first)
test_data.isnull().sum().sort_values(ascending=False)

In [None]:
#the raw categorical columns are no longer needed once their encoded versions exist
encoded_source_cols = ['Brand', 'model', 'transmission', 'fuelType']
X_train = X_train.drop(columns=encoded_source_cols)
X_val = X_val.drop(columns=encoded_source_cols)
test_data = test_data.drop(columns=encoded_source_cols)

In [None]:
# keep in the targets only the rows that are still present in the feature sets
train_index = X_train.index
val_index = X_val.index
y_train = y_train.loc[train_index]
y_val = y_val.loc[val_index]

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

### 4.9. Scaling

In [None]:
#number of infinite values of 'engineEfficiency' in each dataset
for frame in (X_train, X_val, test_data):
    print(np.isinf(frame["engineEfficiency"]).sum())

In [None]:
X_train.head(5)

In [None]:
#infinite values of 'engineEfficiency' become missing values
inf_to_nan = {"engineEfficiency": {np.inf: np.nan, -np.inf: np.nan}}
X_train = X_train.replace(inf_to_nan)
X_val = X_val.replace(inf_to_nan)
test_data = test_data.replace(inf_to_nan)


In [None]:
#missing values are filled with the mean computed on the training set only
mean_engineEfficiency = X_train['engineEfficiency'].mean()
for frame in (X_train, X_val, test_data):
    frame['engineEfficiency'] = frame['engineEfficiency'].fillna(mean_engineEfficiency)


In [None]:
#remaining missing values in X_train
X_train.isna().sum()



In [None]:
#remaining missing values in X_val
X_val.isna().sum()


In [None]:
#remaining missing values in test_data
test_data.isna().sum()

- We replaced the infinite values by missing values and filled them because, if there were infinite values in *engineEfficiency*, the scaler would not work.

In [None]:
#variables that are not scaled yet
rest_var = [ 'engineEfficiency']

# NOTE(review): this re-fits the same `scaler` object that was fitted on `numeric_cols`, so its
# previous parameters are overwritten. Also, the '*_encoded' columns re-created in section 4.8.2
# hold raw frequencies and are not part of `rest_var` - confirm that leaving them unscaled is intended.

# the scaler is fitted on the training dataset only (to avoid leakage) ...
scaler.fit(X_train[rest_var])

# ... and the same scaling is then applied to every dataset
for frame in (X_train, X_val, test_data):
    frame[rest_var] = scaler.transform(frame[rest_var])

In [None]:
#checking if X_train and y_train have the same number of rows
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

In [None]:
#checking if X_val and y_val have the same number of rows
#(the validation objects are printed now; the train shapes were printed here by mistake)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

In [None]:
#missing values per column in test_data (largest first)
test_data.isnull().sum().sort_values(ascending=False)

In [None]:
#first rows of the prepared training set
X_train.head(5)

In [None]:
#first rows of the prepared validation set
X_val.head(5)

In [None]:
#first rows of the prepared test set
test_data.head(5)

### 4.10. Feature Selection

In order to correctly predict the price of the cars in our test dataset, it is important to choose a group of features that is sufficiently relevant to train the model. The choice of which features to keep was based on the analysis of filter, wrapper and embedded methods. The filter methods developed were the *Variance*, the *Spearman Correlation Matrix* and the *Mutual Information (MI)*, while in terms of wrapper methods, the *RFECV* and the *Sequential Feature Selector* with *Forward Selection* were the two methods chosen. In addition, *Lasso* and *Random Forest Feature Importance* were also taken into consideration as part of embedded methods. 
We have decided to analyse and retrieve insights from each method individually and, at the end, to group all the information in one table, so that it is easier to understand which features should be kept. 
In feature selection, we are only going to consider *X_train* and *X_val*.

#### 4.10.1. Filter Methods 
These methods are characterized by being statistical approaches that do not consider any machine learning algorithm. Our objective is to compare our features against the target variable (*price*). However, these methods alone are not enough to conduct a good feature selection, because we are only looking at relationships between two variables at a time. There are cases where variables can be "jointly significant". 

##### 4.10.1.1. Variance 
First, we looked at the variance of each variable to understand if there are variables whose variance is null or almost null. If the variance of a variable is null, the variable is irrelevant for training the model because all its observations are equal or similar, not adding any new information. 

In [None]:
X_train.var()

In [None]:
X_val.var()

As previously seen, *hasDamage* has a variance of 0, which indicates that it should not be kept in the model. None of the remaining variables presents a variance near 0, so according to this method, all variables should be kept except for **hasDamage**.

##### 4.10.1.2. Spearman correlation

In [None]:
# Features and target in a single dataframe; `.values` attaches the target by position,
# independently of the index labels
df_train = X_train.copy()
price_values = y_train.values
df_train['price'] = price_values

# Spearman (rank) correlation between every pair of columns
cor_spearman = df_train.corr(method='spearman')
cor_spearman

In [None]:
# Hide the upper triangle (diagonal included), since the matrix is symmetric
mask = np.triu(np.ones(cor_spearman.shape, dtype=bool))

# White style
sns.set_theme(style="white")

# Heatmap of the lower triangle of the correlation matrix
plt.figure(figsize=(16, 12))

heatmap_options = dict(
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)
sns.heatmap(cor_spearman, **heatmap_options)

plt.title("Correlation Matrix Including Price", fontsize=18, pad=20)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

As already seen in data exploration, the only two pairs of features that have a high correlation (absolute value close to or above 0.8) are *mileage* vs *year* and *engineEfficiency* vs *engineSize*, which present negative correlations of -0.79 and -0.90, respectively. It is important to note that, even though 
these correlations are usually a sign of redundancy, all of these features have good correlations with *price* (either positive or negative), and a Spearman correlation matrix only gives information about pairwise monotonic relationships, which, as seen before, are not predominant in our dataset. For that reason, according to this analysis, these variables should be kept in our model.  
In addition, *previousOwners*, *hasDamage* and *fuelType_Other* have no correlation or very little correlation with the other variables, including *price*. 
This indicates these features are not relevant to predict our target.
Therefore, according to this method, all variables should be kept except for **hasDamaged** and **previousOwners**.

##### 4.10.1.3. Mutual Information (MI)

In [None]:
print(X_train.shape, y_train.shape)

In [None]:
mi_scores = mutual_info_regression(X_train, y_train, random_state=0)

mi_df = pd.DataFrame({
    'Feature': X_train.columns,
    'MI_Score': mi_scores
}).sort_values(by='MI_Score', ascending=False)

print(mi_df.head(10))

top_mi_features = mi_df.head(15)['Feature']
X_train_mi = X_train[top_mi_features]

In [None]:
# Scores sorted from the most to the least informative feature and normalised by the best score
mi_df = mi_df.sort_values(by='MI_Score', ascending=False).reset_index(drop=True)
best_mi_score = mi_df['MI_Score'].max()
mi_df['MI_Score_norm'] = mi_df['MI_Score'] / best_mi_score

# Normalised MI score of every feature, as a curve
plt.figure(figsize=(10, 6))
plt.plot(
    mi_df['Feature'],
    mi_df['MI_Score_norm'],
    marker='o',
    color='steelblue',
    linewidth=2,
)
plt.title("Mutual Information Scores — Feature Importance", fontsize=14, fontweight='bold')
plt.xlabel("Feature", fontsize=12)
plt.ylabel("Normalized MI Score", fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

Considering that the *Spearman Correlation Matrix* only analyses monotonic relationships between pairs of variables, we have decided to compute the *Mutual Information (MI)*, which gives us information about the nonlinear predictive power of the features. 
The MI scores show that *engineEfficiency* and *model_encoded* are the two most informative predictors, followed by *mpg*, *engineSize*, *year*, *brand_encoded* and *mileage*.
After the top 7 features, the MI values either flatten considerably or the Normalized MI score is low (<0.4), indicating that additional variables give us little new information to predict *price*. 
Therefore, according to this method, the variables **previousOwners**, **hasDamaged**, **fuelType_encoded**, **transmission_encoded** and **tax** should not be consider when training the model

#### 4.10.2. Wrapper Methods 

##### 4.10.2.1. RFECV

In [None]:
# Training rows first, validation rows after them
X_combined = np.concatenate((X_train, X_val), axis=0)
y_combined = np.concatenate((y_train, y_val), axis=0)

# Fold index of every row: -1 = always used for training, 0 = validation fold
n_train_rows = len(X_train)
n_val_rows = len(X_val)
test_fold = n_train_rows * [-1] + n_val_rows * [0]

print('Test fold: ', len(test_fold))
print('X_combined: ', len(X_combined))
print('y_combined: ', len(y_combined))

# Single train/validation split that reuses the existing validation set
ps = PredefinedSplit(test_fold=test_fold)

In [None]:
# pre-pruned random forest used as the estimator of the wrapper methods

rf_params = {
    "n_estimators": 200,   # number of trees
    "max_depth": 10,
    "min_samples_leaf": 4,
    "max_features": "sqrt",
    "min_impurity_decrease": 0.001,
    "random_state": 0,
    "n_jobs": -1,
}
base_estimator = RandomForestRegressor(**rf_params)

In [None]:
# Alternative random-forest configuration, kept for reference only: it is wrapped in a string
# literal, so no estimator is created here.
'''base_estimator = RandomForestRegressor(
    n_estimators=400,
    max_depth=10,
    min_samples_split=12,
    min_samples_leaf=6,
    max_features=0.6,
    random_state=42,
    n_jobs=-1
)'''

We tried this configuration, but although it gave better results, it led to more overfitting.

In [None]:
# Find the optimal number of features with RFECV.
# The scorer is the negated MSE, because RFECV maximises the score.
neg_mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# 1. Execute RFECV; with the PredefinedSplit every candidate subset is fitted on the
#    training rows and scored on the validation rows.
rfecv_selector = RFECV(
    estimator=base_estimator,
    step=1,
    cv=ps, # Uses the PredefinedSplit (X_val) for scoring
    scoring=neg_mse_scorer
)
rfecv_selector.fit(X_combined, y_combined)

# Number of features evaluated at each elimination step (x-axis of the plot)
feature_counts_range = np.asarray(rfecv_selector.cv_results_['n_features'])

# 2. Training RMSE for each subset size.
# `rfecv_selector.ranking_` cannot be used to rebuild the subsets: every selected feature has
# rank 1 there, so `ranking_ <= n` does not select n features. A dedicated RFE run down to a
# single feature gives the ranks 1..p, for which `ranking <= n` selects exactly n features.
# NOTE(review): it is fitted on X_train with the same estimator settings, so it presumably follows
# the same elimination order as the training fold used by RFECV - confirm.
full_rfe = RFE(estimator=base_estimator, n_features_to_select=1, step=1)
full_rfe.fit(X_train, y_train)
feature_ranking = full_rfe.ranking_

train_rmse_list = []
for n in feature_counts_range:
    # Features that are still present when n features remain
    selected_features_mask = feature_ranking <= n
    X_train_subset = X_train.iloc[:, selected_features_mask]

    # Fit and evaluate on the training rows, using only this subset of features
    base_estimator.fit(X_train_subset, y_train)
    train_pred = base_estimator.predict(X_train_subset)
    train_mse = mean_squared_error(y_train, train_pred)
    train_rmse_list.append(np.sqrt(train_mse))


# 3. Prepare plotting data and output
optimal_nof = rfecv_selector.n_features_
validation_scores_neg_mse = rfecv_selector.cv_results_['mean_test_score']
# Convert the negated validation MSE into a positive RMSE
validation_rmse = np.sqrt(-validation_scores_neg_mse)
# Locate the optimum through the evaluated feature counts instead of assuming index == count - 1
optimal_idx = int(np.flatnonzero(feature_counts_range == optimal_nof)[0])
min_rmse = validation_rmse[optimal_idx]

# Variables for plotting
nof_list = feature_counts_range
val_rmse_list = validation_rmse
nof = optimal_nof
low_rmse = min_rmse

print("\n--- RFECV Results (Using Random Forest) ---")
print(f"Optimal number of features found: {nof}")
print(f"Minimum Validation RMSE achieved: {low_rmse:.4f}")

In [None]:
# Training vs. validation RMSE for every number of features evaluated by RFECV

plt.figure(figsize=(10, 6))

rmse_curves = (
    (train_rmse_list, "RMSE on Training Set", 'yellowgreen'),
    (val_rmse_list, "RMSE on Validation Set", 'dimgray'),
)
for rmse_values, curve_label, curve_color in rmse_curves:
    plt.plot(nof_list, rmse_values, label=curve_label, color=curve_color, linewidth=2)

# Red marker on the optimal number of features
optimal_label = f'Optimal Features: {nof} (RMSE: {low_rmse:.4f})'
plt.plot(nof, low_rmse, 'ro', label=optimal_label, markersize=8)

plt.xlabel("Number of Features Selected")
plt.ylabel("RMSE (Root Mean Squared Error)")
plt.title("RFECV Performance: Training vs. Validation Error")
plt.legend(loc='upper right')
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()

In [None]:
# Boolean mask of the features retained by RFECV (True for selected, False otherwise);
# rfecv_selector.support_ corresponds to the optimal number of features
selected_feature_mask = rfecv_selector.support_

# Stored under its own name instead of `forward_features`, which is the name used for the
# result of the forward sequential selector
rfecv_features = X_train.columns[selected_feature_mask]

print("Selected features using RFECV ({} features):".format(optimal_nof))
print(list(rfecv_features))

According to *RFECV* with *Random Forest* as estimator, the optimal number of features to keep would be 10, as this combination of features reveals the lowest validation RMSE. The minimum score was 3043.1847, indicating that the model's typical prediction error is about £3,043.
Therefore, according to this method, the variables to be discarded are **previousOwners** and **hasDamage**.

##### 4.10.2.2. Sequential Forward Feature Selector

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector

# Forward selection: features are added one at a time; with n_features_to_select="auto" and a
# tolerance, the search stops when the score no longer improves by at least `tol`.
sfs_forward = SequentialFeatureSelector(
    base_estimator,
    n_features_to_select="auto",
    tol=1e-3,
    direction="forward",
    scoring="neg_mean_squared_error",
    cv=ps,            # single predefined train/validation split
    n_jobs=-1,
)
sfs_forward.fit(X_combined, y_combined)

# Names of the selected features
forward_mask = sfs_forward.get_support()
forward_features = X_train.columns[forward_mask]

print("Forward Selected Features:")
print(list(forward_features))

# Refit the estimator on the selected features only
X_train_forward = X_train[forward_features]
X_val_forward = X_val[forward_features]
base_estimator.fit(X_train_forward, y_train)

# RMSE on the validation set
y_val_pred_forward = base_estimator.predict(X_val_forward)
val_mse_forward = mean_squared_error(y_val, y_val_pred_forward)
val_rmse_forward = np.sqrt(val_mse_forward)

# RMSE on the training set
y_train_pred_forward = base_estimator.predict(X_train_forward)
train_mse_forward = mean_squared_error(y_train, y_train_pred_forward)
train_rmse_forward = np.sqrt(train_mse_forward)

print("Training RMSE (Forward SFS):", train_rmse_forward)
print("Validation RMSE (Forward SFS):", val_rmse_forward)

In contrast with *RFECV*, *Sequential Forward Feature Selection* indicated that the optimal number of features would be only 5. 
Therefore, according to this method, the features that should be removed are **previousOwners**, **fuelType_encoded**, **hasDamaged**, **transmission_encoded**, **tax**, **mileage** and **engineSize**.

##### 4.10.2.3. Sequential Backward Feature Selection

In [None]:
# Backward elimination: features are removed one at a time.
# Apart from direction="backward", the settings and the split (`ps`) match
# the forward selector.
sfs_backward = SequentialFeatureSelector(
    base_estimator,
    direction="backward",
    n_features_to_select="auto",
    tol=1e-3,
    scoring="neg_mean_squared_error",
    cv=ps,
    n_jobs=-1,
).fit(X_combined, y_combined)

# Boolean support mask -> names of the retained columns.
backward_mask = sfs_backward.get_support()
backward_features = X_train.columns[backward_mask]

print("\nBackward Selected Features:")
print(list(backward_features))

# Refit the base estimator on the training split, restricted to the selected
# columns, then score it on both the training and the validation split.
base_estimator.fit(X_train[backward_features], y_train)

y_train_pred_backward = base_estimator.predict(X_train[backward_features])
y_val_pred_backward = base_estimator.predict(X_val[backward_features])

train_rmse_backward = np.sqrt(mean_squared_error(y_train, y_train_pred_backward))
val_rmse_backward = np.sqrt(mean_squared_error(y_val, y_val_pred_backward))

print("Training RMSE (Backward SFS):", train_rmse_backward)
print("Validation RMSE (Backward SFS):", val_rmse_backward)


#### 4.10.3. Embedded Methods 
As a way of complementing the filter and wrapper methods, we have decided to also analyse some embedded methods, since they learn which features contribute most to the accuracy of the model while the model is being created. 

##### 4.10.3.1. Random Forest Feature Importance 

In [None]:
# Fit a 500-tree random forest on the training split; the seed is fixed so
# the importances are reproducible.
rf = RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=42)
rf.fit(X_train, y_train)

# One row per feature, sorted from most to least important.
rf_df = pd.DataFrame(
    {"feature": X_train.columns, "rf_importance": rf.feature_importances_}
).sort_values(by="rf_importance", ascending=False)

print("\n===== Random Forest Feature Importance =====\n")
print(rf_df)


Unlike wrapper methods, *Random Forest Feature Importance* (*RFFI*) does not directly give us a set of optimal features; instead, it informs us how much each feature contributes to improving the model’s predictions by reducing the variance of the target variable (price). When analysing the output of *RFFI*, it is important to note that importance values are relative: the importances of all features sum to 1.0. The output indicates that the features that contribute the most to the model’s predictive power are *transmission_encoded*, *engineEfficiency* and *year*, with 27%, 21.9% and 18.9%, respectively. According to this method, these are the features most frequently used to split nodes across many trees and the ones the model relies on most heavily to make predictions. On the other hand, *hasDamaged* and *previousOwners* are used very little (or not at all) in the trees, which means they do not reduce the variance of price significantly. To decide which features to keep or discard based on this method, we used a threshold of 0.01, since we consider that any feature whose importance is below this value does not add enough information to the model. Thus, based on this method, every feature should be kept except for **previousOwners**, **hasDamaged**, **fuelType_encoded** and **tax**.

### Final Insights
Based on all the methods used and their individual conclusions, we have built a table where, for each feature, we record whether each method suggests keeping or discarding it. Our decision rule is based on how many methods stated "discard" for a feature. Features with exactly 2 "discard" verdicts are tested both with and without in the model, while those with more than 2 are automatically discarded. Features with none or only one "discard" are automatically included in the model. Moreover, considering that *RFECV* and *SFFS* are methods that provide an optimal number of features based on a specific set of variables, we have decided to also test the model with these groups of features, independently of the remaining methods.  

| Predictor            | Variance | Spearman Correlation| Mutual Information | RFECV (Random Forest) | SFFS  | SBFS  | RFFI | Decision              |
|----------------------|----------|---------------|---------------------|----------|------------------------|-------------|--------------|------------------------|
| *year*                 | Keep     | Keep           | Keep           | Keep     | Keep               | Keep        | Keep         | Include in the model   |
| *mileage*              | Keep     | Keep           | Keep           | Keep     | Keep               | Keep        | Keep         | Include in the model   |
| *tax*                  | Keep     | Keep           | Discard        | Keep     | Discard            | Discard     | Discard      | Discard   |
| *mpg*                  | Keep     | Keep           | Keep           | Keep     | Keep               | Keep        | Keep         | Include in the model   |
| *engineSize*           | Keep     | Keep           | Keep           | Keep     | Keep               | Keep        | Keep         | Include in the model   |
| *engineEfficiency*     | Keep     | Keep           | Keep           | Keep     | Keep               | Keep        | Keep         | Include in the model   |
| *brand_encoded*        | Keep     | Keep           | Keep           | Keep     | Keep               | Keep        | Keep         | Include in the model   |
| *model_encoded*        | Keep     | Keep           | Keep           | Keep     | Keep               | Keep        | Keep         | Include in the model   |
| *transmission_encoded* | Keep     | Keep           | Discard        | Keep     | Keep               | Keep        | Keep         | Try with and without |
| *fuelType_encoded*     | Keep     | Keep           | Discard        | Keep     | Keep               | Keep        | Discard      | Discard    |
| *previousOwners*       | Keep     | Discard        | Discard        | Discard  | Keep               | Discard     | Discard      | Discard    |
| *hasDamage*            | Discard  | Discard        | Discard        | Discard  | Discard            | Keep        | Discard      | Discard    |


Based on our strategy, we will test the model with the following sets of features: 
- *year*, *mileage*, *mpg*, *engineSize*, *engineEfficiency*, *brand_encoded*, *model_encoded*, *transmission_encoded*, *tax*.
- *year*, *mileage*, *mpg*, *engineSize*, *engineEfficiency*, *brand_encoded*, *model_encoded*, *transmission_encoded*
- *year*, *mpg*, *engineEfficiency*, *brand_encoded*, *model_encoded*.
