In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Location of the raw CSV; kept in a named variable so the source is easy to spot and change.
car_sales_url = "https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/car-sales-extended-missing-data.csv"

# Read the remote CSV into a DataFrame and preview the first rows.
car_sales = pd.read_csv(car_sales_url)
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [3]:
# Summarise the frame: number of rows, non-null count per column, and column dtypes.
car_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           951 non-null    object 
 1   Colour         950 non-null    object 
 2   Odometer (KM)  950 non-null    float64
 3   Doors          950 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 39.2+ KB


Each column has only about 950 non-null values out of 1,000 rows, so roughly 5% of the values in every column are missing.

In [4]:
# Count the missing (NaN) entries in every column
car_sales.isnull().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [5]:
# Inspect the dtype of each column to see which features are non-numeric
# (object dtype) and will need encoding before modelling.
car_sales.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

The dataset contains non-numerical features (`Make` and `Colour`), so we need to convert them to numbers before fitting a model.

### 1. Cleaning the Dataset

In [6]:
# Remove rows whose target value (Price) is NaN: a supervised model cannot be
# trained or scored on rows without a label.
# Reassign the result instead of using inplace=True, so the transformation is
# an explicit expression (inplace offers no benefit and prevents chaining).
car_sales = car_sales.dropna(subset=["Price"])

# Sanity check: every remaining row must have a label.
assert car_sales["Price"].notna().all(), "Price still contains missing values"


### 2. Building a Pipeline

Our `car_sales` data has missing numerical values and is not entirely numerical, so we'll have to fix both issues before we can fit a machine learning model on it.
Because we're transforming columns of the dataframe (filling missing values and converting non-numerical data to numbers), we'll need the `ColumnTransformer`, `SimpleImputer` and `OneHotEncoder` classes as well.

In [7]:
from sklearn.pipeline import Pipeline # chains all preprocessing steps together in the correct order
from sklearn.impute import SimpleImputer   #for handling missing data 
from sklearn.preprocessing import OneHotEncoder #converts categorical features into numerical format
from sklearn.compose import ColumnTransformer # applies different preprocessing steps to different columns
from sklearn.model_selection import train_test_split # splits data into training and test sets

In [8]:
# --- Categorical columns -------------------------------------------------
# Text-valued features that must be turned into numbers.
categorical_features = ["Make", "Colour"]

# Two-stage pipeline: first replace NaN with the literal category "missing",
# then one-hot encode every category.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    # handle_unknown="ignore": a category not seen during fit does not raise
    # at transform time.
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# --- Doors column --------------------------------------------------------
# Doors is numeric but gets its own rule: missing values become 4.
# NOTE(review): this assumes most cars in the data have 4 doors -- confirm
# with car_sales["Doors"].value_counts().
door_feature = ["Doors"]

door_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=4)),
])

In [9]:
# Continuous numeric columns handled by mean imputation.
numerical_features = ["Odometer (KM)"]

# Single-step pipeline: replace NaN with the column mean learned during fit.
numerical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
])

Our dataset is heterogeneous. Numbers behave differently from categories, and some columns (like `Doors`) need special handling. Instead of forcing one preprocessing rule on everything, we split the columns by type and treat each group appropriately.

In [10]:
# Combine the three per-column-group transformers into one preprocessing step.
# Each entry is (name, transformer, columns the transformer is applied to).
preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features),
    ("door", door_transformer, door_feature),
])

### Import some regression models to try out.

In [11]:
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

 We'll create a dictionary of regression models and an empty dictionary for regression model results.

In [12]:
# Candidate regressors, keyed by a display name: {"model_name": model_instance}.
# There are 4 entries in total.
# RandomForestRegressor is stochastic (bootstrap sampling / feature selection),
# so it is seeded to make its score reproducible across Restart & Run All.
regression_models = {"Ridge": Ridge(),
                     "SVR_linear": SVR(kernel="linear"),
                     "SVR_rbf": SVR(kernel="rbf"),
                     "RandomForestRegressor": RandomForestRegressor(random_state=42)}

# Filled by the fit/score loop: {"model_name": test-set score}
regression_results = {}

Now we split the data into `X` (feature variables) and `y` (target variable), and then into training and test sets.

In [13]:
# Features: every column of car_sales except the target
car_sales_X = car_sales.drop(columns=["Price"])

# Target: the Price column
car_sales_y = car_sales["Price"]

In [14]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
(car_sales_X_train,
 car_sales_X_test,
 car_sales_y_train,
 car_sales_y_test) = train_test_split(car_sales_X, car_sales_y,
                                      test_size=0.2, random_state=42)

# Show the shape of each of the four pieces (X train, X test, y train, y test)
tuple(
    part.shape
    for part in (car_sales_X_train, car_sales_X_test,
                 car_sales_y_train, car_sales_y_test)
)

((760, 4), (190, 4), (760,), (190,))

In [15]:
# Train and evaluate every candidate model behind the same preprocessing step.
for model_name, model in regression_models.items():

    # Preprocessing and estimator chained so both are fitted on training data only
    model_pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("model", model),
    ])

    # Fit on the training split
    print(f"Fitting {model_name}...")
    model_pipeline.fit(car_sales_X_train, car_sales_y_train)

    # Score on the held-out split and record the result under the model's name
    print(f"Scoring {model_name}...")
    test_score = model_pipeline.score(car_sales_X_test, car_sales_y_test)
    regression_results[model_name] = test_score

Fitting Ridge...
Scoring Ridge...
Fitting SVR_linear...
Scoring SVR_linear...
Fitting SVR_rbf...
Scoring SVR_rbf...
Fitting RandomForestRegressor...
Scoring RandomForestRegressor...


In [17]:
# Display the test-set score recorded for each model in the loop above
# (keys are the model names from regression_models).
regression_results

{'Ridge': 0.2538755487831501,
 'SVR_linear': 0.08555518343236379,
 'SVR_rbf': 0.0018786027505235392,
 'RandomForestRegressor': 0.22426672362104805}

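Ridge (R² ≈ 0.25) performs best, which suggests that the signal present in the features is mostly linear.

Random Forest (R² ≈ 0.22) is close behind, so there appears to be little additional non-linear structure to exploit.

Both SVR models perform poorly, especially the RBF kernel (R² ≈ 0.00). Note that SVR is sensitive to feature scale and the pipeline above applies no scaling step, so these scores may understate what SVR can achieve.

The low scores overall suggest that the available features, rather than the choice of model, are the main limitation.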

In [18]:
# Import mean_absolute_error from sklearn's metrics module
from sklearn.metrics import mean_absolute_error

# Import mean_squared_error from sklearn's metrics module
from sklearn.metrics import mean_squared_error

# Import r2_score from sklearn's metrics module
from sklearn.metrics import r2_score

All the evaluation metrics we're concerned with compare a model's predictions with the ground truth labels. Knowing this, we'll have to make some predictions.

Let's create a `Pipeline` with the `preprocessor` and a `Ridge()` model, fit it on the car sales training data and then make predictions on the car sales test data.

In [19]:
# Build a Ridge pipeline that reuses the preprocessor defined earlier (it
# handles the missing values and the categorical columns), and fit it on the
# training split. fit() returns the pipeline itself, so it can be chained.
ridge_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", Ridge()),
]).fit(car_sales_X_train, car_sales_y_train)

# Predict prices for the held-out test rows
car_sales_y_preds = ridge_pipeline.predict(car_sales_X_test)

# Preview the first 50 predictions
car_sales_y_preds[:50]

array([18515.50909062, 22204.2553879 , 11047.53630357,  6894.27134579,
        8795.32558662, 10929.54001986, 15268.7188806 , 13837.53187432,
       20208.38496415, 14377.94893894,  6218.07814158, 16543.03154019,
       11786.09157355, 13495.17509856, 14323.06047775, 16426.90461919,
       16001.94613096,  9927.08626267, 11578.87993636, 11582.10276812,
       10649.47261105, 13080.09250477, 17876.05869452, 23451.49602434,
       11798.26034157, 14482.70339352, 18431.83189003, 14682.66277883,
       20586.50324122, 19954.3828007 , 18164.00805254, 22367.20224046,
       12438.38759302, 14230.11057818, 18351.80022107, 19564.66460544,
       12215.36793479, 12481.1975504 , 18678.04204569, 11287.39004894,
       15279.49440485, 17380.59749049, 19248.65523566, 17345.43079543,
       15013.29011173, 12726.87553313, 12392.658628  ,  8477.08843716,
       15257.44206824, 18603.07734634])

Nice! Now we've got some predictions, time to evaluate them. We'll find the mean squared error (MSE), mean absolute error (MAE) and R^2 score (coefficient of determination) of our model.

In [20]:
# Mean squared error between the true test prices and the Ridge predictions
mse = mean_squared_error(y_true=car_sales_y_test, y_pred=car_sales_y_preds)
mse

49960264.20503399

In [21]:
# Mean absolute error between the true test prices and the Ridge predictions
mae = mean_absolute_error(y_true=car_sales_y_test, y_pred=car_sales_y_preds)
mae

5714.468592363157

In [22]:
# Coefficient of determination (R^2) of the Ridge predictions on the test split
r2 = r2_score(y_true=car_sales_y_test, y_pred=car_sales_y_preds)
r2

0.2538755487831501