# Car Sale Price Predictor

# Imports! Imports EVERYWHERE!

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import make_pipeline
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# I. Wrangle Data 
- Here I will define a function named `wrangle` to ensure reproducibility, and to streamline the data cleaning process.

In [6]:
# Wrangle function
def wrangle(filepath):
    # read in dataframe
    X = pd.read_csv(filepath)
    # drop unnecessary columns
    X.drop(columns='Market Category', axis=1, inplace=True)
    # drop columns with '0' sale_price
    X.drop(X[X['MSRP']==0].index, inplace=True)
    # remove outliers
    X.drop(X[X["MSRP"] >= 500000].index, inplace=True)
    # clean column names
    X.rename(columns={"Make": "make",
                      "Model": "model",
                      "Year": "year",
                      "Engine Fuel Type": "fuel_type",
                      "Engine HP": "horsepower",
                      "Engine Cylinders": "cylinders",
                      "Transmission_Type": "transmission",
                      "Driven_Wheels": "drive_type",
                      "Number of Doors": "doors",
                      "Vehicle Size": "size",
                      "Vehicle Style": "body_style",
                      "highway MPG": "mpg_h",
                      "city mpg": "mpg_c",
                      "Popularity": "popularity",
                      "MSRP": "sale_price"}, inplace=True)
    # create feature for age of car
    X['vehicle_age'] = 2017 - X['year']
    # Drop observations with `NaN` values
    X = X.dropna()
    # Type casting `strings` to `category`
    cat_col = [col for col in X.select_dtypes('object').columns]
    X[cat_col] = X[cat_col].astype('category')
    return X
# setting filepath to dataframe
filepath = 'CSV/cars.csv'
# apply wrangle function to dataset
cars = wrangle(filepath)


In [7]:
# confirm functionality of wrangle function
cars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11801 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   make               11801 non-null  category
 1   model              11801 non-null  category
 2   year               11801 non-null  int64   
 3   fuel_type          11801 non-null  category
 4   horsepower         11801 non-null  float64 
 5   cylinders          11801 non-null  float64 
 6   Transmission Type  11801 non-null  category
 7   drive_type         11801 non-null  category
 8   doors              11801 non-null  float64 
 9   size               11801 non-null  category
 10  body_style         11801 non-null  category
 11  mpg_h              11801 non-null  int64   
 12  mpg_c              11801 non-null  int64   
 13  popularity         11801 non-null  int64   
 14  sale_price         11801 non-null  int64   
 15  vehicle_age        11801 non-null  int64   
dtypes: c

# II. Split Data
- Here I will separate my feature matrix from my target vector

In [9]:
# Separating `Feature Matrix` from `Target Vector` 
target = 'sale_price'
y = cars[target]
X = cars.drop(columns=target)
# Verifying shape `Feature Matrix` and `Target Vector`
print(f'Feature Matrix Shape: {X.shape}')
print(f'Target Vector Shape: {y.shape}')
# Applying training/validation split to `FM` and `TV`
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.2,
                                                  random_state=42)

Feature Matrix Shape: (11801, 15)
Target Vector Shape: (11801,)


# III. Establish Baseline
- Being that `sale_price` is my target I will use the mean `sale_price` to establish my baseline.

In [13]:
# Establish baseline 
mean_sale_price = y_train.mean()
y_pred = [mean_sale_price] * len(y_train)
# baseline mae for just guessing
baseline_mae = mean_absolute_error(y_train, y_pred)
print(f'Baseline MAE: {baseline_mae}')

Baseline MAE: 24495.238887285443


#  IV. Build Model
- Here I'm building a "quick-and-dirty" model to see if I can beat my baseline.

In [21]:
# basic linear regressor to test against baseline
lin_reg = make_pipeline(OneHotEncoder(use_cat_names=True),
                        SimpleImputer(),
                        StandardScaler(),
                        LinearRegression())
# fit model with training data
lin_reg.fit(X_train, y_train)


  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('onehotencoder',
                 OneHotEncoder(cols=['make', 'model', 'fuel_type',
                                     'Transmission Type', 'drive_type', 'size',
                                     'body_style'],
                               use_cat_names=True)),
                ('simpleimputer', SimpleImputer()),
                ('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])

# Score Time!

In [26]:
# Mean absolute error for training and validation sets
train_MAE = mean_absolute_error(y_train, lin_reg.predict(X_train))
val_MAE = mean_absolute_error(y_val, lin_reg.predict(X_val))
# return MAE
print(f'Linear Regression Training MAE: {train_MAE}')
print(f'Linear Regression Validation MAE: {val_MAE}')

Linear Regression Training MAE: 3463.7721635190346
Linear Regression Validation MAE: 2282309672614174.5


# Ridge Regression Model
- `Ridge()` Regression model to test against baseline MAE.

In [24]:
# pipeline for ridge regressor
ridge_mod = make_pipeline(
                    OneHotEncoder(use_cat_names=True),
                    StandardScaler(),
                    SimpleImputer(),
                    Ridge())

# fit ridge regressor with training data
ridge_mod.fit(X_train, y_train)

  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('onehotencoder',
                 OneHotEncoder(cols=['make', 'model', 'fuel_type',
                                     'Transmission Type', 'drive_type', 'size',
                                     'body_style'],
                               use_cat_names=True)),
                ('standardscaler', StandardScaler()),
                ('simpleimputer', SimpleImputer()), ('ridge', Ridge())])

In [27]:
# Mean absolute error for training and validation sets
ridge_train_MAE = mean_absolute_error(y_train, ridge_mod.predict(X_train))
ridge_val_MAE = mean_absolute_error(y_val, ridge_mod.predict(X_val))
# return MAE
print(f'Ridge Regression Training MAE: {ridge_train_MAE}')
print(f'Ridge Regression Validation MAE: {ridge_val_MAE}')

Ridge Regression Training MAE: 3253.240066091798
Ridge Regression Validation MAE: 3695.326447829763


# Random Forest Regression Model

In [31]:
# Random Forest Regressor pipeline
rfr = make_pipeline(
            OrdinalEncoder(),
            SimpleImputer(),
            RandomForestRegressor())

rfr.fit(X_train, y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['make', 'model', 'fuel_type',
                                      'Transmission Type', 'drive_type', 'size',
                                      'body_style'],
                                mapping=[{'col': 'make',
                                          'data_type': CategoricalDtype(categories=['Acura', 'Alfa Romeo', 'Aston Martin', 'Audi', 'BMW',
                  'Bentley', 'Buick', 'Cadillac', 'Chevrolet', 'Chrysler',
                  'Dodge', 'FIAT', 'Ferrari', 'Ford', 'GMC', 'Genesis',
                  'HUM...
                  'Regular Cab Pickup', 'Sedan', 'Wagon'],
, ordered=False),
                                          'mapping': Crew Cab Pickup         1
2dr Hatchback           2
Sedan                   3
4dr SUV                 4
Passenger Minivan       5
Extended Cab Pickup     6
Cargo Van               7
Coupe                   8
Regular Cab Pickup      9
2dr SUV                10
Wagon

In [32]:
print(mean_absolute_error(y_train, rfr.predict(X_train)))

1760.3715712957514


In [33]:
print(mean_absolute_error(y_val, rfr.predict(X_val)))

3050.652533506656


# Gradient Boosting Regression Model

In [34]:
xgbr = make_pipeline(
            OrdinalEncoder(),
            SimpleImputer(),
            GradientBoostingRegressor())

xgbr.fit(X_train, y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['make', 'model', 'fuel_type',
                                      'Transmission Type', 'drive_type', 'size',
                                      'body_style'],
                                mapping=[{'col': 'make',
                                          'data_type': CategoricalDtype(categories=['Acura', 'Alfa Romeo', 'Aston Martin', 'Audi', 'BMW',
                  'Bentley', 'Buick', 'Cadillac', 'Chevrolet', 'Chrysler',
                  'Dodge', 'FIAT', 'Ferrari', 'Ford', 'GMC', 'Genesis',
                  'HUM...
                  'Regular Cab Pickup', 'Sedan', 'Wagon'],
, ordered=False),
                                          'mapping': Crew Cab Pickup         1
2dr Hatchback           2
Sedan                   3
4dr SUV                 4
Passenger Minivan       5
Extended Cab Pickup     6
Cargo Van               7
Coupe                   8
Regular Cab Pickup      9
2dr SUV                10
Wagon

In [36]:
print(mean_absolute_error(y_train, xgbr.predict(X_train)))
print(mean_absolute_error(y_val, xgbr.predict(X_val)))

5295.253146518181
5291.792732270664
