# Car Sale Price Predictor

# Imports! Imports EVERYWHERE!

In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import make_pipeline
from category_encoders import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# I. Wrangle Data 
- Here I will define a function named `wrangle` to ensure reproducibility, and to streamline the data cleaning process.

In [71]:
# Wrangle function
def wrangle(filepath):
    # read in dataframe
    X = pd.read_csv(filepath)
    # drop unnecessary columns
    X.drop(columns='Market Category', axis=1, inplace=True)
    # drop columns with '0' sale_price
    X.drop(X[X['MSRP']==0].index, inplace=True)
    # remove outliers
    X.drop(X[X["MSRP"] >= 500000].index, inplace=True)
    # clean column names
    X.rename(columns={"Make": "make",
                      "Model": "model",
                      "Year": "year",
                      "Engine Fuel Type": "fuel_type",
                      "Engine HP": "horsepower",
                      "Engine Cylinders": "cylinders",
                      "Transmission_Type": "transmission",
                      "Driven_Wheels": "drive_type",
                      "Number of Doors": "doors",
                      "Vehicle Size": "size",
                      "Vehicle Style": "body_style",
                      "highway MPG": "mpg_h",
                      "city mpg": "mpg_c",
                      "Popularity": "popularity",
                      "MSRP": "sale_price"}, inplace=True)
    # create feature for age of car
    X['vehicle_age'] = 2017 - X['year']
    # Drop observations with `NaN` values
    X = X.dropna()
    # Type casting `strings` to `category`
    cat_col = [col for col in X.select_dtypes('object').columns]
    X[cat_col] = X[cat_col].astype('category')
    return X
# setting filepath to dataframe
filepath = '../data/data.csv' 
# apply wrangle function to dataset
cars = wrangle(filepath)


In [72]:
# confirm functionality of wrangle function
cars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11801 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   make               11801 non-null  category
 1   model              11801 non-null  category
 2   year               11801 non-null  int64   
 3   fuel_type          11801 non-null  category
 4   horsepower         11801 non-null  float64 
 5   cylinders          11801 non-null  float64 
 6   Transmission Type  11801 non-null  category
 7   drive_type         11801 non-null  category
 8   doors              11801 non-null  float64 
 9   size               11801 non-null  category
 10  body_style         11801 non-null  category
 11  mpg_h              11801 non-null  int64   
 12  mpg_c              11801 non-null  int64   
 13  popularity         11801 non-null  int64   
 14  sale_price         11801 non-null  int64   
 15  vehicle_age        11801 non-null  int64   
dtypes: c

# II. Split Data
- Here I will separate my feature matrix from my target vector

In [73]:
# Separating `Feature Matrix` from `Target Vector` 
target = 'sale_price'
y = cars[target]
X = cars.drop(columns=target)
# Verifying shape `Feature Matrix` and `Target Vector`
print(f'Feature Matrix Shape: {X.shape}')
print(f'Target Vector Shape: {y.shape}')
# Applying training/validation split to `FM` and `TV`
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.2,
                                                  random_state=42)

Feature Matrix Shape: (11801, 15)
Target Vector Shape: (11801,)


# III. Establish Baseline
- Being that `sale_price` is my target I will use the mean `sale_price` to establish my baseline.

In [93]:
# Establish baseline 
y_mean = y.mean()
y_pred = [y_mean] * len(y)
y_pred

# baseline mae for just guessing
baseline_r2 = r2_score(y, y_pred)
print(f'Baseline r2: {baseline_r2}')

Baseline r2: 0.0


#  IV. Build Model
- Here I'm building a "quick-and-dirty" model to see if I can beat my baseline.

In [81]:
# basic linear regressor to test against baseline
model_r = make_pipeline(OneHotEncoder(use_cat_names=True),
                        SimpleImputer(),
                        StandardScaler(),
                        Ridge())
# fit model with training data
model_r.fit(X_train, y_train)


  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('onehotencoder',
                 OneHotEncoder(cols=['make', 'model', 'fuel_type',
                                     'Transmission Type', 'drive_type', 'size',
                                     'body_style'],
                               use_cat_names=True)),
                ('simpleimputer', SimpleImputer()),
                ('standardscaler', StandardScaler()), ('ridge', Ridge())])

# Score Time!

In [95]:
# Mean Absolute Error
model_r2 = r2_score()

ValueError: Found input variables with inconsistent numbers of samples: [2361, 11801]