In [1]:
# Necessary imports
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import glob
import re

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
%matplotlib inline

print('Libraries imported')

Libraries imported


In [2]:
path = r'C:\Metis_Bootcamp\Regression_project\data\joined-data' # data path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    data = pd.read_csv(filename, index_col=0, header=0)
    li.append(data)

df = pd.concat(li, axis=0, ignore_index=True)

df.head()

Unnamed: 0,Name,Mileage,Address,Rating,Fuel Type,City MPG,Highway MPG,Drivetrain,Engine,Exterior Color,Interior Color,Transmission,Entertainment,Safety,Price
0,2012 Honda Civic EX,151000,"Tacoma, WA 98409",3.0,Gasoline,28,39,FWD,1.8L I4 16V MPFI SOHC,Gray,Gray,5-Speed Automatic,['Bluetooth'],"['Brake Assist', 'Stability Control']","$6,900"
1,2016 Toyota Prius Two,76956,"Tacoma, WA 98409",4.8,Hybrid,54,50,FWD,1.8L I4 16V MPFI DOHC Hybrid,Magnetic Gray Metallic,-1,Automatic CVT,['Bluetooth'],"['Backup Camera', 'Brake Assist', 'Stability C...","$15,000"
2,2020 Subaru Legacy Base,7633,"Tacoma, WA 98409",4.2,Gasoline,27,35,AWD,2.5L H4 16V GDI DOHC,Crystal White Pearl,Gray,Automatic CVT,"['Bluetooth', 'Apple CarPlay/Android Auto']","['Backup Camera', 'Brake Assist', 'Stability C...","$23,598"
3,2020 Mazda CX-5 Touring,27188,"Tacoma, WA 98409",4.5,Gasoline,25,31,FWD,2.5L I4 16V GDI DOHC,Sonic Silver Metallic,Black,6-Speed Automatic,"['Bluetooth', 'Apple CarPlay/Android Auto']","['Backup Camera', 'Brake Assist', 'Stability C...","$25,900"
4,2020 Hyundai Santa Fe SEL 2.4,20404,"Tacoma, WA 98409",4.2,Gasoline,21,27,AWD,2.4L I4 16V GDI DOHC,Symphony Silver,Black,8-Speed Automatic,"['Bluetooth', 'Apple CarPlay/Android Auto']","['Backup Camera', 'Brake Assist', 'Stability C...","$26,996"


In [3]:
df.shape

(10697, 15)

In [4]:
#Strip 'Certified' from the beginning and extract the year from the name

df['Name'] = df['Name'].apply(lambda x: x.replace('Certified ','').strip())
df['Model Year'] = df['Name'].apply(lambda x: x.split(' ')[0]).astype(int)
df['Model Year']

0        2012
1        2016
2        2020
3        2020
4        2020
         ... 
10692    2016
10693    2001
10694    2016
10695    2014
10696    2017
Name: Model Year, Length: 10697, dtype: int32

In [5]:
#Assign categorical labels for model year
for i in range(len(df)):
    if df['Model Year'][i] >= 2012:
        df['Model Year'][i] = 'New'
    elif df['Model Year'][i] < 2012 and df['Model Year'][i] >= 2000:
        df['Model Year'][i] = 'Moderate'
    else:
        df['Model Year'][i] = 'Old'

In [6]:
df.head()

Unnamed: 0,Name,Mileage,Address,Rating,Fuel Type,City MPG,Highway MPG,Drivetrain,Engine,Exterior Color,Interior Color,Transmission,Entertainment,Safety,Price,Model Year
0,2012 Honda Civic EX,151000,"Tacoma, WA 98409",3.0,Gasoline,28,39,FWD,1.8L I4 16V MPFI SOHC,Gray,Gray,5-Speed Automatic,['Bluetooth'],"['Brake Assist', 'Stability Control']","$6,900",New
1,2016 Toyota Prius Two,76956,"Tacoma, WA 98409",4.8,Hybrid,54,50,FWD,1.8L I4 16V MPFI DOHC Hybrid,Magnetic Gray Metallic,-1,Automatic CVT,['Bluetooth'],"['Backup Camera', 'Brake Assist', 'Stability C...","$15,000",New
2,2020 Subaru Legacy Base,7633,"Tacoma, WA 98409",4.2,Gasoline,27,35,AWD,2.5L H4 16V GDI DOHC,Crystal White Pearl,Gray,Automatic CVT,"['Bluetooth', 'Apple CarPlay/Android Auto']","['Backup Camera', 'Brake Assist', 'Stability C...","$23,598",New
3,2020 Mazda CX-5 Touring,27188,"Tacoma, WA 98409",4.5,Gasoline,25,31,FWD,2.5L I4 16V GDI DOHC,Sonic Silver Metallic,Black,6-Speed Automatic,"['Bluetooth', 'Apple CarPlay/Android Auto']","['Backup Camera', 'Brake Assist', 'Stability C...","$25,900",New
4,2020 Hyundai Santa Fe SEL 2.4,20404,"Tacoma, WA 98409",4.2,Gasoline,21,27,AWD,2.4L I4 16V GDI DOHC,Symphony Silver,Black,8-Speed Automatic,"['Bluetooth', 'Apple CarPlay/Android Auto']","['Backup Camera', 'Brake Assist', 'Stability C...","$26,996",New


In [7]:
#Converse mileage and price to int

df['Mileage'] = df['Mileage'].apply(lambda x: x.replace(',','')).astype(int)

df['Price'] = df['Price'].apply(lambda x: x.replace('$','')).apply(lambda x: x.replace(',','')).astype(int)

df.head()

Unnamed: 0,Name,Mileage,Address,Rating,Fuel Type,City MPG,Highway MPG,Drivetrain,Engine,Exterior Color,Interior Color,Transmission,Entertainment,Safety,Price,Model Year
0,2012 Honda Civic EX,151000,"Tacoma, WA 98409",3.0,Gasoline,28,39,FWD,1.8L I4 16V MPFI SOHC,Gray,Gray,5-Speed Automatic,['Bluetooth'],"['Brake Assist', 'Stability Control']",6900,New
1,2016 Toyota Prius Two,76956,"Tacoma, WA 98409",4.8,Hybrid,54,50,FWD,1.8L I4 16V MPFI DOHC Hybrid,Magnetic Gray Metallic,-1,Automatic CVT,['Bluetooth'],"['Backup Camera', 'Brake Assist', 'Stability C...",15000,New
2,2020 Subaru Legacy Base,7633,"Tacoma, WA 98409",4.2,Gasoline,27,35,AWD,2.5L H4 16V GDI DOHC,Crystal White Pearl,Gray,Automatic CVT,"['Bluetooth', 'Apple CarPlay/Android Auto']","['Backup Camera', 'Brake Assist', 'Stability C...",23598,New
3,2020 Mazda CX-5 Touring,27188,"Tacoma, WA 98409",4.5,Gasoline,25,31,FWD,2.5L I4 16V GDI DOHC,Sonic Silver Metallic,Black,6-Speed Automatic,"['Bluetooth', 'Apple CarPlay/Android Auto']","['Backup Camera', 'Brake Assist', 'Stability C...",25900,New
4,2020 Hyundai Santa Fe SEL 2.4,20404,"Tacoma, WA 98409",4.2,Gasoline,21,27,AWD,2.4L I4 16V GDI DOHC,Symphony Silver,Black,8-Speed Automatic,"['Bluetooth', 'Apple CarPlay/Android Auto']","['Backup Camera', 'Brake Assist', 'Stability C...",26996,New


In [8]:
df.describe()

Unnamed: 0,Mileage,Rating,City MPG,Highway MPG,Price
count,10697.0,10697.0,10697.0,10697.0,10697.0
mean,44110.236702,4.269608,21.128915,28.163784,30474.719735
std,40150.680685,0.786603,15.155203,21.182875,17791.743255
min,1.0,1.0,-1.0,-1.0,995.0
25%,19785.0,3.9,17.0,23.0,19305.0
50%,31205.0,4.6,20.0,27.0,27543.0
75%,57411.0,4.8,24.0,32.0,37915.0
max,385906.0,5.0,224.0,384.0,339880.0


In [9]:
#Create 2 new features 'Number of entertainment features' and 'Number of Safety features'

df['Num_entertain_fea'] = df['Entertainment'].apply(lambda x: len(x.split(',')))

df['Num_safe_fea'] = df['Safety'].apply(lambda x: len(x.split(',')))

In [10]:
#Categorize our features:

for i in range(len(df)):
    if 'Automatic' in df.Transmission[i]:
        df['Transmission'][i] = 'Automatic'
    elif 'Manual' in df.Transmission[i]:
        df['Transmission'][i] = 'Manual'
    else:
        df['Transmission'][i] = 'Unknown'

        
        
for i in range(len(df)):
    if 'Intercooled Turbo' in df.Engine[i]:
        df.Engine[i] = 'Intercooled Turbo'
    elif 'Turbo' in df.Engine[i]:
        df.Engine[i] = 'Turbo'
    elif 'Supercharged' in df.Engine[i]:
        df.Engine[i] = 'Supercharged'
    else:
        df.Engine[i] = 'Regular'
        
df['Drivetrain'] = df['Drivetrain'].apply(lambda x: np.nan if x == '-1' else x)

for i in range(len(df)):
    if 'Black' in df['Exterior Color'][i]:
        df['Exterior Color'][i] = 'Black'
    elif 'White' in df['Exterior Color'][i]:
        df['Exterior Color'][i] = 'White'
    elif 'Gray' in df['Exterior Color'][i]:
        df['Exterior Color'][i] = 'Gray'
    elif 'Silver' in df['Exterior Color'][i]:
        df['Exterior Color'][i] = 'Silver'
    else:
        df['Exterior Color'][i] = 'Other'
        
        
for i in range(len(df)):
    if 'Black' in df['Interior Color'][i]:
        df['Interior Color'][i] = 'Black'
    elif 'Gray' in df['Interior Color'][i]:
        df['Interior Color'][i] = 'Gray'
    elif 'Ebony' in df['Interior Color'][i]:
        df['Interior Color'][i] = 'Ebony'
    else:
        df['Interior Color'][i] = 'Other'
        
        
df['City MPG'] = df['City MPG'].apply(lambda x: np.nan if x == -1 else x)

df['City MPG'].fillna(df['City MPG'].median(), inplace=True)

df['Highway MPG'] = df['Highway MPG'].apply(lambda x: np.nan if x == -1 else x)

df['Highway MPG'].fillna(df['Highway MPG'].median(), inplace=True)

In [11]:
df.dropna(inplace=True)

In [15]:
df.sample(10)

Unnamed: 0,Name,Mileage,Address,Rating,Fuel Type,City MPG,Highway MPG,Drivetrain,Engine,Exterior Color,Interior Color,Transmission,Entertainment,Safety,Price,Model Year,Num_entertain_fea,Num_safe_fea
3538,2018 Mercedes-Benz AMG E 63 S 4MATIC,20698,"Bellevue, WA 98005",3.4,Gasoline,16.0,22.0,AWD,Turbo,White,Other,Automatic,"['Bluetooth', 'Premium Sound System']","['Brake Assist', 'Stability Control', 'Blind S...",99950,New,2,3
5810,2013 Audi A5 2.0T Premium Plus,60180,"Auburn, WA 98001",5.0,Gasoline,20.0,30.0,AWD,Turbo,Black,Black,Automatic,"['Bluetooth', 'Premium Sound System']","['Brake Assist', 'Stability Control']",16550,New,2,2
503,2019 Toyota Tundra SR5,13628,"Tacoma, WA 98409",4.9,Gasoline,13.0,17.0,4WD,Regular,White,Other,Automatic,['Bluetooth'],"['Backup Camera', 'Brake Assist', 'Stability C...",50990,New,1,5
8097,2021 Chevrolet Silverado 1500 LTZ,5,"Everett, WA 98203",4.8,Diesel,15.0,22.0,4WD,Turbo,Other,Black,Automatic,"['Bluetooth', 'Premium Sound System']","['Backup Camera', 'Brake Assist', 'Stability C...",56285,New,2,4
6146,2017 Maserati Quattroporte S Q4 GranLusso,21748,"Kirkland, WA 98034",3.7,Gasoline,16.0,23.0,AWD,Turbo,Other,Other,Automatic,"['Bluetooth', 'Premium Sound System']",['Stability Control'],52880,New,2,1
8671,2015 Toyota RAV4 XLE,69508,"Auburn, WA 98001",5.0,Gasoline,24.0,31.0,FWD,Regular,Gray,Black,Automatic,['Bluetooth'],"['Backup Camera', 'Brake Assist', 'Stability C...",16950,New,1,3
4044,2007 Hyundai Elantra GLS,220193,"Everett, WA 98204",4.9,Gasoline,28.0,36.0,FWD,Regular,Black,Gray,Automatic,-1,-1,3475,Moderate,1,1
1155,2018 Subaru Crosstrek 2.0i Limited,14478,"Bellevue, WA 98005",4.6,Gasoline,27.0,33.0,AWD,Regular,White,Black,Automatic,"['Bluetooth', 'Apple CarPlay/Android Auto']","['Backup Camera', 'Brake Assist', 'Stability C...",26982,New,2,6
6025,2018 BMW 330e iPerformance,28227,"Tacoma, WA 98498",3.6,Hybrid,21.0,28.0,RWD,Intercooled Turbo,White,Other,Automatic,['Bluetooth'],['Backup Camera'],22980,New,1,1
9013,2016 Lexus IS 200t Base,81268,"Kirkland, WA 98034",4.8,Gasoline,22.0,33.0,RWD,Turbo,White,Gray,Automatic,"['Bluetooth', 'Premium Sound System']","['Backup Camera', 'Brake Assist', 'Stability C...",23888,New,2,3


In [16]:
df.to_csv('data/cleaned_10000.csv')