In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline

In [2]:
# load dataset
data = '../data/cars.csv'
df = pd.read_csv(data)
df.head(2)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650


In [3]:
import wrangle as wr
df = wr.rename_columns(df)
df.columns

Index(['make', 'model', 'year', 'engine_fuel_type', 'engine_hp',
       'engine_cylinders', 'transmission_type', 'driven_wheels',
       'number_of_doors', 'market_category', 'vehicle_size', 'vehicle_style',
       'highway_mpg', 'city_mpg', 'popularity', 'msrp'],
      dtype='object')

In [4]:
df_train, df_val, df_test = wr.split_data(df)

In [5]:
df_train.shape[0], df_val.shape[0], df_test.shape[0]

(7150, 2382, 2382)

In [6]:
y_train, y_val, y_test = wr.get_target_vars(df_train, df_val, df_test)

In [7]:
df_train.columns

Index(['make', 'model', 'year', 'engine_fuel_type', 'engine_hp',
       'engine_cylinders', 'transmission_type', 'driven_wheels',
       'number_of_doors', 'market_category', 'vehicle_size', 'vehicle_style',
       'highway_mpg', 'city_mpg', 'popularity'],
      dtype='object')

In [8]:
y_train

array([ 7.68662133, 12.13243149,  7.60140233, ...,  9.79205307,
       11.66049166, 10.8325168 ])

#### Categorical variables

In [9]:
df.make.unique()

array(['bmw', 'audi', 'fiat', 'mercedes-benz', 'chrysler', 'nissan',
       'volvo', 'mazda', 'mitsubishi', 'ferrari', 'alfa_romeo', 'toyota',
       'mclaren', 'maybach', 'pontiac', 'porsche', 'saab', 'gmc',
       'hyundai', 'plymouth', 'honda', 'oldsmobile', 'suzuki', 'ford',
       'cadillac', 'kia', 'bentley', 'chevrolet', 'dodge', 'lamborghini',
       'lincoln', 'subaru', 'volkswagen', 'spyker', 'buick', 'acura',
       'rolls-royce', 'maserati', 'lexus', 'aston_martin', 'land_rover',
       'lotus', 'infiniti', 'scion', 'genesis', 'hummer', 'tesla',
       'bugatti'], dtype=object)

In [10]:
df.make.nunique()

48

In [11]:
list(df.make.value_counts().head(5).index)

['chevrolet', 'ford', 'volkswagen', 'toyota', 'dodge']

In [12]:
df.isnull().sum()

make                    0
model                   0
year                    0
engine_fuel_type        3
engine_hp              69
engine_cylinders       30
transmission_type       0
driven_wheels           0
number_of_doors         6
market_category      3742
vehicle_size            0
vehicle_style           0
highway_mpg             0
city_mpg                0
popularity              0
msrp                    0
dtype: int64

In [13]:
base = ['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity']
def prepare_X(df: pd.DataFrame, top_makes=None) -> np.ndarray:
    '''
    Build the baseline feature matrix from a cars dataframe.

    Columns, in order: the numeric columns listed in ``base``, ``age``
    (2017 minus ``year``), one 0/1 indicator per make in ``top_makes`` and
    one 0/1 indicator for each of 2, 3 and 4 doors. Missing values are
    replaced with 0.

    df: dataframe holding the ``base`` columns plus ``year``, ``make`` and
        ``number_of_doors``; it is copied, never modified
    top_makes: makes to encode, in column order. When omitted, the five most
        frequent makes of ``df`` itself are used (the original behaviour).
        Pass the list computed on the training frame when preparing the
        validation/test frames; otherwise their indicator columns may refer
        to different makes than the ones the weights were fitted on.

    returns: 2-D numpy array with one row per row of ``df``
    '''
    df = df.copy()
    features = base.copy()

    df['age'] = 2017 - df['year']
    features.append('age')

    # go through the top-5 car brands (or the caller-supplied vocabulary)
    if top_makes is None:
        top_makes = list(df['make'].value_counts().head(5).index)

    for v in top_makes:
        feature = 'is_make_%s' % v
        df[feature] = (df['make'] == v).astype(int)
        features.append(feature)

    # go through number of doors
    for v in [2, 3, 4]:
        feature = 'num_doors_%s' % v
        df[feature] = (df['number_of_doors'] == v).astype(int)
        features.append(feature)

    # Select only the engineered/numeric columns and zero-fill the gaps so
    # the result is a dense numeric matrix.
    return df[features].fillna(0).values

In [15]:
import regression as regr

In [18]:
# Fit the linear model on the training features, then report the RMSE on
# both the training and the validation split.
X_train, X_val = prepare_X(df_train), prepare_X(df_val)
w0, w = regr.train_linear_regression(X_train, y_train)

y_pred = X_train.dot(w) + w0
y_pred_val = X_val.dot(w) + w0

print('train', regr.rmse(y_train, y_pred))
print('validation', regr.rmse(y_val, y_pred_val))

train 0.5026750065739469
validation 0.5075439929907208


In [19]:
# add transmission and engine fuel type
df_train.transmission_type.unique()

array(['automatic', 'manual', 'automated_manual', 'direct_drive',
       'unknown'], dtype=object)

In [20]:
df_train.transmission_type.value_counts() # take top-3 only

automatic           4923
manual              1810
automated_manual     366
direct_drive          40
unknown               11
Name: transmission_type, dtype: int64

In [21]:
df_train.engine_fuel_type.unique()

array(['diesel', 'premium_unleaded_(required)', 'regular_unleaded',
       'premium_unleaded_(recommended)', 'flex-fuel_(unleaded/e85)',
       'flex-fuel_(premium_unleaded_required/e85)', 'electric',
       'flex-fuel_(premium_unleaded_recommended/e85)',
       'flex-fuel_(unleaded/natural_gas)', 'natural_gas', nan],
      dtype=object)

In [22]:
df_train.engine_fuel_type.nunique()

10

In [23]:
df_train.engine_fuel_type.value_counts() # top 4

regular_unleaded                                4281
premium_unleaded_(required)                     1205
premium_unleaded_(recommended)                   937
flex-fuel_(unleaded/e85)                         541
diesel                                            88
electric                                          39
flex-fuel_(premium_unleaded_required/e85)         34
flex-fuel_(premium_unleaded_recommended/e85)      17
flex-fuel_(unleaded/natural_gas)                   3
natural_gas                                        2
Name: engine_fuel_type, dtype: int64

In [24]:
# change prepare

base = ['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity']
def prepare_X(df: pd.DataFrame) -> np.ndarray:
    '''
    Build the feature matrix: baseline features plus transmission and
    engine-fuel-type indicators.

    Columns, in order: the numeric columns listed in ``base``, ``age``
    (2017 minus ``year``), 0/1 indicators for the five most frequent makes,
    for 2/3/4 doors, for the three most frequent transmission types and for
    the four most frequent engine fuel types. Missing values are replaced
    with 0.

    NOTE: the "most frequent" labels are recomputed from ``df`` on every
    call, so two frames with different label frequencies produce columns in
    a different order (or for different labels).

    df: dataframe holding the ``base`` columns plus ``year``, ``make``,
        ``number_of_doors``, ``transmission_type`` and ``engine_fuel_type``;
        it is copied, never modified

    returns: 2-D numpy array with one row per row of ``df``
    '''
    df = df.copy()
    features = base.copy()

    df['age'] = 2017 - df['year']
    features.append('age')

    # go through top-5 car brands
    for v in df['make'].value_counts().head(5).index:
        feature = 'is_make_%s' % v
        df[feature] = (df['make'] == v).astype(int)
        features.append(feature)

    # go through number of doors
    for v in [2, 3, 4]:
        feature = 'num_doors_%s' % v
        df[feature] = (df['number_of_doors'] == v).astype(int)
        features.append(feature)

    # top-3 transmission types. Iterate over the *index* of value_counts():
    # iterating the Series itself yields the counts, which never match a
    # label and therefore produce all-zero columns (a singular matrix).
    for v in df['transmission_type'].value_counts().head(3).index:
        feature = 'transmission_%s' % v
        df[feature] = (df['transmission_type'] == v).astype('uint8')
        features.append(feature)

    # top-4 engine fuel types (same index-vs-values caveat as above)
    for v in df['engine_fuel_type'].value_counts().head(4).index:
        feature = 'engine_fuel_%s' % v
        df[feature] = (df['engine_fuel_type'] == v).astype('uint8')
        features.append(feature)

    # Select only the engineered/numeric columns and zero-fill the gaps so
    # the result is a dense numeric matrix.
    return df[features].fillna(0).values

In [25]:
# copy paste train model/calculate score
# X_train = prepare_X(df_train)
# w0, w = train_linear_regression(X_train, y_train)

# y_pred = w0 + X_train.dot(w)
# print('train', rmse(y_train, y_pred))

# X_val = prepare_X(df_val)
# y_pred_val = w0 + X_val.dot(w)
# print('validation', rmse(y_val, y_pred_val))

# singular matrix again :(

### Categorical Vars

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   make               11914 non-null  object 
 1   model              11914 non-null  object 
 2   year               11914 non-null  int64  
 3   engine_fuel_type   11911 non-null  object 
 4   engine_hp          11845 non-null  float64
 5   engine_cylinders   11884 non-null  float64
 6   transmission_type  11914 non-null  object 
 7   driven_wheels      11914 non-null  object 
 8   number_of_doors    11908 non-null  float64
 9   market_category    8172 non-null   object 
 10  vehicle_size       11914 non-null  object 
 11  vehicle_style      11914 non-null  object 
 12  highway_mpg        11914 non-null  int64  
 13  city_mpg           11914 non-null  int64  
 14  popularity         11914 non-null  int64  
 15  msrp               11914 non-null  int64  
dtypes: float64(3), int64(5

In [27]:
df.columns

Index(['make', 'model', 'year', 'engine_fuel_type', 'engine_hp',
       'engine_cylinders', 'transmission_type', 'driven_wheels',
       'number_of_doors', 'market_category', 'vehicle_size', 'vehicle_style',
       'highway_mpg', 'city_mpg', 'popularity', 'msrp'],
      dtype='object')

In [28]:
categorical_vars = ['make',  'engine_fuel_type', 'transmission_type', 'driven_wheels', 'market_category', 'vehicle_size', 'vehicle_style']

In [29]:
# For every categorical column keep its most frequent values
# (``head()`` defaults to the first five entries of ``value_counts()``).
categories = {
    c: list(df[c].value_counts().head().index)
    for c in categorical_vars
}

In [30]:
categories

{'make': ['chevrolet', 'ford', 'volkswagen', 'toyota', 'dodge'],
 'engine_fuel_type': ['regular_unleaded',
  'premium_unleaded_(required)',
  'premium_unleaded_(recommended)',
  'flex-fuel_(unleaded/e85)',
  'diesel'],
 'transmission_type': ['automatic',
  'manual',
  'automated_manual',
  'direct_drive',
  'unknown'],
 'driven_wheels': ['front_wheel_drive',
  'rear_wheel_drive',
  'all_wheel_drive',
  'four_wheel_drive'],
 'market_category': ['crossover',
  'flex_fuel',
  'luxury',
  'luxury,performance',
  'hatchback'],
 'vehicle_size': ['compact', 'midsize', 'large'],
 'vehicle_style': ['sedan',
  '4dr_suv',
  'coupe',
  'convertible',
  '4dr_hatchback']}

In [31]:
# change prepare

base = ['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity']
def prepare_X(df: pd.DataFrame, categories=None) -> np.ndarray:
    '''
    Build the feature matrix: numeric features, age, door indicators and
    0/1 indicators for the most frequent values of each categorical column.

    Columns, in order: the numeric columns listed in ``base``, ``age``
    (2017 minus ``year``), indicators for 2/3/4 doors, then one indicator
    per (column, value) pair in ``categories``. Missing values are replaced
    with 0.

    df: dataframe holding the ``base`` columns plus ``year``,
        ``number_of_doors`` and every categorical column being encoded;
        it is copied, never modified
    categories: mapping ``column name -> list of values`` to encode, in
        column order. When omitted, the (up to) five most frequent values of
        each categorical column of ``df`` itself are used (the original
        behaviour). Pass the mapping computed on the training frame when
        preparing validation/test frames; otherwise their indicator columns
        may refer to different values than the ones the weights were
        fitted on.

    returns: 2-D numpy array with one row per row of ``df``
    '''
    df = df.copy()
    features = base.copy()

    df['age'] = 2017 - df['year']
    features.append('age')

    # go through number of doors
    for v in [2, 3, 4]:
        feature = 'num_doors_%s' % v
        df[feature] = (df['number_of_doors'] == v).astype('uint8')
        features.append(feature)

    if categories is None:
        # Fallback: derive the vocabulary from the frame being prepared.
        categorical_vars = ['make',  'engine_fuel_type', 'transmission_type', 'driven_wheels', 'market_category', 'vehicle_size', 'vehicle_style']
        categories = {
            c: list(df[c].value_counts().head().index)
            for c in categorical_vars
        }

    for c, values in categories.items():
        for v in values:
            feature = '%s_%s' % (c, v)
            df[feature] = (df[c] == v).astype('uint8')
            features.append(feature)

    # Select only the engineered/numeric columns and zero-fill the gaps so
    # the result is a dense numeric matrix.
    return df[features].fillna(0).values

In [32]:
prepare_X(df_train)

array([[134.,   6.,  23., ...,   0.,   0.,   0.],
       [510.,  12.,  19., ...,   1.,   0.,   0.],
       [100.,   4.,  29., ...,   0.,   1.,   0.],
       ...,
       [143.,   4.,  26., ...,   0.,   0.,   0.],
       [605.,   8.,  24., ...,   0.,   0.,   0.],
       [290.,   6.,  27., ...,   0.,   0.,   0.]])

In [33]:
# Fit the linear model on the training features, then report the RMSE on
# both the training and the validation split.
X_train, X_val = prepare_X(df_train), prepare_X(df_val)
w0, w = regr.train_linear_regression(X_train, y_train)

y_pred = X_train.dot(w) + w0
y_pred_val = X_val.dot(w) + w0

print('train', regr.rmse(y_train, y_pred))
print('validation', regr.rmse(y_val, y_pred_val))

train 912.5690053266312
validation 1427.9090319413954


In [34]:
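# The RMSE is now far too high (about 912 on train and 1428 on validation,
# versus about 0.5 for the baseline features).
# Likely cause (TODO confirm): the one-hot columns are collinear, e.g. every
# level of vehicle_size and driven_wheels gets its own indicator, so the
# linear system is (near-)singular and the fitted weights are numerically
# unstable.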