# Prepare data

In [1]:
import requests
import category_encoders as ce
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def encode_var(var, encoder, y=None):
    """Fit ``encoder`` on ``var`` and return the encoding next to the original values.

    Parameters
    ----------
    var : the values to encode; passed to ``encoder.fit`` and ``encoder.transform``.
    encoder : object exposing ``fit`` and ``transform``.
    y : optional target; when given it is forwarded as ``encoder.fit(var, y)``.

    Returns
    -------
    pandas.DataFrame
        If ``transform`` returns a DataFrame, that same object with an
        ``'original'`` column inserted at position 0; otherwise a new
        two-column frame with the columns ``'original'`` and ``'encoder'``.
    """
    fit_args = (var,) if y is None else (var, y)
    encoder.fit(*fit_args)
    encoded = encoder.transform(var)
    if not isinstance(encoded, pd.DataFrame):
        return pd.DataFrame({'original': var, 'encoder': encoded})
    # insert() modifies the frame returned by transform() in place.
    encoded.insert(0, 'original', var)
    return encoded
    
def print_res(res, rows_per_level=2):
    """Return the first ``rows_per_level`` rows of ``res`` for every level of ``res.original``.

    Parameters
    ----------
    res : pandas.DataFrame
        Frame with an ``original`` column (as produced by ``encode_var``).
    rows_per_level : int, default 2
        Maximum number of rows kept per distinct value of ``original``.

    Returns
    -------
    pandas.DataFrame
        The selected rows, grouped by level in order of first appearance and
        keeping their original index labels. An empty frame with the same
        columns is returned when ``res`` has no rows.
    """
    # DataFrame.append was removed in pandas 2.0; collect the slices and
    # concatenate once instead of growing a frame inside the loop.
    samples = [res[res.original == lvl].head(rows_per_level)
               for lvl in res.original.unique()]
    if not samples:
        # pd.concat raises on an empty list, so handle empty input explicitly.
        return pd.DataFrame(columns=res.columns)
    return pd.concat(samples)

In [2]:
# Set to True to fetch imports-85.data in the next cell; with False the file
# must already exist in the working directory, because it is read from there.
download_data = False

In [3]:
if download_data:
    url = "http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data"
    # A timeout keeps the cell from hanging indefinitely on an unresponsive server.
    r = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors so an error page is never saved as the data file.
    r.raise_for_status()
    with open('imports-85.data', 'wb') as f:
        f.write(r.content)

In [4]:
# The raw file has no header row, so the column names are supplied here.
headers = [
    "symboling", "normalized_losses", "make", "fuel_type", "aspiration",
    "num_doors", "body_style", "drive_wheels", "engine_location",
    "wheel_base", "length", "width", "height", "curb_weight",
    "engine_type", "num_cylinders", "engine_size", "fuel_system",
    "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
    "city_mpg", "highway_mpg", "price",
]

# Read the CSV file; every "?" entry is converted to NaN.
df = pd.read_csv("imports-85.data", header=None, names=headers, na_values="?")

# Keep only the rows whose price is known.
df = df.dropna(subset=["price"])

# Turn num_cylinders into an ordered categorical with an explicit level order.
df["num_cylinders"] = (
    df["num_cylinders"]
    .astype('category')
    .cat.reorder_categories(
        new_categories=['two', 'three', 'four', 'five', 'six', 'eight', 'twelve'],
        ordered=True,
    )
)

# **Exercise 1**

Calculate the regressions of price on 'num_cylinders' for each possible encoding. What is the correct interpretation of the coefficients?

In [5]:
from sklearn.linear_model import LinearRegression

In [6]:
# Predictor: num_cylinders only, selected with double brackets so it stays a
# one-column DataFrame. Every encoder in Exercise 1 is fitted on this frame.
X_raw = df[['num_cylinders']]
# Target of every regression in Exercise 1.
y = df['price']

## Ordinal encoder

In [7]:
# Ordinal-encode num_cylinders and regress price on the encoded feature.
# LinearRegression() keeps its default intercept here.
encoder = ce.OrdinalEncoder()
X = encoder.fit_transform(X_raw)
model = LinearRegression()
model.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [8]:
"For each increase in the level of the number of cylinders, the price changes by {:6.2f} on average".format(model.coef_[0])

'For each increase in the level of the number of cylinders, the price changes by 6490.12 on average'

## One-Hot Encoder

In [9]:
# One indicator column per level. The intercept is switched off because a full
# set of indicators plus an intercept would be collinear; without it the fitted
# coefficients match the per-level mean prices shown by the groupby further down.
encoder = ce.OneHotEncoder()
X = encoder.fit_transform(X_raw)
model = LinearRegression(fit_intercept=False)
model.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [10]:
model.coef_

array([13020.        ,  5151.        , 10303.19745223, 22007.6       ,
       23671.83333333, 38900.        , 36000.        ])

In [11]:
model.intercept_

0.0

In [12]:
df[['num_cylinders', 'price']].groupby('num_cylinders').mean()

Unnamed: 0_level_0,price
num_cylinders,Unnamed: 1_level_1
two,13020.0
three,5151.0
four,10303.197452
five,22007.6
six,23671.833333
eight,38900.0
twelve,36000.0


In [13]:
"The price of a car with six cylinders is, on average, {:6.2f}.".format(model.coef_[4])

'The price of a car with six cylinders is, on average, 23671.83.'

## Dummy encoding

In [14]:
# Dummy coding = one-hot coding without the first level, which becomes the
# reference level absorbed by the intercept.
# The features are rebuilt from X_raw rather than slicing the previous cell's
# X in place, so the cell gives the same result however often it is re-run.
encoder = ce.OneHotEncoder()
X = encoder.fit_transform(X_raw).iloc[:, 1:]
model = LinearRegression()
model.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [15]:
model.coef_

array([-7869.        , -2716.80254777,  8987.6       , 10651.83333333,
       25880.        , 22980.        ])

In [16]:
model.intercept_

13019.999999999987

In [17]:
"The price of a car with six cylinders is, on average, {:6.2f} higher than for cars with two cylinders".format(model.coef_[3])

'The price of a car with six cylinders is, on average, 10651.83 higher than for cars with two cylinders'

## Binary Encoder

In [18]:
# Binary-encode num_cylinders and regress price on the resulting bit columns.
encoder = ce.BinaryEncoder()
X = encoder.fit_transform(X_raw)
# Create the estimator explicitly rather than refitting whatever `model` an
# earlier section left in the kernel (same default settings as before).
model = LinearRegression()
model.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [19]:
model.coef_

array([    0.        , 20212.56365102,  6578.80322664,  -410.89758718])

In [20]:
X.head()

Unnamed: 0,num_cylinders_0,num_cylinders_1,num_cylinders_2,num_cylinders_3
0,0,0,1,1
1,0,0,1,1
2,0,1,0,1
3,0,0,1,1
4,0,1,0,0


The coefficients have no meaningful interpretation. Binary encoding is just a way to express non-numeric values as numbers.

## Base-N Encoding

In [21]:
# Encode num_cylinders as base-4 digits and regress price on the digit columns.
encoder = ce.BaseNEncoder(base=4)
X = encoder.fit_transform(X_raw)
# Create the estimator explicitly rather than refitting whatever `model` an
# earlier section left in the kernel (same default settings as before).
model = LinearRegression()
model.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [22]:
model.coef_

array([    0.        , 20294.0675089 ,  2691.47185659])

In [23]:
X.head()

Unnamed: 0,num_cylinders_0,num_cylinders_1,num_cylinders_2
0,0,0,3
1,0,0,3
2,0,1,1
3,0,0,3
4,0,1,0


The coefficients have no meaningful interpretation. Base-N encoding is just a way to express non-numeric values as numbers.

## Simple Encoder

In [24]:
from simple_coding import SimpleEncoder

In [25]:
# SimpleEncoder comes from the local simple_coding module. Its output already
# contains an explicit 'intercept' column (see X.head() below), so sklearn's
# own intercept is switched off and coef_[0] takes the intercept's place.
encoder = SimpleEncoder()
X = encoder.fit_transform(X_raw)
model = LinearRegression(fit_intercept=False)
model.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [26]:
model.coef_

array([21293.37582651, -7869.        , -2716.80254777,  8987.6       ,
       10651.83333333, 25880.        , 22980.        ])

In [27]:
model.intercept_

0.0

In [28]:
X.head()

Unnamed: 0,intercept,num_cylinders_0,num_cylinders_1,num_cylinders_2,num_cylinders_3,num_cylinders_4,num_cylinders_5
0,1,-0.142857,0.857143,-0.142857,-0.142857,-0.142857,-0.142857
1,1,-0.142857,0.857143,-0.142857,-0.142857,-0.142857,-0.142857
2,1,-0.142857,-0.142857,-0.142857,0.857143,-0.142857,-0.142857
3,1,-0.142857,0.857143,-0.142857,-0.142857,-0.142857,-0.142857
4,1,-0.142857,-0.142857,0.857143,-0.142857,-0.142857,-0.142857


In [29]:
means = df[['num_cylinders', 'price']].groupby('num_cylinders').mean()

In [30]:
means.mean()

price    21293.375827
dtype: float64

In [31]:
# Report the magnitude and take the direction word from the coefficient's sign,
# so a negative value no longer reads as "-7869.00 lower".
diff = model.coef_[1]
"The price of a car with three cylinders is, on average, {:6.2f} {} than the price of cars with two cylinders".format(abs(diff), "lower" if diff < 0 else "higher")

'The price of a car with three cylinders is, on average, 7869.00 lower than the price of cars with two cylinders'

## Sum Encoder

In [32]:
# Sum (deviation) coding. The encoded frame already carries an explicit
# 'intercept' column (see X.head() below), so sklearn's own intercept is
# switched off and coef_[0] takes the intercept's place.
encoder = ce.SumEncoder()
X = encoder.fit_transform(X_raw)
model = LinearRegression(fit_intercept=False)
model.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [33]:
model.coef_

array([ 21293.37582651,  -8273.37582651, -16142.37582651, -10990.17837428,
          714.22417349,   2378.45750682,  17606.62417349])

In [34]:
model.intercept_

0.0

In [35]:
X.head()

Unnamed: 0,intercept,num_cylinders_0,num_cylinders_1,num_cylinders_2,num_cylinders_3,num_cylinders_4,num_cylinders_5
0,1,0.0,0.0,1.0,0.0,0.0,0.0
1,1,0.0,0.0,1.0,0.0,0.0,0.0
2,1,0.0,0.0,0.0,0.0,1.0,0.0
3,1,0.0,0.0,1.0,0.0,0.0,0.0
4,1,0.0,0.0,0.0,1.0,0.0,0.0


In [36]:
means - means.mean()

Unnamed: 0_level_0,price
num_cylinders,Unnamed: 1_level_1
two,-8273.375827
three,-16142.375827
four,-10990.178374
five,714.224173
six,2378.457507
eight,17606.624173
twelve,14706.624173


In [37]:
"Each coefficient represents the difference between the group average to the grand average of the price"

'Each coefficient represents the difference between the group average to the grand average of the price'

In [38]:
# Report the magnitude and take "less"/"more" from the coefficient's sign,
# so a negative value no longer reads as "-8273.38 less expensive".
deviation = model.coef_[1]
"Cars with two cylinders are {:6.2f} {} expensive than the average price of the groups.".format(abs(deviation), "less" if deviation < 0 else "more")

'Cars with two cylinders are 8273.38 less expensive than the average price of the groups.'

In [39]:
# means.mean() is a Series indexed by column label ('price'); use .iloc for
# positional access, since integer keys in plain [] are treated as labels
# (the positional fallback is deprecated in recent pandas).
"The grand mean of {:6.2f} is different than the sample mean of {:6.2f}".format(means.mean().iloc[0], y.mean())

'The grand mean of 21293.38 is different than the sample mean of 13207.13'

## Polynomial Encoder

In [40]:
# Polynomial contrast coding. The encoded frame already carries an explicit
# 'intercept' column (see X.head() below), so sklearn's own intercept is
# switched off and coef_[0] takes the intercept's place.
encoder = ce.PolynomialEncoder()
X = encoder.fit_transform(X_raw)
model = LinearRegression(fit_intercept=False)
model.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [41]:
model.coef_

array([21293.37582651, 28310.79309204,  6016.75905801, -9854.1485843 ,
         380.64101606, -4928.75880307, -4796.8803333 ])

The coefficients represent the linear, quadratic, cubic, etc. trends in the data.

In [42]:
X.head()

Unnamed: 0,intercept,num_cylinders_0,num_cylinders_1,num_cylinders_2,num_cylinders_3,num_cylinders_4,num_cylinders_5
0,1,-0.1889822,-0.327327,0.4082483,0.080582,-0.5455447,0.493464
1,1,-0.1889822,-0.327327,0.4082483,0.080582,-0.5455447,0.493464
2,1,0.1889822,-0.327327,-0.4082483,0.080582,0.5455447,0.493464
3,1,-0.1889822,-0.327327,0.4082483,0.080582,-0.5455447,0.493464
4,1,1.617449e-17,-0.436436,-1.109626e-16,0.483494,-6.714569e-16,-0.657952


## Helmert Encoding

In [43]:
# Helmert contrast coding, fitted without sklearn's intercept like the other
# contrast-coding sections. coef_[0] comes out equal to means.mean() (21293.38).
# NOTE(review): presumably the encoder emits its own 'intercept' column as the
# Sum/Polynomial encoders do -- X.head() is not shown here, confirm.
encoder = ce.HelmertEncoder()
X = encoder.fit_transform(X_raw)
model = LinearRegression(fit_intercept=False)
model.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [44]:
model.coef_

array([21293.37582651, -3934.5       ,   405.89915074,  3129.05021231,
        2210.27679406,  4011.54564048,  2451.10402892])

Each coefficient is the change from the grand mean (the mean of the level means) for levels up to k-1 to the grand mean for levels up to k.

In [45]:
means.iloc[:2].mean() - means.iloc[:1].mean()

price   -3934.5
dtype: float64

In [46]:
# Report the magnitude and take the direction word from the coefficient's sign,
# so a negative value no longer reads as "-3934.50 lower".
step = model.coef_[1]
"The grand mean price for cars with cylinders up to three is {:6.2f} {} than for cars with cylinders up to two".format(abs(step), "lower" if step < 0 else "higher")

'The grand mean price for cars with cylinders up to three is 3934.50 lower than for cars with cylinders up to two'

In [47]:
means.iloc[:3].mean() - means.iloc[:2].mean()

price    405.899151
dtype: float64

In [48]:
# The coefficient is positive here (405.90), so the hard-coded word "lower"
# stated the wrong direction; take the word from the coefficient's sign.
step = model.coef_[2]
"The grand mean price for cars with cylinders up to four is {:6.2f} {} than for cars with cylinders up to three".format(abs(step), "lower" if step < 0 else "higher")

'The grand mean price for cars with cylinders up to four is 405.90 higher than for cars with cylinders up to three'

## Backward Difference Encoder

In [49]:
# Backward-difference contrast coding, fitted without sklearn's intercept like
# the other contrast-coding sections. coef_[0] comes out equal to means.mean()
# (21293.38); the remaining coefficients match means.diff() below.
# NOTE(review): presumably the encoder emits its own 'intercept' column as the
# Sum/Polynomial encoders do -- X.head() is not shown here, confirm.
encoder = ce.BackwardDifferenceEncoder()
X = encoder.fit_transform(X_raw)
model = LinearRegression(fit_intercept=False)
model.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [50]:
model.coef_

array([21293.37582651, -7869.        ,  5152.19745223, 11704.40254777,
        1664.23333333, 15228.16666667, -2900.        ])

In [51]:
means.diff()

Unnamed: 0_level_0,price
num_cylinders,Unnamed: 1_level_1
two,
three,-7869.0
four,5152.197452
five,11704.402548
six,1664.233333
eight,15228.166667
twelve,-2900.0


The coefficients are the differences in average value between adjacent levels.

In [52]:
# Report the magnitude and take "less"/"more" from the coefficient's sign,
# so a negative value no longer reads as "-7869.00 less expensive".
diff = model.coef_[1]
"Cars with three cylinders are, on average, {:6.2f} {} expensive than cars with two cylinders".format(abs(diff), "less" if diff < 0 else "more")

'Cars with three cylinders are, on average, 7869.00 less expensive than cars with two cylinders'

In [53]:
# The coefficient is positive here (5152.20), so the hard-coded "less expensive"
# stated the wrong direction; take "less"/"more" from the coefficient's sign.
diff = model.coef_[2]
"Cars with four cylinders are, on average, {:6.2f} {} expensive than cars with three cylinders".format(abs(diff), "less" if diff < 0 else "more")

'Cars with four cylinders are, on average, 5152.20 more expensive than cars with three cylinders'

## Count Encoder

In [54]:
# CountEncoder is only instantiated here: the transform and the fit are left
# commented out (see the note under this cell). X therefore still holds the
# previous section's features, and the new model stays unfitted.
encoder = ce.CountEncoder(normalize=True)
#X = encoder.fit_transform(X_raw)
model = LinearRegression(fit_intercept=False)
#model.fit(X,y)

Bug in the implementation of CountEncoder with variables of type categorical, so the fit is skipped here.

## Hashing Encoder

In [55]:
# Hash num_cylinders into indicator columns (col_0 .. col_7 in X.head() below)
# and regress price on them without an intercept.
encoder = ce.HashingEncoder()
X = encoder.fit_transform(X_raw)
model = LinearRegression(fit_intercept=False)
model.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [56]:
model.coef_

array([ 5151.        , 13020.        , 10303.19745223, 36000.        ,
       38900.        , 22007.6       ,     0.        , 23671.83333333])

In [57]:
X.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7
0,0,0,1,0,0,0,0,0
1,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,1
3,0,0,1,0,0,0,0,0
4,0,0,0,0,0,1,0,0


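The coefficients have no general meaning. Hashing is just a way to represent non-numeric data as numbers. (Here each of the seven levels happened to land in its own hash column, so the coefficients coincide with the per-level mean prices, but that is not guaranteed.)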

# Exercise 2

Try to find the best encoding for each variable to maximize the generalization performance of a linear regression model to predict the price of a car.

In [58]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

In [59]:
from category_encoders import OrdinalEncoder, OneHotEncoder, BinaryEncoder, SumEncoder, PolynomialEncoder, HelmertEncoder, BackwardDifferenceEncoder, HashingEncoder
from category_encoders import TargetEncoder, JamesSteinEncoder, MEstimateEncoder, LeaveOneOutEncoder, CatBoostEncoder

In [60]:
# Exercise 2 uses every column except the target as a predictor.
X = df.drop(columns='price')
y = df['price']

In [61]:
# Split the predictors into numeric and non-numeric column names.
# 'number' matches every numeric dtype (any width, signed or unsigned), so a
# column stored as e.g. float32 or int16 is not silently treated as categorical.
cols_num = X.select_dtypes(include='number').columns.values
cols_cat = X.select_dtypes(exclude='number').columns.values

In [62]:
# Preprocessing for the numeric columns: impute missing values with the
# 'mean' strategy, then pass the result through StandardScaler.
num_pipeline = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())

In [63]:
# One transformer per categorical column, named after the column, so each
# encoder can later be replaced individually through the parameter grid.
# OneHotEncoder is only the starting choice.
encoders = [(col, OneHotEncoder(), [col]) for col in cols_cat]
cat_pipeline = ColumnTransformer(encoders)

In [64]:
# Route numeric and categorical columns through their own preprocessing,
# then feed the combined features into a linear regression.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, cols_num),
    ('cat', cat_pipeline, cols_cat),
])
full_pipeline = make_pipeline(preprocessor, LinearRegression())

In [65]:
def _nominal_candidates():
    """Return a fresh list of the encoder instances tried for every categorical column."""
    return [OneHotEncoder(), BinaryEncoder(), SumEncoder(), PolynomialEncoder(),
            HelmertEncoder(), BackwardDifferenceEncoder(), HashingEncoder()]


# Only these two columns additionally get OrdinalEncoder as their first candidate.
_ordinal_columns = {'num_doors', 'num_cylinders'}

# Grid keys are built in this order; each addresses one named transformer
# inside the 'cat' ColumnTransformer of full_pipeline.
_grid_columns = ['make', 'fuel_type', 'aspiration', 'num_doors', 'body_style',
                 'drive_wheels', 'engine_location', 'engine_type',
                 'num_cylinders', 'fuel_system']

param_grid = {}
for _name in _grid_columns:
    _candidates = _nominal_candidates()
    if _name in _ordinal_columns:
        _candidates.insert(0, OrdinalEncoder())
    param_grid['columntransformer__cat__' + _name] = _candidates

In [66]:
# Exhaustive search over every encoder combination in param_grid: 8 columns
# with 7 candidates and 2 columns with 8 give 7**8 * 8**2 (about 3.7e8)
# settings, each evaluated with cv=3.
grid = GridSearchCV(full_pipeline, param_grid, cv=3, n_jobs=11)

In [74]:
# grid.fit(X,y)

Full search fails, due to memory restrictions. Switching to greedy.

In [82]:
# Leftover initialisation: both names are reset again at the top of the
# greedy-search cell below, so this cell has no lasting effect.
steps = []
temp_grid = {}

In [87]:
# Exploratory leftover: looks up each grid entry and discards the result.
for col in param_grid.keys():
    param_grid[col]

In [94]:
# Greedy search, one column at a time: add the column's full candidate list to
# the grid, search, then freeze that column to its best encoder before moving
# on. Columns not yet visited keep the pipeline's initial OneHotEncoder.
steps = []
temp_grid = {}
for col, candidates in param_grid.items():
    steps.append(col)
    temp_grid[col] = candidates
    grid = GridSearchCV(full_pipeline, temp_grid, cv=3, n_jobs=11).fit(X, y)
    # Keep only the winner so later searches vary just the newly added column.
    temp_grid[col] = [grid.best_params_[col]]

In [97]:
grid.best_score_

0.46360157701273547

In [100]:
y_pred = grid.predict(X)

In [101]:
from sklearn.metrics import r2_score

In [103]:
r2_score(y, y_pred)

0.9373104135327257

In [104]:
from sklearn.model_selection import cross_val_predict

In [106]:
y_pred_cv = cross_val_predict(grid, X, y)

In [107]:
r2_score(y, y_pred_cv)

0.28730545020296605