# Overview
#### This notebook will demonstrate usage of $OneHotEncoder$ functionality on a small data set

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression

## Load the MPG dataset from seaborn
- $mpg$ is the target variable
- The remaining columns will be used as input features

In [3]:
# Pull seaborn's bundled "mpg" example dataset into a DataFrame.
df = sns.load_dataset('mpg')

# Preview the first rows.
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


## Examine the data types of the data
- Notice $origin$ and $name$ are $object$ data type

In [5]:
# Show the dtype pandas inferred for each column.
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model_year        int64
origin           object
name             object
dtype: object

### Always check for missing values
- The feature $horsepower$ is missing six values

In [10]:
# Count missing values per column once, then keep only the columns that have any.
df.isna().sum().loc[lambda missing_counts: missing_counts > 0]

horsepower    6
dtype: int64

### For this quick demonstration, the instances with missing values will be removed 

In [13]:
# Drop every row that contains at least one missing value.
# inplace=True mutates df itself, so later cells see the reduced frame.
df.dropna(inplace= True)

### Examine the descriptive statistics
- By default, describe() considers only numerical data, dtype $\in$ (float, int)
- Adding `include='all'` to the method call also produces some statistics for the non-numerical features
- Notice the $unique, top, freq$ columns are populated with NaN values for numerical data
- And, the descriptive statistic columns are populated with NaN values for non-numerical
- The number of unique values for the non-numerical features represents the cardinality of the column
    - Notice that $origin$ has three unique values. 
    - Specific to this data set, $name$ has high cardinality: 301 unique values among the 392 remaining rows

In [16]:
# Summarise every column (numeric and non-numeric), one row per feature.
df.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
mpg,392.0,,,,23.445918,7.805007,9.0,17.0,22.75,29.0,46.6
cylinders,392.0,,,,5.471939,1.705783,3.0,4.0,4.0,8.0,8.0
displacement,392.0,,,,194.41199,104.644004,68.0,105.0,151.0,275.75,455.0
horsepower,392.0,,,,104.469388,38.49116,46.0,75.0,93.5,126.0,230.0
weight,392.0,,,,2977.584184,849.40256,1613.0,2225.25,2803.5,3614.75,5140.0
acceleration,392.0,,,,15.541327,2.758864,8.0,13.775,15.5,17.025,24.8
model_year,392.0,,,,75.979592,3.683737,70.0,73.0,76.0,79.0,82.0
origin,392.0,3.0,usa,245.0,,,,,,,
name,392.0,301.0,amc matador,5.0,,,,,,,


## For this demonstration, let's simplify the dataset
- Remove $name$ and $model\_year$

In [19]:
# Remove the two columns this demonstration does not use.
# errors='ignore' keeps the cell idempotent: re-running it after the columns
# are already gone is a no-op instead of raising KeyError.
df.drop(columns=['name', 'model_year'], errors='ignore', inplace=True)

df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,origin
0,18.0,8,307.0,130.0,3504,12.0,usa
1,15.0,8,350.0,165.0,3693,11.5,usa
2,18.0,8,318.0,150.0,3436,11.0,usa
3,16.0,8,304.0,150.0,3433,12.0,usa
4,17.0,8,302.0,140.0,3449,10.5,usa


## Let's get the count of each value in the feature $cylinders$

In [22]:
# Tally how many cars fall under each cylinder count.
df['cylinders'].value_counts()

cylinders
4    199
8    103
6     83
3      4
5      3
Name: count, dtype: int64

## Let's drop the instances with $cylinders \in \{3, 5\}$

In [25]:
# Collect the index labels of the rows with the two rare cylinder counts,
# then remove those rows from df in place.
rare_cylinder_rows = df.index[df['cylinders'].isin([3, 5])]
df.drop(index=rare_cylinder_rows, inplace=True)

# Confirm that only 4-, 6- and 8-cylinder cars remain.
df['cylinders'].value_counts()

cylinders
4    199
8    103
6     83
Name: count, dtype: int64

## Let's assume the six input features are needed
- No feature extraction or reduction
    - If you conduct correlation analysis, you will see a lot of strongly correlated features

## Partition the data

In [29]:
# Hold out 20% of the rows for testing. The split is stratified on the two
# categorical features so both partitions see a similar mix of
# (cylinders, origin) combinations.
# random_state pins the shuffle: without it every Restart Kernel -> Run All
# produces a different partition, and therefore different coefficients and
# predictions further down the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['mpg']),
    df['mpg'],
    test_size=0.20,
    stratify=df[['cylinders', 'origin']],
    random_state=42,
)

# Encoding categorical features
#### Input features $cylinders$ and $origin$ are categorical and need to be encoded
#### Employ OneHotEncoder with default settings

In [32]:
# Columns that will be one-hot encoded by the cells that follow.
categorical_features = ['cylinders', 'origin']

In [34]:
# Fit a one-hot encoder with default settings on the categorical training columns.
encoder = OneHotEncoder()
encoder.fit(X_train[categorical_features])

# Report each input feature alongside the categories learned for it.
print(f'{"Feature":<10}{"":>4}{"Unique Values"}')
for feature_name, learned_categories in zip(encoder.feature_names_in_, encoder.categories_):
    print(f'{feature_name:<10}{"":>4}{learned_categories}')
print()

# List the generated output column names, one per line.
print('Encoded Feature Names')
for encoded_name in encoder.get_feature_names_out():
    print(encoded_name)

Feature       Unique Values
cylinders     [4 6 8]
origin        ['europe' 'japan' 'usa']

Encoded Feature Names
cylinders_4
cylinders_6
cylinders_8
origin_europe
origin_japan
origin_usa


#### Show output when $transform$ is called
- Returns sparse matrix

In [36]:
# Encode the categorical training columns with the fitted encoder; as the last
# expression in the cell, the result's repr is what gets displayed.
encoder.transform(X_train[categorical_features])

<308x6 sparse matrix of type '<class 'numpy.float64'>'
	with 616 stored elements in Compressed Sparse Row format>

#### Update the settings in $OneHotEncoder$ to produce dense output

In [39]:
# Rebuild the encoder so that transform() returns a dense NumPy array.
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(X_train[categorical_features])

# Display the dense encoding of the training rows.
encoder.transform(X_train[categorical_features])

array([[0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 1.]])

### Updating the Training and Testing data requires a little work...
#### Here, we'll keep all created encoded variables

In [42]:
## Encode the categorical columns into their own frame, aligned on the training index
encoded_train_kept = pd.DataFrame(
    encoder.transform(X_train[categorical_features]),
    index=X_train.index,
    columns=encoder.get_feature_names_out(),
)

## Swap the original categorical columns for the encoded ones.
## A new frame is built, so X_train itself is left untouched
## (typically, X_train would get updated).
X_train_first_kept = pd.concat(
    [X_train.drop(columns=categorical_features), encoded_train_kept],
    axis=1,
)

X_train_first_kept.head()

Unnamed: 0,displacement,horsepower,weight,acceleration,cylinders_4,cylinders_6,cylinders_8,origin_europe,origin_japan,origin_usa
230,350.0,170.0,4165,11.4,0.0,0.0,1.0,0.0,0.0,1.0
0,307.0,130.0,3504,12.0,0.0,0.0,1.0,0.0,0.0,1.0
181,91.0,53.0,1795,17.5,1.0,0.0,0.0,0.0,1.0,0.0
281,200.0,85.0,2990,18.2,0.0,1.0,0.0,0.0,0.0,1.0
317,97.0,78.0,2188,15.8,1.0,0.0,0.0,1.0,0.0,0.0


In [44]:
# Encode the test rows with the encoder that was fitted on the training data.
encoded_test_kept = pd.DataFrame(
    encoder.transform(X_test[categorical_features]),
    index=X_test.index,
    columns=encoder.get_feature_names_out(),
)

# Replace the categorical columns with their encoded counterparts; X_test is not modified.
X_test_first_kept = pd.concat(
    [X_test.drop(columns=categorical_features), encoded_test_kept],
    axis=1,
)

X_test_first_kept.head()

Unnamed: 0,displacement,horsepower,weight,acceleration,cylinders_4,cylinders_6,cylinders_8,origin_europe,origin_japan,origin_usa
236,140.0,89.0,2755,15.8,1.0,0.0,0.0,0.0,0.0,1.0
362,146.0,120.0,2930,13.8,0.0,1.0,0.0,0.0,1.0,0.0
172,90.0,71.0,2223,16.5,1.0,0.0,0.0,1.0,0.0,0.0
22,104.0,95.0,2375,17.5,1.0,0.0,0.0,1.0,0.0,0.0
100,250.0,88.0,3021,16.5,0.0,1.0,0.0,0.0,0.0,1.0


#### Update the settings in $OneHotEncoder$ to produce dense output and drop the first encoding per feature

In [47]:
# Dense-output encoder that omits the first category of every feature.
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoder.fit(X_train[categorical_features])

# Report each input feature alongside the categories learned for it.
print(f'{"Feature":<10}{"":>4}{"Unique Values"}')
for feature_name, learned_categories in zip(encoder.feature_names_in_, encoder.categories_):
    print(f'{feature_name:<10}{"":>4}{learned_categories}')
print()

# List the generated output column names, one per line.
print('Encoded Feature Names')
for encoded_name in encoder.get_feature_names_out():
    print(encoded_name)
print()
print('Notice that cylinders_4 and origin_europe are not present.')

Feature       Unique Values
cylinders     [4 6 8]
origin        ['europe' 'japan' 'usa']

Encoded Feature Names
cylinders_6
cylinders_8
origin_japan
origin_usa

Notice that cylinders_4 and origin_europe are not present.


## Update the Training and Testing data to include the encoded features

In [50]:
# Encode the training rows with the current encoder into an index-aligned frame.
encoded_train_dropped = pd.DataFrame(
    encoder.transform(X_train[categorical_features]),
    index=X_train.index,
    columns=encoder.get_feature_names_out(),
)

# Replace the categorical columns with the encoded ones; X_train is not modified.
X_train_first_dropped = pd.concat(
    [X_train.drop(columns=categorical_features), encoded_train_dropped],
    axis=1,
)

X_train_first_dropped.head()

Unnamed: 0,displacement,horsepower,weight,acceleration,cylinders_6,cylinders_8,origin_japan,origin_usa
230,350.0,170.0,4165,11.4,0.0,1.0,0.0,1.0
0,307.0,130.0,3504,12.0,0.0,1.0,0.0,1.0
181,91.0,53.0,1795,17.5,0.0,0.0,1.0,0.0
281,200.0,85.0,2990,18.2,1.0,0.0,0.0,1.0
317,97.0,78.0,2188,15.8,0.0,0.0,0.0,0.0


In [52]:
# Encode the test rows with the current encoder into an index-aligned frame.
encoded_test_dropped = pd.DataFrame(
    encoder.transform(X_test[categorical_features]),
    index=X_test.index,
    columns=encoder.get_feature_names_out(),
)

# Replace the categorical columns with the encoded ones; X_test is not modified.
X_test_first_dropped = pd.concat(
    [X_test.drop(columns=categorical_features), encoded_test_dropped],
    axis=1,
)

X_test_first_dropped.head()

Unnamed: 0,displacement,horsepower,weight,acceleration,cylinders_6,cylinders_8,origin_japan,origin_usa
236,140.0,89.0,2755,15.8,0.0,0.0,0.0,1.0
362,146.0,120.0,2930,13.8,1.0,0.0,1.0,0.0
172,90.0,71.0,2223,16.5,0.0,0.0,0.0,0.0
22,104.0,95.0,2375,17.5,0.0,0.0,0.0,0.0
100,250.0,88.0,3021,16.5,1.0,0.0,0.0,1.0


# Does $drop=first$ change the model?

## Create Linear Regression instances and train on the respective data

In [56]:
# Train one ordinary-least-squares model per encoding variant, both against
# the same target vector.
lin_reg_first_kept, lin_reg_first_dropped = (
    LinearRegression().fit(training_features, y_train)
    for training_features in (X_train_first_kept, X_train_first_dropped)
)

### Compare the weights from the regressors

In [59]:
# (title, fitted model, training frame the model was fitted on) for each variant.
weight_reports = (
    ('First Kept', lin_reg_first_kept, X_train_first_kept),
    ('First Dropped', lin_reg_first_dropped, X_train_first_dropped),
)

for position, (title, model, training_frame) in enumerate(weight_reports):
    # A blank line separates consecutive reports (none before the first).
    if position:
        print()
    print(f'{"":>14}{title:>14}')
    print(f'{"Intercept":>14}{np.round(model.intercept_, 4):>14}')
    # Pair every column name with its learned coefficient.
    for col, val in zip(training_frame.columns, model.coef_):
        print(f'{col:>14}{np.round(val, 4):>14}')

                  First Kept
     Intercept       41.4492
  displacement        0.0045
    horsepower       -0.0643
        weight        -0.004
  acceleration       -0.0185
   cylinders_4        1.8843
   cylinders_6        -2.093
   cylinders_8        0.2087
 origin_europe       -0.3946
  origin_japan        1.4989
    origin_usa       -1.1043

               First Dropped
     Intercept       42.9389
  displacement        0.0045
    horsepower       -0.0643
        weight        -0.004
  acceleration       -0.0185
   cylinders_6       -3.9773
   cylinders_8       -1.6756
  origin_japan        1.8935
    origin_usa       -0.7096


### Compare the predictions from the Regressors

In [62]:
# Predict the test set with each model.
predictions_first_kept = lin_reg_first_kept.predict(X_test_first_kept)
predictions_first_dropped = lin_reg_first_dropped.predict(X_test_first_dropped)

# Put the two prediction vectors side by side (one column per model) and show the first 20 rows.
np.stack((predictions_first_kept, predictions_first_dropped), axis=1)[:20]

array([[25.69530025, 25.69530025],
       [21.68449451, 21.68449451],
       [29.47690647, 29.47690647],
       [27.36368001, 27.36368001],
       [21.18828723, 21.18828723],
       [23.92729366, 23.92729366],
       [16.21965979, 16.21965979],
       [16.34591084, 16.34591084],
       [28.86249021, 28.86249021],
       [18.38358794, 18.38358794],
       [34.85194468, 34.85194468],
       [21.45019442, 21.45019442],
       [15.62760135, 15.62760135],
       [18.78890312, 18.78890312],
       [31.83493229, 31.83493229],
       [32.16948052, 32.16948052],
       [33.85920771, 33.85920771],
       [16.82569336, 16.82569336],
       [20.69779221, 20.69779221],
       [32.28272714, 32.28272714]])