![title](https://pythonawesome.com/content/images/2018/05/scikit-learn.png)

In [1]:
# import standard libraries for this lecture
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import MultiLabelBinarizer

# Data Encoding

### One Hot Encoder

In [2]:
enc = OneHotEncoder(handle_unknown='ignore')

In [3]:
CatX = [['Ensuite'], ['Outside private toilet'], ['Shared Bathroom inside'], ['Shared bathroom outside']]

In [4]:
enc.fit(CatX)

OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='ignore',
              n_values=None, sparse=True)

In [5]:
enc.categories_

[array(['Ensuite', 'Outside private toilet', 'Shared Bathroom inside',
        'Shared bathroom outside'], dtype=object)]

In [6]:
X = [['Ensuite'], ['Ensuite'], ['Outside private toilet'], ['Ensuite'], ['Shared Bathroom inside'], ['Shared bathroom outside'], ['Shared bathroom outside'], ['Ensuite']]


In [7]:
enc.transform(X).toarray()

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.]])

__Careful!__

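`OneHotEncoder.transform` returns a SciPy sparse matrix by default (note `sparse=True` in the repr above), so one has to call `.toarray()` whenever a dense NumPy array is needed.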

### Ordinal Encoder

In [None]:
enc = OrdinalEncoder()

In [9]:
enc.fit(CatX)

OrdinalEncoder(categories='auto', dtype=<class 'numpy.float64'>)

In [10]:
enc.categories_

[array(['Ensuite', 'Outside private toilet', 'Shared Bathroom inside',
        'Shared bathroom outside'], dtype=object)]

In [11]:
enc.transform(X)

array([[0.],
       [0.],
       [1.],
       [0.],
       [2.],
       [3.],
       [3.],
       [0.]])

## My advice: write your own function

Consider a dataset like this

| Name    |       Favourite sports          | Other random skill |
|--|---|--|
| Alice   | \[Football, Basketball, Tennis\]| 1.0             |
| Bob     | \[Tennis, Rugby\]               | 4.3             |
| Charlie | \[Volleyball, Curling         \]| 5.6             |
| Daniel  | \[Climbing                    \]| 0.7             |
| Freeda  | \[Cycling                     \]| 3.5             |
| Gemma   | \[Football, Running           \]| 2.1             |

In [12]:
# Toy dataset: one row per person. 'FavouriteSports' and 'OtherCategSkill'
# hold a list of labels per row, 'OtherSkill' is a plain numeric feature.
df = pd.DataFrame(
    {
        'Name': ['Alice', 'Bob', 'Charlie', 'Daniel', 'Freeda', 'Gemma'],
        'FavouriteSports': [
            ['Football', 'Basketball', 'Tennis'],
            ['Tennis', 'Rugby'],
            ['Volleyball', 'Curling'],
            ['Climbing'],
            ['Cycling'],
            ['Football', 'Running'],
        ],
        'OtherSkill': [1.0, 4.3, 5.6, 0.7, 3.5, 2.1],
        'OtherCategSkill': [
            ['a', 'f'],
            ['c', 'd', 'e'],
            ['b', 'c', 'a'],
            ['d', 'c'],
            ['b', 'c', 'a'],
            ['b', 'e', 'f'],
        ],
    }
)

__Questions__:

> How would you convert this dataframe into a format that a machine learning algorithm would digest?

> Which kind of encoder would you use? 

In [13]:
def HomeMadeEncoderElem(df, column):
    """
        Multi-hot encode a single column whose cells hold collections of labels.

        :param df: source dataframe; the work is done on a copy.
        :param column: name of the column to encode.
        :return: a copy of df in which ``column`` has been removed and one
        indicator column per MultiLabelBinarizer class has been appended.
    """
    result = df.copy()
    # Take the label column out first, so that only the remaining columns are
    # left in `result` when the indicator columns are joined back on.
    labels = result.pop(column)

    binarizer = MultiLabelBinarizer()
    indicators = pd.DataFrame(
        binarizer.fit_transform(labels),
        columns=binarizer.classes_,
        index=result.index,
    )
    return result.join(indicators)

In [15]:
HomeMadeEncoderElem(df, column= 'FavouriteSports')

Unnamed: 0,Name,OtherCategSkill,OtherSkill,Basketball,Climbing,Curling,Cycling,Football,Rugby,Running,Tennis,Volleyball
0,Alice,"[a, f]",1.0,1,0,0,0,1,0,0,1,0
1,Bob,"[c, d, e]",4.3,0,0,0,0,0,1,0,1,0
2,Charlie,"[b, c, a]",5.6,0,0,1,0,0,0,0,0,1
3,Daniel,"[d, c]",0.7,0,1,0,0,0,0,0,0,0
4,Freeda,"[b, c, a]",3.5,0,0,0,1,0,0,0,0,0
5,Gemma,"[b, e, f]",2.1,0,0,0,0,1,0,1,0,0


## Exercises

### Exercise 1

In [19]:
def HomeMadeEncoder(df, columns):
    """
        Multi-hot encode one or several list-valued columns.

        :param df: source dataframe; the encoding is applied to a copy of it.
        :param columns: either a single column name (str) or a list of column names.
        :return: the encoded copy, each requested column having been passed
        through HomeMadeEncoderElem in the order given.
        :raises ValueError: when columns is neither a str nor a list.
    """
    encoded = df.copy()

    # Normalise the argument to a list of column names, so that a single
    # loop handles both accepted input kinds.
    if isinstance(columns, list):
        targets = columns
    elif isinstance(columns, str):
        targets = [columns]
    else:
        raise ValueError('Dude, columns can only be a string or a list')

    for name in targets:
        encoded = HomeMadeEncoderElem(encoded, name)
    return encoded

In [20]:
HomeMadeEncoder(df, columns= ['FavouriteSports', 'OtherCategSkill'])

Unnamed: 0,Name,OtherSkill,Basketball,Climbing,Curling,Cycling,Football,Rugby,Running,Tennis,Volleyball,a,b,c,d,e,f
0,Alice,1.0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,1
1,Bob,4.3,0,0,0,0,0,1,0,1,0,0,0,1,1,1,0
2,Charlie,5.6,0,0,1,0,0,0,0,0,1,1,1,1,0,0,0
3,Daniel,0.7,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0
4,Freeda,3.5,0,0,0,1,0,0,0,0,0,1,1,1,0,0,0
5,Gemma,2.1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,1


In [21]:
HomeMadeEncoder(df, 'FavouriteSports')

Unnamed: 0,Name,OtherCategSkill,OtherSkill,Basketball,Climbing,Curling,Cycling,Football,Rugby,Running,Tennis,Volleyball
0,Alice,"[a, f]",1.0,1,0,0,0,1,0,0,1,0
1,Bob,"[c, d, e]",4.3,0,0,0,0,0,1,0,1,0
2,Charlie,"[b, c, a]",5.6,0,0,1,0,0,0,0,0,1
3,Daniel,"[d, c]",0.7,0,1,0,0,0,0,0,0,0
4,Freeda,"[b, c, a]",3.5,0,0,0,1,0,0,0,0,0
5,Gemma,"[b, e, f]",2.1,0,0,0,0,1,0,1,0,0


### Exercise 2

Apply a suitable encoder to the categorical columns of the following dataframe.

In [27]:
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source. The path is relative to the directory the
# notebook is run from.
df_airbnb = pd.read_pickle('Data/PolReg/airbnb_data.pickle')

Consider the columns

- check_in_time_start
- check_in_time_end

_Hint:_ some data cleaning might be necessary first.

__Example__:

In [28]:
# Write down the command to make a slice of df_airbnb by the two selected columns
cat_cols = ['check_in_time_start', 'check_in_time_end']
df_cat_slice = df_airbnb.loc[:, cat_cols]

In [29]:
df_airbnb.check_in_time_start.unique()

array(['15', '9', '11', '17', 'FLEXIBLE', '12', '16', 'NOT_SELECTED',
       '10', '18', '13', '14', '8', '19', '20', '0', '2', '3', '6', '21',
       '7', '25'], dtype=object)

We may suppose that a ```'NOT_SELECTED'``` start time is equivalent to ```'FLEXIBLE'```.

We may proceed as in the function above; however, we have to split the column addition from the `pop` phase.

With this in mind, let's modify the code above so that the second argument can be a list of columns.

In [34]:
def HomeMadeEncoderOrd(df, columns):
    """
        Ordinal-encode one or more categorical columns of a dataframe.

        :param df: input dataframe; it is copied, never modified in place.
        :param columns: a single column name (str) or an iterable of column names.
        :return: a copy of df in which every requested column is kept and a new
        column named '<column>_enc' holds its OrdinalEncoder codes.

        Note: the encoder orders string categories as strings, so numeric-looking
        labels do not get numerically ordered codes (in the airbnb example
        '15' is encoded as 6.0 while '9' is encoded as 19.0).
    """

    # A bare string would otherwise be iterated character by character and fail
    # with a KeyError; treat it as one column name, as HomeMadeEncoder does.
    if isinstance(columns, str):
        columns = [columns]

    df_work = df.copy()
    for col in columns:
        # OrdinalEncoder expects a 2-D input: one row per sample, one feature.
        values = np.array(df_work[col]).reshape(-1, 1)
        # A fresh encoder per column keeps the columns independent. Fitting on
        # the whole column learns the same categories as fitting on its unique
        # values.
        codes = OrdinalEncoder().fit_transform(values)
        # Flatten the (n, 1) result so it is stored as an ordinary 1-D column.
        df_work[col + '_enc'] = np.asarray(codes).ravel()

    return df_work

In [30]:
# Step-by-step version of what HomeMadeEncoderOrd does for one column.
# `enc` is the module-level OrdinalEncoder created in the "Ordinal Encoder"
# section above; fitting it again here replaces the categories it learned there.
categ = df_cat_slice['check_in_time_start'].unique()
enc.fit(categ.reshape(-1,1))

OrdinalEncoder(categories='auto', dtype=<class 'numpy.float64'>)

In [31]:
np.array(df_cat_slice['check_in_time_start'])

array(['15', '9', '11', ..., '14', '17', '10'], dtype=object)

In [32]:
enc.transform(np.array(df_cat_slice['check_in_time_start']).reshape(-1, 1))

array([[ 6.],
       [19.],
       [ 2.],
       ...,
       [ 5.],
       [ 8.],
       [ 1.]])

In [35]:
HomeMadeEncoderOrd(df_cat_slice, cat_cols)

Unnamed: 0,check_in_time_start,check_in_time_end,check_in_time_start_enc,check_in_time_end_enc
0,15,21,6.0,11.0
1,9,NOT_SELECTED,19.0,18.0
2,11,23,2.0,13.0
3,17,20,8.0,10.0
4,FLEXIBLE,FLEXIBLE,20.0,17.0
5,15,22,6.0,12.0
6,15,21,6.0,11.0
7,FLEXIBLE,FLEXIBLE,20.0,17.0
8,12,25,3.0,15.0
9,16,FLEXIBLE,7.0,17.0
