# One Hot Encoding 
One Hot Encoding is needed on the categorical variables to use them for regression or any other kind of computation.
Special attention must be paid to `Events`, since a single entry can combine several values separated by dashes (e.g. `Fog-Rain-Snow`).

In [23]:
import pandas as pd
import numpy as np

In [24]:
# Load the preprocessed training set that the encoding cells below extend.
train_csv_path = './dataset/preprocessed_train.csv'
df = pd.read_csv(train_csv_path)

In [25]:
# Report the distinct labels of each categorical column, one heading per column.
value_reports = [
    ("Different values of StoreType", 'StoreType'),
    ("\nDifferent values of AssortmentType", 'AssortmentType'),
    ("\nDifferent values of Events", 'Events'),
]

for heading, column in value_reports:
    print(heading)
    print(df[column].unique())

Different values of StoreType
['Hyper Market' 'Super Market' 'Standard Market' 'Shopping Center']

Different values of AssortmentType
['General' 'With Non-Food Department' 'With Fish Department']

Different values of Events
['Rain-Snow' 'Snow' 'Rain' 'None' 'Fog-Rain-Snow' 'Fog' 'Fog-Rain'
 'Rain-Thunderstorm' 'Rain-Hail' 'Fog-Rain-Thunderstorm'
 'Fog-Rain-Hail-Thunderstorm' 'Rain-Snow-Hail' 'Fog-Rain-Snow-Hail'
 'Fog-Snow' 'Snow-Hail' 'Fog-Thunderstorm' 'Thunderstorm' 'Fog-Snow-Hail'
 'Rain-Snow-Hail-Thunderstorm' 'Fog-Rain-Hail' 'Rain-Snow-Thunderstorm'
 'Rain-Hail-Thunderstorm']


In [26]:
# Preview the first rows of the loaded frame.
# NOTE(review): the saved output already lists the one-hot columns, presumably
# because the last cell writes the encoded frame back to the same CSV that is
# loaded at the top of this notebook — confirm before re-running.
df.head()

Unnamed: 0,StoreID,Date,IsHoliday,IsOpen,HasPromotions,StoreType,AssortmentType,NearestCompetitor,Region,NumberOfCustomers,...,StoreType_StandardMarket,StoreType_ShoppingCenter,AssortmentType_General,AssortmentType_WithNFDept,AssortmentType_WithFishDept,Fog,Hail,Rain,Snow,Thunderstorm
0,1000,2016-03-01,0,1,0,Hyper Market,General,326,7,495,...,0,0,1,0,0,0,0,1,1,0
1,1000,2016-03-02,0,1,0,Hyper Market,General,326,7,608,...,0,0,1,0,0,0,0,0,1,0
2,1000,2016-03-04,0,1,0,Hyper Market,General,326,7,665,...,0,0,1,0,0,0,0,1,0,0
3,1000,2016-03-05,0,1,0,Hyper Market,General,326,7,630,...,0,0,1,0,0,0,0,0,0,0
4,1000,2016-03-07,0,1,1,Hyper Market,General,326,7,763,...,0,0,1,0,0,0,0,0,0,0


## OHE for StoreType

In [27]:
def OHE_StoreType_SuperMarket(value):
    """Return 1 if ``value`` equals 'Super Market', otherwise 0."""
    if value == 'Super Market':
        return 1
    return 0

def OHE_StoreType_HyperMarket(value):
    """Return 1 if ``value`` equals 'Hyper Market', otherwise 0."""
    if value == 'Hyper Market':
        return 1
    return 0

def OHE_StoreType_StandardMarket(value):
    """Return 1 if ``value`` equals 'Standard Market', otherwise 0."""
    if value == 'Standard Market':
        return 1
    return 0

def OHE_StoreType_ShoppingCenter(value):
    """Return 1 if ``value`` equals 'Shopping Center', otherwise 0."""
    if value == 'Shopping Center':
        return 1
    return 0

In [28]:
# Pair each StoreType indicator column with the encoder that fills it; the
# columns are added to df in the order listed here.
store_type_encoders = {
    'StoreType_SuperMarket': OHE_StoreType_SuperMarket,
    'StoreType_HyperMarket': OHE_StoreType_HyperMarket,
    'StoreType_StandardMarket': OHE_StoreType_StandardMarket,
    'StoreType_ShoppingCenter': OHE_StoreType_ShoppingCenter,
}

for column_name, encoder in store_type_encoders.items():
    df[column_name] = df.StoreType.apply(encoder)

## OHE for AssortmentType

In [29]:
def OHE_AssortmentType_General(value):
    """Return 1 if ``value`` equals 'General', otherwise 0."""
    if value == 'General':
        return 1
    return 0

def OHE_AssortmentType_WithNFDepartment(value):
    """Return 1 if ``value`` equals 'With Non-Food Department', otherwise 0."""
    if value == 'With Non-Food Department':
        return 1
    return 0

def OHE_AssortmentType_WithFishDepartment(value):
    """Return 1 if ``value`` equals 'With Fish Department', otherwise 0."""
    if value == 'With Fish Department':
        return 1
    return 0


In [30]:
# Pair each AssortmentType indicator column with the encoder that fills it;
# the columns are added to df in the order listed here.
assortment_type_encoders = {
    'AssortmentType_General': OHE_AssortmentType_General,
    'AssortmentType_WithNFDept': OHE_AssortmentType_WithNFDepartment,
    'AssortmentType_WithFishDept': OHE_AssortmentType_WithFishDepartment,
}

for column_name, encoder in assortment_type_encoders.items():
    df[column_name] = df.AssortmentType.apply(encoder)

In [31]:
# Preview the frame after assigning the StoreType_* and AssortmentType_*
# indicator columns.
df.head()

Unnamed: 0,StoreID,Date,IsHoliday,IsOpen,HasPromotions,StoreType,AssortmentType,NearestCompetitor,Region,NumberOfCustomers,...,StoreType_StandardMarket,StoreType_ShoppingCenter,AssortmentType_General,AssortmentType_WithNFDept,AssortmentType_WithFishDept,Fog,Hail,Rain,Snow,Thunderstorm
0,1000,2016-03-01,0,1,0,Hyper Market,General,326,7,495,...,0,0,1,0,0,0,0,1,1,0
1,1000,2016-03-02,0,1,0,Hyper Market,General,326,7,608,...,0,0,1,0,0,0,0,0,1,0
2,1000,2016-03-04,0,1,0,Hyper Market,General,326,7,665,...,0,0,1,0,0,0,0,1,0,0
3,1000,2016-03-05,0,1,0,Hyper Market,General,326,7,630,...,0,0,1,0,0,0,0,0,0,0
4,1000,2016-03-07,0,1,1,Hyper Market,General,326,7,763,...,0,0,1,0,0,0,0,0,0,0


Notice that, for instance, `AssortmentType_WithFishDept` is set in only a few rows (its mean is 0.004), and that mean is computed over all examples. Since every store has more than 700 tuples (one per observation day), this means that only a few stores (e.g. 2 or 3) have the `With Fish Department` assortment. We could discard such cases, or we could treat those stores separately.

## OHE for Events

In [32]:
# Events packs several weather events into one dash-separated label
# (e.g. 'Fog-Rain-Snow'). Possible tokens:
# 'Fog', 'Hail', 'None', 'Rain', 'Snow', 'Thunderstorm'
# 'None' marks a day without events and gets no indicator column of its own.
events = [
    'Fog',
    'Hail',
    'Rain',
    'Snow',
    'Thunderstorm'
]

# Split every label on '-' and build one 0/1 column per token in a single
# vectorized pass. Compared with a per-row `event in label` test, this matches
# whole tokens rather than substrings, and it yields all zeros for a missing
# label instead of raising TypeError on a float NaN (read_csv can parse the
# literal text 'None' as NaN, depending on its NA settings).
# reindex keeps only the wanted events, in the listed order, and fills with 0
# any event that never occurs in the data.
event_flags = (
    df.Events
    .str.get_dummies(sep='-')
    .reindex(columns=events, fill_value=0)
)

for event in events:
    df[event] = event_flags[event]

df.head()

Unnamed: 0,StoreID,Date,IsHoliday,IsOpen,HasPromotions,StoreType,AssortmentType,NearestCompetitor,Region,NumberOfCustomers,...,StoreType_StandardMarket,StoreType_ShoppingCenter,AssortmentType_General,AssortmentType_WithNFDept,AssortmentType_WithFishDept,Fog,Hail,Rain,Snow,Thunderstorm
0,1000,2016-03-01,0,1,0,Hyper Market,General,326,7,495,...,0,0,1,0,0,0,0,1,1,0
1,1000,2016-03-02,0,1,0,Hyper Market,General,326,7,608,...,0,0,1,0,0,0,0,0,1,0
2,1000,2016-03-04,0,1,0,Hyper Market,General,326,7,665,...,0,0,1,0,0,0,0,1,0,0
3,1000,2016-03-05,0,1,0,Hyper Market,General,326,7,630,...,0,0,1,0,0,0,0,0,0,0
4,1000,2016-03-07,0,1,1,Hyper Market,General,326,7,763,...,0,0,1,0,0,0,0,0,0,0


### Overwrite preprocessed_train with the encoded columns

In [33]:
# Write the encoded frame back over the preprocessed training CSV, leaving the
# row index out of the file.
encoded_csv_path = './dataset/preprocessed_train.csv'
df.to_csv(encoded_csv_path, index=False)