# One Hot Encoding 
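One-hot encoding is needed to turn the categorical variables into numeric 0/1 columns so that they can be used in regression or any other kind of numerical computation.
Special attention must be paid to Events: each entry can combine several event labels separated by dashes (e.g. 'Fog-Rain-Snow'), and some entries are missing (NaN).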

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the preprocessed training set that the encoding cells below operate on.
dataset = pd.read_csv(
    filepath_or_buffer='./dataset/preprocessed_train.csv',
)

In [3]:
# Show the distinct values of every categorical column that is a candidate
# for one-hot encoding. Each pair is (header to print, column name).
for header, column in (
    ("Different values of StoreType", "StoreType"),
    ("\nDifferent values of AssortmentType", "AssortmentType"),
    ("\nDifferent values of Events", "Events"),
):
    print(header)
    print(dataset[column].unique())

Different values of StoreType
['Hyper Market' 'Super Market' 'Standard Market' 'Shopping Center']

Different values of AssortmentType
['General' 'With Non-Food Department' 'With Fish Department']

Different values of Events
['Rain-Snow' 'Snow' 'Rain' nan 'Fog-Rain-Snow' 'Fog' 'Fog-Rain'
 'Rain-Thunderstorm' 'Rain-Hail' 'Fog-Rain-Thunderstorm'
 'Fog-Rain-Hail-Thunderstorm' 'Rain-Snow-Hail' 'Fog-Rain-Snow-Hail'
 'Fog-Snow' 'Snow-Hail' 'Fog-Thunderstorm' 'Thunderstorm' 'Fog-Snow-Hail'
 'Rain-Snow-Hail-Thunderstorm' 'Fog-Rain-Hail' 'Rain-Snow-Thunderstorm'
 'Rain-Hail-Thunderstorm']


In [4]:
# Display summary statistics (count, mean, std, quartiles, min/max) of the
# dataset before the one-hot encoded columns are added.
dataset.describe()

Unnamed: 0,StoreID,IsHoliday,IsOpen,HasPromotions,NearestCompetitor,Region,NumberOfCustomers,NumberOfSales,Region_AreaKM2,Region_GDP,...,Min_VisibilitykM,Precipitationmm,WindDirDegrees,Is_Friday,Is_Monday,Is_Saturday,Is_Sunday,Is_Thursday,Is_Tuesday,Is_Wednesday
count,433926.0,433926.0,433926.0,433926.0,433926.0,433926.0,433926.0,433926.0,433926.0,433926.0,...,424668.0,433926.0,433926.0,433926.0,433926.0,433926.0,433926.0,433926.0,433926.0,433926.0
mean,1373.937577,0.000945,1.0,0.44951,8043.362656,5.727382,312.570482,4890.450575,12313.293015,14045.557641,...,7.251768,0.801168,168.601992,0.164383,0.164411,0.171142,0.003805,0.155918,0.172603,0.167738
std,216.244819,0.030724,0.0,0.497445,11640.47438,3.357073,158.089665,2215.087195,9114.367114,2760.442742,...,4.924765,2.647928,101.206367,0.370623,0.370648,0.376634,0.061566,0.362778,0.377904,0.373634
min,1000.0,0.0,1.0,0.0,47.0,0.0,10.0,100.0,344.0,9893.0,...,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1188.0,0.0,1.0,0.0,1057.0,3.0,214.0,3386.0,7215.0,11849.0,...,3.0,0.0,74.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1373.0,0.0,1.0,0.0,3307.0,6.0,278.0,4470.0,9337.0,15017.0,...,8.0,0.0,188.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1561.0,0.0,1.0,1.0,9761.0,9.0,369.0,5911.0,15566.0,15931.0,...,10.0,0.25,248.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1748.0,1.0,1.0,1.0,85070.0,10.0,2206.0,26641.0,32221.0,23931.0,...,31.0,58.93,360.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## OHE for StoreType

In [5]:
def OHE_StoreType_SuperMarket(value):
    """Return 1 if ``value`` equals 'Super Market', otherwise 0."""
    if value == 'Super Market':
        return 1
    return 0

def OHE_StoreType_HyperMarket(value):
    """Return 1 if ``value`` equals 'Hyper Market', otherwise 0."""
    if value == 'Hyper Market':
        return 1
    return 0

def OHE_StoreType_StandardMarket(value):
    """Return 1 if ``value`` equals 'Standard Market', otherwise 0."""
    if value == 'Standard Market':
        return 1
    return 0

def OHE_StoreType_ShoppingCenter(value):
    """Return 1 if ``value`` equals 'Shopping Center', otherwise 0."""
    if value == 'Shopping Center':
        return 1
    return 0

In [6]:
# Add one 0/1 indicator column per StoreType value. The pairs are listed in
# the order in which the columns are appended to the dataframe.
for new_column, encoder in (
    ('StoreType_SuperMarket', OHE_StoreType_SuperMarket),
    ('StoreType_HyperMarket', OHE_StoreType_HyperMarket),
    ('StoreType_StandardMarket', OHE_StoreType_StandardMarket),
    ('StoreType_ShoppingCenter', OHE_StoreType_ShoppingCenter),
):
    dataset[new_column] = dataset.StoreType.apply(encoder)

## OHE for AssortmentType

In [7]:
def OHE_AssortmentType_General(value):
    """Return 1 if ``value`` equals 'General', otherwise 0."""
    if value == 'General':
        return 1
    return 0

def OHE_AssortmentType_WithNFDepartment(value):
    """Return 1 if ``value`` equals 'With Non-Food Department', otherwise 0."""
    if value == 'With Non-Food Department':
        return 1
    return 0

def OHE_AssortmentType_WithFishDepartment(value):
    """Return 1 if ``value`` equals 'With Fish Department', otherwise 0."""
    if value == 'With Fish Department':
        return 1
    return 0


In [8]:
# Add one 0/1 indicator column per AssortmentType value. The pairs are listed
# in the order in which the columns are appended to the dataframe.
for new_column, encoder in (
    ('AssortmentType_General', OHE_AssortmentType_General),
    ('AssortmentType_WithNFDept', OHE_AssortmentType_WithNFDepartment),
    ('AssortmentType_WithFishDept', OHE_AssortmentType_WithFishDepartment),
):
    dataset[new_column] = dataset.AssortmentType.apply(encoder)

In [9]:
# Display the summary statistics again; the output now also covers the
# StoreType_* and AssortmentType_* indicator columns added above.
dataset.describe()

Unnamed: 0,StoreID,IsHoliday,IsOpen,HasPromotions,NearestCompetitor,Region,NumberOfCustomers,NumberOfSales,Region_AreaKM2,Region_GDP,...,Is_Thursday,Is_Tuesday,Is_Wednesday,StoreType_SuperMarket,StoreType_HyperMarket,StoreType_StandardMarket,StoreType_ShoppingCenter,AssortmentType_General,AssortmentType_WithNFDept,AssortmentType_WithFishDept
count,433926.0,433926.0,433926.0,433926.0,433926.0,433926.0,433926.0,433926.0,433926.0,433926.0,...,433926.0,433926.0,433926.0,433926.0,433926.0,433926.0,433926.0,433926.0,433926.0,433926.0
mean,1373.937577,0.000945,1.0,0.44951,8043.362656,5.727382,312.570482,4890.450575,12313.293015,14045.557641,...,0.155918,0.172603,0.167738,0.142137,0.559065,0.284028,0.01477,0.565237,0.429778,0.004985
std,216.244819,0.030724,0.0,0.497445,11640.47438,3.357073,158.089665,2215.087195,9114.367114,2760.442742,...,0.362778,0.377904,0.373634,0.349191,0.4965,0.450951,0.12063,0.495726,0.495045,0.070426
min,1000.0,0.0,1.0,0.0,47.0,0.0,10.0,100.0,344.0,9893.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1188.0,0.0,1.0,0.0,1057.0,3.0,214.0,3386.0,7215.0,11849.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1373.0,0.0,1.0,0.0,3307.0,6.0,278.0,4470.0,9337.0,15017.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
75%,1561.0,0.0,1.0,1.0,9761.0,9.0,369.0,5911.0,15566.0,15931.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
max,1748.0,1.0,1.0,1.0,85070.0,10.0,2206.0,26641.0,32221.0,23931.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Notice that, for instance, AssortmentType_WithFishDept is set in only a few rows (its mean is about 0.005). Since this mean is computed over all examples, and every store has more than 700 tuples (one per observation day), only a few stores (e.g. 2 or 3) have AssortmentType_WithFishDept set. We could discard cases like that, or we could treat those stores separately.

### Overwrite preprocessed_train.csv with the encoded dataset

In [10]:
# Write the dataframe, including the new indicator columns, back to the same
# CSV file it was loaded from; the row index is not written.
dataset.to_csv(
    path_or_buf='./dataset/preprocessed_train.csv',
    index=False,
)