# Data preprocessing: Feature Engineering

**NOTEBOOK GOAL**: Extract the mean sales per store and per region, and encode the day of the week as cyclical sin/cos features

**DATASET TRANSFORMATION**: `preprocessed2_OHE_imputation_train.csv` >> `preprocessed_train.csv`


In [20]:
import numpy as np 
import pandas as pd

# Columns shown in the preview tables further down; later cells append the
# newly engineered feature names to this list.
features_to_print = ['StoreID', 'Date']

In [21]:
# Load the training data that the rest of this notebook augments.
# NOTE(review): the notebook header names `preprocessed2_OHE_imputation_train.csv`
# as the input, but this cell reads a different file name - confirm which one
# is intended.
df = pd.read_csv(filepath_or_buffer="dataset/preprocessed2_OHE_train.csv")

### MeanStoreSales feature

Add each store's mean `NumberOfSales` as a new feature

In [22]:
# Mean NumberOfSales per store, indexed by StoreID.
df_mean = df.groupby('StoreID')['NumberOfSales'].mean()


def mean_sales(value):
    """Return the mean NumberOfSales of the store whose StoreID is `value`.

    The lookup is by index label, not by position. The previous positional
    form (`iloc[value - 1000]`) is only correct when the StoreIDs are
    contiguous and start at 1000; a single missing ID shifts every later
    store onto its neighbour's mean without raising any error.

    Raises:
        KeyError: if `value` is not a StoreID present in `df_mean`.
    """
    return df_mean.loc[value]


# Series.map aligns each row's StoreID with the df_mean index label, so the
# assignment stays correct regardless of gaps or ordering of the IDs.
df['MeanStoreSales'] = df['StoreID'].map(df_mean)

In [23]:
# Sanity check: how many per-store means were computed, then a short preview.
store_mean_shape = df_mean.shape
print(store_mean_shape)
df_mean.head()

(749,)


StoreID
1000    6296.182442
1001    2211.155007
1002    4136.632373
1003    4489.238683
1004    3311.128440
Name: NumberOfSales, dtype: float64

### MeanRegionSales feature

Add the mean of sales for the region (regardless of the store)

In [24]:
# Mean NumberOfSales per region, indexed by the Region value.
sales_by_region = df.groupby('Region')['NumberOfSales']
df_mean_reg = sales_by_region.mean()

In [25]:
def mean_sales_region(value):
    """Return the mean NumberOfSales of the region identified by `value`.

    The lookup is by index label. The previous positional form
    (`iloc[value]`) only returns the right region when the Region codes are
    exactly 0, 1, ..., N-1; any gap or different starting code would return
    another region's mean without raising any error.

    Raises:
        KeyError: if `value` is not a Region present in `df_mean_reg`.
    """
    return df_mean_reg.loc[value]


# Series.map matches each row's Region against the df_mean_reg index labels.
df['MeanRegionSales'] = df['Region'].map(df_mean_reg)

In [26]:
# Add the new columns to the preview list. Only names not already present are
# appended, so re-running this cell does not duplicate columns in the preview.
features_to_print += [
    col
    for col in ['D_DayOfweek', 'MeanStoreSales', 'MeanRegionSales']
    if col not in features_to_print
]

# Preview the first rows recorded on one date, transposed for readability.
# A single .loc selects rows and columns together instead of chaining two
# indexing operations.
df.loc[df['Date'] == "01/03/2016", features_to_print].head().T

Unnamed: 0,0,729,1458,2187,2916
StoreID,1000,1001,1002,1003,1004
Date,01/03/2016,01/03/2016,01/03/2016,01/03/2016,01/03/2016
D_DayOfweek,1,1,1,1,1
MeanStoreSales,6296.18,2211.16,4136.63,4489.24,3311.13
MeanRegionSales,3906.26,4023.59,4884.63,3906.26,3827.76


### Day of the week with adjacent days kept close (cyclical encoding)

<https://www.reddit.com/r/MachineLearning/comments/2hzuj5/how_do_i_encode_day_of_the_week_as_a_predictor/>

In [27]:
# List the distinct weekday codes in ascending order before encoding them.
weekday_codes = df['D_DayOfweek'].unique()
sorted(weekday_codes)

[0, 1, 2, 3, 4, 5, 6]

In [28]:
# cos/sin are no longer used in this cell; the import line is kept unchanged so
# the names it binds stay available to any other cell that relies on them.
from math import pi, cos, sin

# Place the weekday code (0..6) on a circle: one week is a full turn of 2*pi,
# so the last and first day of the week end up next to each other.
df['rad_ang'] = (df['D_DayOfweek'] / 7) * (2 * pi)

# NumPy ufuncs work on the whole column at once instead of calling a Python
# lambda once per row.
df['D_DayOfWeek_cos'] = np.cos(df['rad_ang'])
df['D_DayOfWeek_sin'] = np.sin(df['rad_ang'])

# Only append names not already listed, so re-running this cell does not
# duplicate columns in the preview.
features_to_print += [
    col
    for col in ['rad_ang', 'D_DayOfWeek_cos', 'D_DayOfWeek_sin']
    if col not in features_to_print
]
df[features_to_print].head(20).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
StoreID,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000
Date,01/03/2016,02/03/2016,04/03/2016,05/03/2016,06/03/2016,07/03/2016,08/03/2016,09/03/2016,10/03/2016,11/03/2016,12/03/2016,13/03/2016,14/03/2016,15/03/2016,16/03/2016,17/03/2016,18/03/2016,19/03/2016,20/03/2016,21/03/2016
D_DayOfweek,1,2,4,5,6,0,1,2,3,4,5,6,0,1,2,3,4,5,6,0
MeanStoreSales,6296.18,6296.18,6296.18,6296.18,6296.18,6296.18,6296.18,6296.18,6296.18,6296.18,6296.18,6296.18,6296.18,6296.18,6296.18,6296.18,6296.18,6296.18,6296.18,6296.18
MeanRegionSales,3906.26,3906.26,3906.26,3906.26,3906.26,3906.26,3906.26,3906.26,3906.26,3906.26,3906.26,3906.26,3906.26,3906.26,3906.26,3906.26,3906.26,3906.26,3906.26,3906.26
rad_ang,0.897598,1.7952,3.59039,4.48799,5.38559,0,0.897598,1.7952,2.69279,3.59039,4.48799,5.38559,0,0.897598,1.7952,2.69279,3.59039,4.48799,5.38559,0
D_DayOfWeek_cos,0.62349,-0.222521,-0.900969,-0.222521,0.62349,1,0.62349,-0.222521,-0.900969,-0.900969,-0.222521,0.62349,1,0.62349,-0.222521,-0.900969,-0.900969,-0.222521,0.62349,1
D_DayOfWeek_sin,0.781831,0.974928,-0.433884,-0.974928,-0.781831,0,0.781831,0.974928,0.433884,-0.433884,-0.974928,-0.781831,0,0.781831,0.974928,0.433884,-0.433884,-0.974928,-0.781831,0


In [29]:
# The raw angle carries the same meaning as the day of the week, so it is
# removed and only its cos/sin projections are kept.
df.drop(columns=['rad_ang'], inplace=True)
df.head(1).T

Unnamed: 0,0
StoreID,1000
Date,01/03/2016
IsHoliday,0
IsOpen,1
HasPromotions,0
StoreType,Hyper Market
AssortmentType,General
NearestCompetitor,326
Region,7
NumberOfCustomers,495


## Rearranging column order

In [30]:
#cols = df.columns.tolist()
#cols

In [31]:
#cols = cols[:10]+cols[11:]+cols[10:11]
#cols

In [32]:
#cols.remove('NumberOfSales')

In [33]:
#cols.append('NumberOfSales')

In [34]:
#cols = cols[:-14]+cols[-3:]+cols[-14:-3]

In [35]:
#df=df[cols]

In [36]:
#df.head().T

In [37]:
#df.shape

## Write to file

In [38]:
# Persist the augmented training set; index=False keeps the row index out of
# the CSV.
df.to_csv(path_or_buf='./dataset/preprocessed_train.csv', index=False)