In [18]:
import pandas
import os


# Relative folder the raw retail CSV files are read from.
data_directory = '../../Data/Raw/retail/'

# Relative folder the prepared parquet files are written to.
output_directory = '../../Data/Processed/retail/'


# Load Data 

In [19]:
# Build the raw CSV paths with os.path.join: data_directory already ends in a
# separator, so plain string concatenation with '/name.csv' produced a doubled
# '//' in every path.
features_file = os.path.join(data_directory, 'Features data set.csv')
sales_file    = os.path.join(data_directory, 'sales data-set.csv')
stores_file   = os.path.join(data_directory, 'stores data-set.csv')

# read features
df_features = pandas.read_csv(features_file)
print('Features: ', df_features.shape)

# read sales
df_sales = pandas.read_csv(sales_file)
print('Sales: ', df_sales.shape)

# read stores
df_stores = pandas.read_csv(stores_file)
print('Stores: ', df_stores.shape)


Features:  (8190, 12)
Sales:  (59000, 5)
Stores:  (45, 3)


# Features

In [20]:
# Preview the first five rows of the features table as loaded.
df_features.head(5)

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,05/02/2010,42.31,2.572,,,,,,211.096358,8.106,False
1,1,12/02/2010,38.51,2.548,,,,,,211.24217,8.106,True
2,1,19/02/2010,39.93,2.514,,,,,,211.289143,8.106,False
3,1,26/02/2010,46.63,2.561,,,,,,211.319643,8.106,False
4,1,05/03/2010,46.5,2.625,,,,,,211.350143,8.106,False


# Sales 

In [21]:
# Preview the first five rows of the sales table as loaded.
df_sales.head(5)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,05-02-10,24924.5,False
1,1,1,12-02-10,46039.49,True
2,1,1,19-02-10,41595.55,False
3,1,1,26-02-10,19403.54,False
4,1,1,05-03-10,21827.9,False


# Stores 

In [22]:
# Preview the first five rows of the stores table as loaded.
df_stores.head(5)

Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307
2,3,B,37392
3,4,A,205863
4,5,B,34875


# Fill NaN Values 

In [23]:
# Replacement value for missing entries, keyed by column name.
# Every MarkDown column gets 0 where no value is recorded.
nan_rule = dict.fromkeys(
    ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5'],
    0,
)

# Fill all listed columns in one step instead of looping column by column.
markdown_columns = list(nan_rule)
df_features[markdown_columns] = df_features[markdown_columns].fillna(value=nan_rule)

print(df_features.shape)

(8190, 12)


# Boolean Variable

In [24]:
# Encode the holiday flag as 0/1 integers. The cast to bool applies the same
# truthiness test as `1 if holiday else 0`, but vectorized over the column.
df_features['IsHoliday'] = df_features['IsHoliday'].astype(bool).astype('int64')
print('df_features: ', df_features.shape)

df_sales['IsHoliday'] = df_sales['IsHoliday'].astype(bool).astype('int64')
print('df_sales: ', df_sales.shape)

df_features:  (8190, 12)
df_sales:  (59000, 5)


# Datetime Variables 

In [25]:
# The features file is parsed as day/month/four-digit-year.
features_dates = pandas.to_datetime(df_features['Date'], format='%d/%m/%Y')
df_features['Date'] = features_dates
print('Features: from ', features_dates.min().date(), 'to', features_dates.max().date())

# The sales file is parsed as day-month-two-digit-year.
sales_dates = pandas.to_datetime(df_sales['Date'], format='%d-%m-%y')
df_sales['Date'] = sales_dates
print('Sales: from ', sales_dates.min().date(), 'to', sales_dates.max().date())

Features: from  2010-02-05 to 2013-07-26
Sales: from  2010-02-05 to 2012-10-26


# Numeric Variables 

In [26]:
def _to_float_columns(frame, columns, fill_value=0):
    """Cast ``columns`` of ``frame`` to float and fill missing values, in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to modify; each listed column is overwritten.
    columns : list of str
        Names of the columns to convert.
    fill_value : float, default 0
        Replacement for values that are missing after the cast.

    Raises
    ------
    ValueError
        If a column holds a value that cannot be converted to float. The cast
        intentionally does not pass errors='ignore': that option returns the
        column unconverted and hides bad data until much later.

    Returns
    -------
    None
        Nothing is returned so that a call on the last line of a cell does
        not render the whole frame as cell output.
    """
    for cname in columns:
        frame[cname] = frame[cname].astype(float).fillna(fill_value)


# Feature Data
# NOTE(review): any missing Temperature / CPI / Unemployment value becomes 0
# here, which may not be a meaningful reading -- confirm this is intended.
num_variables = ['Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
                 'CPI', 'Unemployment']
_to_float_columns(df_features, num_variables)

# Sales Data
num_variables = ['Weekly_Sales']
_to_float_columns(df_sales, num_variables)

# Store Data
num_variables = ['Size']
_to_float_columns(df_stores, num_variables)
    

# Categorical Variables 

In [27]:
# Identifier-like columns are stored as strings so they are treated as labels
# rather than quantities. Each table is converted with a single multi-column
# assignment.

# Feature Data
cat_variables = ['Store']
df_features[cat_variables] = df_features[cat_variables].astype(str)

# Sales Data
cat_variables = ['Store', 'Dept']
df_sales[cat_variables] = df_sales[cat_variables].astype(str)

# Store Data
cat_variables = ['Store', 'Type']
df_stores[cat_variables] = df_stores[cat_variables].astype(str)
    

# Prepared Data 

## Features 

In [28]:
# First five rows of the features table after the preparation steps above.
df_features.head(5)

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,2010-02-05,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,0
1,1,2010-02-12,38.51,2.548,0.0,0.0,0.0,0.0,0.0,211.24217,8.106,1
2,1,2010-02-19,39.93,2.514,0.0,0.0,0.0,0.0,0.0,211.289143,8.106,0
3,1,2010-02-26,46.63,2.561,0.0,0.0,0.0,0.0,0.0,211.319643,8.106,0
4,1,2010-03-05,46.5,2.625,0.0,0.0,0.0,0.0,0.0,211.350143,8.106,0


## Sales 

In [29]:
# First five rows of the sales table after the preparation steps above.
df_sales.head(5)

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,0
1,1,1,2010-02-12,46039.49,1
2,1,1,2010-02-19,41595.55,0
3,1,1,2010-02-26,19403.54,0
4,1,1,2010-03-05,21827.9,0


## Stores 

In [30]:
# First five rows of the stores table after the preparation steps above.
df_stores.head(5)

Unnamed: 0,Store,Type,Size
0,1,A,151315.0
1,2,A,202307.0
2,3,B,37392.0
3,4,A,205863.0
4,5,B,34875.0


# Export Data

In [31]:
# Exploratory lookup of the os.makedirs signature; the export cell below
# calls it with exist_ok=True. Not needed for the data preparation itself.
help(os.makedirs)

Help on function makedirs in module os:

makedirs(name, mode=511, exist_ok=False)
    makedirs(name [, mode=0o777][, exist_ok=False])
    
    Super-mkdir; create a leaf directory and all intermediate ones.  Works like
    mkdir, except that any intermediate path segment (not just the rightmost)
    will be created if it does not exist. If the target directory already
    exists, raise an OSError if exist_ok is False. Otherwise no exception is
    raised.  This is recursive.



In [32]:
# Create the output folder (and any missing parents); exist_ok=True makes the
# cell safe to re-run when the folder is already there.
os.makedirs(output_directory, exist_ok=True)

# os.path.join avoids the doubled '//' that string concatenation produced,
# since output_directory already ends in a separator.
features_file = os.path.join(output_directory, 'features.parquet')
sales_file    = os.path.join(output_directory, 'sales.parquet')
stores_file   = os.path.join(output_directory, 'stores.parquet')

# write the prepared tables
df_features.to_parquet(features_file)
df_sales.to_parquet(sales_file)
df_stores.to_parquet(stores_file)