## Importing libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Reading data

In [7]:
# Load each competition CSV into its own DataFrame; the first row of every
# file is used as the column header.
df_holidays = pd.read_csv('holidays_events.csv', header=0)  # holidays and events
df_oil = pd.read_csv('oil.csv', header=0)                   # oil price series
df_stores = pd.read_csv('stores.csv', header=0)             # store metadata
df_trans = pd.read_csv('transactions.csv', header=0)        # transactions per store
df_train = pd.read_csv('train.csv', header=0)               # training rows
df_test = pd.read_csv('test.csv', header=0)                 # test rows

In [14]:
# Collect every loaded frame that is expected to carry a 'date' column.
# df_stores is left out: its preview shows only store_nbr/city/state/type/cluster.
dataframes = [df_holidays, df_oil, df_trans, df_train, df_test]
# Parse each frame's 'date' strings (formatted as YYYY-MM-DD) into datetime values.
# The result is assigned back onto the same frame, so the frames loaded above are
# modified in place. The explicit format makes non-matching strings raise.
# NOTE(review): the loop variable `df` stays defined in the notebook namespace afterwards.
for df in dataframes:
    df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d")

In [16]:
# Preview the first three rows of the holidays/events frame.
df_holidays.head(n=3)

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False


In [17]:
# Preview the first three rows of the oil frame.
df_oil.head(n=3)

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97


In [18]:
# Preview the first three rows of the stores frame.
df_stores.head(n=3)

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8


In [19]:
# Preview the first three rows of the transactions frame.
df_trans.head(n=3)

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358


In [20]:
# Preview the first three rows of the training frame.
df_train.head(n=3)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0


#### Since our dataset contains numerous rows, it is simpler to aggregate the data over a coarser period, for instance by week or month. We'll aggregate using the sum, i.e. `.agg(sum=(col, 'sum'))`; to aggregate with the mean instead, replace it with `.agg(mean=(col, 'mean'))`.

In [44]:
def grouped(df, key, freq, col, group_col=None, agg_func='sum'):
    """Aggregate one column over fixed time periods, optionally per group.

    Rows of ``df`` are bucketed by the datetime column ``key`` at frequency
    ``freq`` and, when ``group_col`` is given, additionally split by those
    column(s). ``col`` is then reduced with ``agg_func`` inside every bucket.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; ``key`` must be a datetime-like column.
    key : str
        Name of the datetime column used for time bucketing.
    freq : str
        Pandas offset alias handed to ``pd.Grouper`` (e.g. ``'D'``, ``'W'``).
    col : str
        Name of the column to aggregate.
    group_col : label or list of labels, optional
        Extra column(s) to group by besides the time bucket. Any falsy value
        (``None``, ``''``, ``[]``) means "time bucket only".
    agg_func : str or callable, default ``'sum'``
        Aggregation applied to ``col``. The output column is named after it
        (the string itself, or the callable's ``__name__``).

    Returns
    -------
    pandas.DataFrame
        One row per (period[, group...]) with the grouping keys as ordinary
        columns followed by the aggregated column.
    """
    period = pd.Grouper(key=key, freq=freq)

    if not group_col:
        by = period
    elif isinstance(group_col, list):
        # A list nested inside the key list would be read as an array of
        # values, not as column labels, so splice its items in individually.
        by = [period, *group_col]
    else:
        by = [period, group_col]

    # Named aggregation needs a string for the output column name.
    out_name = agg_func if isinstance(agg_func, str) else getattr(agg_func, '__name__', 'agg')

    return df.groupby(by).agg(**{out_name: (col, agg_func)}).reset_index()

In [45]:
# Weekly transaction totals with no extra grouping column.
# `freq` is a pandas offset alias: 'D' day, 'W' week, 'M' month, 'Q' quarter, 'Y' year.
# NOTE(review): newer pandas releases may expect 'ME'/'QE'/'YE' for the period-end
# aliases -- confirm against the installed version.
df_grouped_trans_w = grouped(df_trans, key='date', freq='W', col='transactions')
df_grouped_trans_w.head(n=3)

Unnamed: 0,date,sum
0,2013-01-06,435020
1,2013-01-13,528431
2,2013-01-20,527766


In [47]:
# Weekly sales totals, one row per (week, product family).
df_grouped_train_w = grouped(df_train, key='date', freq='W', col='sales', group_col='family')
df_grouped_train_w.head(n=3)

Unnamed: 0,date,family,sum
0,2013-01-06,AUTOMOTIVE,1287.0
1,2013-01-06,BABY CARE,0.0
2,2013-01-06,BEAUTY,923.0
