In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
from datetime import timedelta

%matplotlib inline
import matplotlib.pyplot as plt

### Load all dataframes

In [2]:
oil_df = pd.read_csv("oil.csv")
holiday_df = pd.read_csv("holidays_events.csv")
stores_df = pd.read_csv("stores.csv")
train_df = pd.read_csv("train.csv")
transactions_df = pd.read_csv("transactions.csv")

### Merge into one, without holidays yet

In [3]:
train_df = train_df.merge(oil_df, how = 'left', on = 'date')
train_df = train_df.merge(stores_df, how = 'left', on = 'store_nbr')

In [4]:
train_df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,dcoilwtico,city,state,type,cluster
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,,Quito,Pichincha,D,13
1,1,2013-01-01,1,BABY CARE,0.0,0,,Quito,Pichincha,D,13
2,2,2013-01-01,1,BEAUTY,0.0,0,,Quito,Pichincha,D,13
3,3,2013-01-01,1,BEVERAGES,0.0,0,,Quito,Pichincha,D,13
4,4,2013-01-01,1,BOOKS,0.0,0,,Quito,Pichincha,D,13


In [5]:
# initialize holiday location types
train_df['local_holiday'] = 0
train_df['reg_holiday'] = 0
train_df['nat_holiday'] = 0

# initialize store types
train_df['type_a'] = 0
train_df['type_b'] = 0
train_df['type_c'] = 0
train_df['type_d'] = 0
train_df['type_e'] = 0

# initialize holiday types
train_df['holiday_day'] = 0
train_df['addtnl_day'] = 0
train_df['bridge_day'] = 0
train_df['work_day'] = 0
train_df['transfer_day'] = 0

# initialize transferred boolean
train_df['transfer_bool'] = 0

train_df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,dcoilwtico,city,state,type,...,type_b,type_c,type_d,type_e,holiday_day,addtnl_day,bridge_day,work_day,transfer_day,transfer_bool
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,,Quito,Pichincha,D,...,0,0,0,0,0,0,0,0,0,0
1,1,2013-01-01,1,BABY CARE,0.0,0,,Quito,Pichincha,D,...,0,0,0,0,0,0,0,0,0,0
2,2,2013-01-01,1,BEAUTY,0.0,0,,Quito,Pichincha,D,...,0,0,0,0,0,0,0,0,0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0,,Quito,Pichincha,D,...,0,0,0,0,0,0,0,0,0,0
4,4,2013-01-01,1,BOOKS,0.0,0,,Quito,Pichincha,D,...,0,0,0,0,0,0,0,0,0,0


In [6]:
prod_df = pd.get_dummies(train_df.family)
prod_df.head()

Unnamed: 0,AUTOMOTIVE,BABY CARE,BEAUTY,BEVERAGES,BOOKS,BREAD/BAKERY,CELEBRATION,CLEANING,DAIRY,DELI,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
train_df = train_df.join(prod_df)

train_df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,dcoilwtico,city,state,type,...,MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES,SEAFOOD
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,,Quito,Pichincha,D,...,0,0,0,0,0,0,0,0,0,0
1,1,2013-01-01,1,BABY CARE,0.0,0,,Quito,Pichincha,D,...,0,0,0,0,0,0,0,0,0,0
2,2,2013-01-01,1,BEAUTY,0.0,0,,Quito,Pichincha,D,...,0,0,0,0,0,0,0,0,0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0,,Quito,Pichincha,D,...,0,0,0,0,0,0,0,0,0,0
4,4,2013-01-01,1,BOOKS,0.0,0,,Quito,Pichincha,D,...,0,0,0,0,0,0,0,0,0,0


In [8]:
for index, row in holiday_df.iterrows():
    
    if row['transferred'] == True:
        
        train_df['transfer_bool'][train_df['date'] == row['date']] = 1
        
    if row['locale'] == 'National':
        
        train_df['nat_holiday'][train_df['date'] == row['date']] = 1
        
        if row['type'] == 'Holiday':
            train_df['holiday_day'][train_df['date'] == row['date']] = 1  
        elif row['type'] == 'Additional':
            train_df['addtnl_day'][train_df['date'] == row['date']] = 1   
        elif row['type'] == 'Bridge':
            train_df['bridge_day'][train_df['date'] == row['date']] = 1
        elif row['type'] == 'Work':
            train_df['work_day'][train_df['date'] == row['date']] = 1
        elif row['type'] == 'Transfer':
            train_df['transfer_day'][train_df['date'] == row['date']] = 1
        
    elif row['locale'] == 'Regional':
        
        train_df['reg_holiday'][(train_df['date'] == row['date']) 
                                & (train_df['state'] == row['locale_name'])] = 1
        
        if row['type'] == 'Holiday':
            train_df['holiday_day'][(train_df['date'] == row['date']) 
                                    & (train_df['state'] == row['locale_name'])] = 1
        elif row['type'] == 'Additional':
            train_df['addtnl_day'][(train_df['date'] == row['date']) 
                                   & (train_df['state'] == row['locale_name'])] = 1
        elif row['type'] == 'Bridge':
            train_df['bridge_day'][(train_df['date'] == row['date']) 
                                   & (train_df['state'] == row['locale_name'])] = 1
        elif row['type'] == 'Work':
            train_df['work_day'][(train_df['date'] == row['date']) 
                                 & (train_df['state'] == row['locale_name'])] = 1
        elif row['type'] == 'Transfer':
            train_df['transfer_day'][(train_df['date'] == row['date']) 
                                     & (train_df['state'] == row['locale_name'])] = 1
        
    elif row['locale'] == 'Local':
        
        train_df['local_holiday'][(train_df['date'] == row['date']) 
                                  & (train_df['city'] == row['locale_name'])] = 1
        
        if row['type'] == 'Holiday':
            train_df['holiday_day'][(train_df['date'] == row['date']) 
                                    & (train_df['city'] == row['locale_name'])] = 1   
        elif row['type'] == 'Additional':
            train_df['addtnl_day'][(train_df['date'] == row['date']) 
                                   & (train_df['city'] == row['locale_name'])] = 1   
        elif row['type'] == 'Bridge':
            train_df['bridge_day'][(train_df['date'] == row['date']) 
                                   & (train_df['city'] == row['locale_name'])] = 1
        elif row['type'] == 'Work':
            train_df['work_day'][(train_df['date'] == row['date']) 
                                 & (train_df['city'] == row['locale_name'])] = 1   
        elif row['type'] == 'Transfer':
            train_df['transfer_day'][(train_df['date'] == row['date']) 
                                     & (train_df['city'] == row['locale_name'])] = 1
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['local_holiday'][(train_df['date'] == row['date'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['holiday_day'][(train_df['date'] == row['date'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['reg_holiday'][(train_df['date'] == row['date'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a

In [9]:
for index, row in stores_df.iterrows():
    
    if row['type'] == "A":
        train_df['type_a'][train_df['store_nbr'] == row['store_nbr']] = 1
    elif row['type'] == "B":
        train_df['type_b'][train_df['store_nbr'] == row['store_nbr']] = 1  
    elif row['type'] == "C":
        train_df['type_c'][train_df['store_nbr'] == row['store_nbr']] = 1 
    elif row['type'] == "D":
        train_df['type_d'][train_df['store_nbr'] == row['store_nbr']] = 1  
    elif row['type'] == "E":
        train_df['type_e'][train_df['store_nbr'] == row['store_nbr']] = 1
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['type_d'][train_df['store_nbr'] == row['store_nbr']] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['type_d'][train_df['store_nbr'] == row['store_nbr']] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['type_d'][train_df['store_nbr'] == row['store_nbr']] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ret

In [10]:
train_df = train_df.drop(columns=['type', 'city', 'state', 'cluster', 'id', 'family', 'SEAFOOD'])
train_df = train_df.rename(columns={'dcoilwtico': "oil_price"})

train_df.head()

Unnamed: 0,date,store_nbr,sales,onpromotion,oil_price,local_holiday,reg_holiday,nat_holiday,type_a,type_b,...,"LIQUOR,WINE,BEER",MAGAZINES,MEATS,PERSONAL CARE,PET SUPPLIES,PLAYERS AND ELECTRONICS,POULTRY,PREPARED FOODS,PRODUCE,SCHOOL AND OFFICE SUPPLIES
0,2013-01-01,1,0.0,0,,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2013-01-01,1,0.0,0,,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2013-01-01,1,0.0,0,,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2013-01-01,1,0.0,0,,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2013-01-01,1,0.0,0,,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
