#### Import Modules

In [1]:
import numpy as np
import pandas as pd
import sys, os
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from read_write_util import ReadWriteUtil

In [2]:
# Silence every warning for the rest of the notebook, then initialize the
# read/write helper used by the data-loading cells below.
warnings.filterwarnings(action='ignore')
read_write_util = ReadWriteUtil()

#### Read Data

In [3]:
# Fetch version "v1" of the store table through the DVC helper, then preview it.
df_store = read_write_util.dvc_get_data(
    path="../data/store.csv",
    version="v1",
)
df_store.head()

2022-09-05 13:08:24,101:logger:../data/store.csv with version v1 Loaded


Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


In [4]:
# Fetch version "v1" of the training table through the DVC helper, then preview it.
df_train = read_write_util.dvc_get_data(
    path="../data/train.csv",
    version="v1",
)
df_train.head()

2022-09-05 13:14:29,401:logger:../data/train.csv with version v1 Loaded


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


In [5]:
# Fetch version "v1" of the test table through the DVC helper, then preview it.
# The path is passed by keyword so this call matches the store/train loads.
df_test = read_write_util.dvc_get_data(
    path="../data/test.csv",
    version="v1",
)
df_test.head()

2022-09-05 13:15:09,478:logger:../data/test.csv with version v1 Loaded


Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1.0,1,0,0
1,2,3,4,2015-09-17,1.0,1,0,0
2,3,7,4,2015-09-17,1.0,1,0,0
3,4,8,4,2015-09-17,1.0,1,0,0
4,5,9,4,2015-09-17,1.0,1,0,0


#### Sort and Index

In [14]:
def _sort_and_index_by_date(frame):
    """Return ``frame`` sorted by Store then Date, with Date as the index.

    The cell can be executed more than once: when Date is already the index
    (i.e. a previous run has happened) it is first restored as a column.
    Without that step, ``sort_values(..., ignore_index=True)`` would resolve
    'Date' as an index level, replace the index with 0..n-1 and thereby drop
    the dates, after which ``set_index('Date')`` fails with a KeyError.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding 'Store' and 'Date' (Date as a column or as the index).

    Returns
    -------
    pandas.DataFrame
        A new frame; the input object is not modified.
    """
    if frame.index.name == 'Date':
        frame = frame.reset_index()
    ordered = frame.sort_values(['Store', 'Date'], ignore_index=True)
    return ordered.set_index('Date')


# Rebind instead of mutating in place so re-running the cell is harmless.
df_test = _sort_and_index_by_date(df_test)
df_train = _sort_and_index_by_date(df_train)

#### Data Overview

In [15]:
# Store table: print its (rows, columns) shape, then the per-column
# non-null counts and dtypes.
store_shape = df_store.shape
print(f"SHAPE: {store_shape}")
df_store.info()

SHAPE: (1115, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Store                      1115 non-null   int64  
 1   StoreType                  1115 non-null   object 
 2   Assortment                 1115 non-null   object 
 3   CompetitionDistance        1112 non-null   float64
 4   CompetitionOpenSinceMonth  761 non-null    float64
 5   CompetitionOpenSinceYear   761 non-null    float64
 6   Promo2                     1115 non-null   int64  
 7   Promo2SinceWeek            571 non-null    float64
 8   Promo2SinceYear            571 non-null    float64
 9   PromoInterval              571 non-null    object 
dtypes: float64(5), int64(2), object(3)
memory usage: 87.2+ KB


In [16]:
# Training table: print its (rows, columns) shape, then the per-column
# non-null counts and dtypes.
train_shape = df_train.shape
print(f"SHAPE: {train_shape}")
df_train.info()

SHAPE: (1017209, 8)
<class 'pandas.core.frame.DataFrame'>
Index: 1017209 entries, 2013-01-01 to 2015-07-31
Data columns (total 8 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   Store          1017209 non-null  int64 
 1   DayOfWeek      1017209 non-null  int64 
 2   Sales          1017209 non-null  int64 
 3   Customers      1017209 non-null  int64 
 4   Open           1017209 non-null  int64 
 5   Promo          1017209 non-null  int64 
 6   StateHoliday   1017209 non-null  object
 7   SchoolHoliday  1017209 non-null  int64 
dtypes: int64(7), object(1)
memory usage: 69.8+ MB


In [17]:
# Test table: print its (rows, columns) shape, then the per-column
# non-null counts and dtypes.
test_shape = df_test.shape
print(f"SHAPE: {test_shape}")
df_test.info()

SHAPE: (41088, 7)
<class 'pandas.core.frame.DataFrame'>
Index: 41088 entries, 2015-08-01 to 2015-09-17
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             41088 non-null  int64  
 1   Store          41088 non-null  int64  
 2   DayOfWeek      41088 non-null  int64  
 3   Open           41077 non-null  float64
 4   Promo          41088 non-null  int64  
 5   StateHoliday   41088 non-null  object 
 6   SchoolHoliday  41088 non-null  int64  
dtypes: float64(1), int64(5), object(1)
memory usage: 2.5+ MB


#### Check for Duplicates

In [20]:
# True if any (Store, Date) pair occurs more than once in the training data.
# Date is the index at this point, so it is restored as a column for the check.
# Store alone is not a usable key here: each store has one row per date.
# .any() is required: duplicated() never flags the first occurrence, so the
# previous .all() could not return True for a non-empty frame.
df_train.reset_index().duplicated(subset=['Store', 'Date']).any()

False

In [21]:
# True if any (Store, Date) pair occurs more than once in the test data.
# Date is the index at this point, so it is restored as a column for the check.
# Store alone is not a usable key here: each store has one row per date.
# .any() is required: duplicated() never flags the first occurrence, so the
# previous .all() could not return True for a non-empty frame.
df_test.reset_index().duplicated(subset=['Store', 'Date']).any()


False

In [22]:
# True if any Store id appears on more than one row of the store table.
# .any() is required: duplicated() never flags the first occurrence, so the
# previous .all() could not return True for a non-empty frame.
df_store.duplicated(subset=['Store']).any()

False