#### Required python libraries

In [26]:
# pip install pandas numpy matplotlib seaborn

#### Defining the imports

In [27]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  
%matplotlib inline    
import matplotlib as mpl
import seaborn as sns
from datetime import datetime

import warnings
warnings.filterwarnings("ignore")

#### Loading the provided datasets

* Features Dataset : features.csv :- Additional data related to the store, department, and regional activity for the given dates.
* Train Dataset : train.csv :- This is the historical training data, which covers 2010–02–05 to 2012-08-17.
* Test Dataset : test.csv :- Similar to train.csv, without the weekly sales column
* Stores Dataset : stores.csv :- Contains anonymized information about the 45 stores, indicating the type and size of the store.

In [28]:
df_features = pd.read_csv("dataset/features.csv")
df_store = pd.read_csv("dataset/stores.csv")
df_train = pd.read_csv("dataset/train.csv")
df_test = pd.read_csv("dataset/train.csv")

In [29]:
# Replacing Column Name spaces with "_"
df_features.columns = df_features.columns.str.replace(' ', '_')
df_store.columns = df_store.columns.str.replace(' ', '_')
df_train.columns = df_train.columns.str.replace(' ', '_')

In [30]:
# Changing type of datetime object to datetime and sorting
df_features["Date"] = pd.to_datetime(df_features["Date"])
df_train["Date"] = pd.to_datetime(df_features["Date"])
# Sorting by Date
df_features = df_features.sort_values('Date')
df_train = df_train.sort_values('Date')

#### Exploration of Features Dataset

In [31]:
# Shape of the features data records
df_features.shape

(8190, 12)

In [32]:
df_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8190 entries, 0 to 8189
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Store         8190 non-null   int64         
 1   Date          8190 non-null   datetime64[ns]
 2   Temperature   8190 non-null   float64       
 3   Fuel_Price    8190 non-null   float64       
 4   MarkDown1     4032 non-null   float64       
 5   MarkDown2     2921 non-null   float64       
 6   MarkDown3     3613 non-null   float64       
 7   MarkDown4     3464 non-null   float64       
 8   MarkDown5     4050 non-null   float64       
 9   CPI           7605 non-null   float64       
 10  Unemployment  7605 non-null   float64       
 11  IsHoliday     8190 non-null   bool          
dtypes: bool(1), datetime64[ns](1), float64(9), int64(1)
memory usage: 775.8 KB


In [33]:
df_features.head()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False
2730,16,2010-02-05,19.79,2.58,,,,,,189.381697,7.039,False
5460,31,2010-02-05,39.05,2.572,,,,,,210.752605,8.324,False
3640,21,2010-02-05,39.05,2.572,,,,,,210.752605,8.324,False
4550,26,2010-02-05,9.55,2.788,,,,,,131.527903,8.488,False


In [34]:
df_features.tail()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
5823,32,2013-07-26,72.99,3.582,549.89,940.93,86.0,106.47,1530.56,,,False
2365,13,2013-07-26,83.62,3.669,346.31,1377.41,93.4,140.32,2147.06,,,False
5641,31,2013-07-26,85.0,3.62,1394.82,138.71,12.0,970.77,6859.07,,,False
6733,37,2013-07-26,83.28,3.62,178.0,11.86,,,779.32,,,False
8189,45,2013-07-26,76.06,3.804,212.02,851.73,2.06,10.88,1864.57,,,False


In [52]:
# Columns in the dataset
df_features.columns

Index(['Store', 'Date', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2',
       'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment',
       'IsHoliday'],
      dtype='object')

* Store: The Store Number
* Date: Dates of the data from 2010 February 05th  - 2013 July 26th
* Temperature: Temperature in Fahrenheit
* Fuel Price: Price of Fuel that day
* Markdown1, Markdown2, Markdown3, Markdown4: Data related to promotional markdowns at Walmart which are anonymous
* CPI: The consumer price index which measures the monthly changes in prices paid by US consumers
* Unemployement: Rate of Unemployment
* IsHoliday: Whether a particular day is a holiday

In [35]:
# Summary Statistics
df_features.describe()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
count,8190.0,8190,8190.0,8190.0,4032.0,2921.0,3613.0,3464.0,4050.0,7605.0,7605.0
mean,23.0,2011-10-31 12:00:00,59.356198,3.405992,7032.371786,3384.176594,1760.10018,3292.935886,4132.216422,172.460809,7.826821
min,1.0,2010-02-05 00:00:00,-7.29,2.472,-2781.45,-265.76,-179.26,0.22,-185.17,126.064,3.684
25%,12.0,2010-12-17 00:00:00,45.9025,3.041,1577.5325,68.88,6.6,304.6875,1440.8275,132.364839,6.634
50%,23.0,2011-10-31 12:00:00,60.71,3.513,4743.58,364.57,36.26,1176.425,2727.135,182.764003,7.806
75%,34.0,2012-09-14 00:00:00,73.88,3.743,8923.31,2153.35,163.15,3310.0075,4832.555,213.932412,8.567
max,45.0,2013-07-26 00:00:00,101.95,4.468,103184.98,104519.54,149483.31,67474.85,771448.1,228.976456,14.313
std,12.987966,,18.678607,0.431337,9262.747448,8793.583016,11276.462208,6792.329861,13086.690278,39.738346,1.877259


In [36]:
# Null Value Count in the features dataset
df_features.isnull().sum()

Store              0
Date               0
Temperature        0
Fuel_Price         0
MarkDown1       4158
MarkDown2       5269
MarkDown3       4577
MarkDown4       4726
MarkDown5       4140
CPI              585
Unemployment     585
IsHoliday          0
dtype: int64

In [37]:
# Unique Values in the features dataset
df_features.nunique()

Store             45
Date             182
Temperature     4178
Fuel_Price      1011
MarkDown1       4023
MarkDown2       2715
MarkDown3       2885
MarkDown4       3405
MarkDown5       4045
CPI             2505
Unemployment     404
IsHoliday          2
dtype: int64

#### Exploration of Store Dataset

In [38]:
df_store.shape

(45, 3)

In [39]:
df_store.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Store   45 non-null     int64 
 1   Type    45 non-null     object
 2   Size    45 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1.2+ KB


In [40]:
df_store.head()

Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307
2,3,B,37392
3,4,A,205863
4,5,B,34875


In [41]:
df_store.tail()

Unnamed: 0,Store,Type,Size
40,41,A,196321
41,42,C,39690
42,43,C,41062
43,44,C,39910
44,45,B,118221


In [42]:
# Summary Statistics
df_store.describe()

Unnamed: 0,Store,Size
count,45.0,45.0
mean,23.0,130287.6
std,13.133926,63825.271991
min,1.0,34875.0
25%,12.0,70713.0
50%,23.0,126512.0
75%,34.0,202307.0
max,45.0,219622.0


In [43]:
# Null Value Count in the store dataset
df_store.isnull().sum()

Store    0
Type     0
Size     0
dtype: int64

In [44]:
# Unique Values in the store dataset
df_store.nunique()

Store    45
Type      3
Size     40
dtype: int64

#### Exploration of Train Dataset

In [51]:
df_train.shape

(421570, 5)

In [45]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 421570 entries, 0 to 421569
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Store         421570 non-null  int64         
 1   Dept          421570 non-null  int64         
 2   Date          8190 non-null    datetime64[ns]
 3   Weekly_Sales  421570 non-null  float64       
 4   IsHoliday     421570 non-null  bool          
dtypes: bool(1), datetime64[ns](1), float64(1), int64(2)
memory usage: 16.5 MB


In [46]:
df_train.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
2730,1,21,2010-02-05,8844.39,False
5460,1,41,2010-02-05,663.0,False
3640,1,27,2010-02-05,1752.13,False
4550,1,34,2010-02-05,12571.5,False


In [47]:
df_train.tail()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
421565,45,98,NaT,508.37,False
421566,45,98,NaT,628.1,False
421567,45,98,NaT,1061.02,False
421568,45,98,NaT,760.01,False
421569,45,98,NaT,1076.8,False


In [48]:
# Summary Statistics
df_train.describe()

Unnamed: 0,Store,Dept,Date,Weekly_Sales
count,421570.0,421570.0,8190,421570.0
mean,22.200546,44.260317,2011-10-31 12:00:00,15981.258123
min,1.0,1.0,2010-02-05 00:00:00,-4988.94
25%,11.0,18.0,2010-12-17 00:00:00,2079.65
50%,22.0,37.0,2011-10-31 12:00:00,7612.03
75%,33.0,74.0,2012-09-14 00:00:00,20205.8525
max,45.0,99.0,2013-07-26 00:00:00,693099.36
std,12.785297,30.492054,,22711.183519


In [49]:
# Null Value Count in the features dataset
df_train.isnull().sum()

Store                0
Dept                 0
Date            413380
Weekly_Sales         0
IsHoliday            0
dtype: int64

In [50]:
# Unique Values in the features dataset
df_train.nunique()

Store               45
Dept                81
Date               182
Weekly_Sales    359464
IsHoliday            2
dtype: int64