### Load packages

In [1]:
import pandas as pd
import numpy as np

import sweetviz as sv
import seaborn as sns
import plotly.express as px

### Load Data

In [2]:
# Labelled sales history and the rows that need predictions.
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

# Lookup tables that are joined onto train/test later in the notebook.
features = pd.read_csv("data/features.csv")
stores = pd.read_csv("data/stores.csv")

# Example of the expected submission layout.
sampleSubmission = pd.read_csv("data/sampleSubmission.csv")

#### Features

In [3]:
# Preview the first rows of the features table.
features.head()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False
1,1,2010-02-12,38.51,2.548,,,,,,211.24217,8.106,True
2,1,2010-02-19,39.93,2.514,,,,,,211.289143,8.106,False
3,1,2010-02-26,46.63,2.561,,,,,,211.319643,8.106,False
4,1,2010-03-05,46.5,2.625,,,,,,211.350143,8.106,False


In [4]:
# Profile the features table with Sweetviz and write the report to an HTML file.
report = sv.analyze(features)
report.show_html("Report_Features.html")

                                             |          | [  0%]   00:00 -> (? left)

Report Report_Features.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


#### Features vs Train vs Test

In [22]:
# Print the number of distinct (Store, Date, IsHoliday) combinations in
# train, test and features, in that order.
key_columns = ['Store', 'Date', 'IsHoliday']
for frame in (train_df, test_df, features):
    print(frame[key_columns].drop_duplicates().shape)

# rows of train + test = rows of features

(6435, 3)
(1755, 3)
(8190, 3)


#### Make sure each Date has exactly one IsHoliday value

In [5]:
# Number of distinct IsHoliday values recorded for each Date.
holiday_flags_per_date = features.groupby('Date')['IsHoliday'].nunique()

# Compare the largest per-date count with 1 directly. Reducing the counts
# through `.max().all()` first turns them into a truthiness value, so a Date
# carrying two different flags would not reliably fail the check.
assert holiday_flags_per_date.max() == 1, (
    "expected exactly one distinct IsHoliday value per Date in features"
)

#### Stores

In [6]:
# Preview the first rows of the stores table.
stores.head()

Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307
2,3,B,37392
3,4,A,205863
4,5,B,34875


In [7]:
# Profile the stores table with Sweetviz and write the report to an HTML file.
report = sv.analyze(stores)
report.show_html("Report_Stores.html")

                                             |          | [  0%]   00:00 -> (? left)

Report Report_Stores.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


##### Store Type vs Size

In [8]:
# Distribution of store Size within each store Type, drawn as a box plot.
# A tabular alternative would be:
#   stores.groupby(['Type'])['Size'].agg(['mean', 'min', 'max']).reset_index()
fig = px.box(
    stores,
    x="Type",
    y="Size",
)
fig.show()

##### Dept per Store

In [34]:
# Number of distinct stores in train and in test.
print(train_df['Store'].nunique(), test_df['Store'].nunique())

# For train, then test: count distinct departments per store and summarise
# those counts with min / max / mean.
for sales_frame in (train_df, test_df):
    depts_per_store = sales_frame.groupby('Store').agg({'Dept':'nunique'}).reset_index()
    display(depts_per_store.agg({'Dept':['min', 'max', 'mean']}))

45 45


Unnamed: 0,Dept
min,61.0
max,79.0
mean,74.022222


Unnamed: 0,Dept
min,56.0
max,76.0
mean,70.422222


## Merge into one df

In [23]:
# Enrich every training row with the weekly features of its store and then
# with the static store attributes (Type, Size).
# validate='many_to_one' makes pandas raise a MergeError if the right-hand
# table contains duplicate join keys, which would otherwise silently
# duplicate training rows in the left join.
final_train = train_df.merge(
    features,
    on=['Store', 'Date', 'IsHoliday'],
    how='left',
    validate='many_to_one',
)
final_train = final_train.merge(
    stores,
    on=['Store'],
    how='left',
    validate='many_to_one',
)
final_train.head()


(421570, 14)


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size
0,1,1,2010-02-05,24924.5,False,42.31,2.572,,,,,,211.096358,8.106,A,151315
1,1,1,2010-02-12,46039.49,True,38.51,2.548,,,,,,211.24217,8.106,A,151315
2,1,1,2010-02-19,41595.55,False,39.93,2.514,,,,,,211.289143,8.106,A,151315
3,1,1,2010-02-26,19403.54,False,46.63,2.561,,,,,,211.319643,8.106,A,151315
4,1,1,2010-03-05,21827.9,False,46.5,2.625,,,,,,211.350143,8.106,A,151315


In [35]:
# Enrich every test row with the weekly features of its store and then with
# the static store attributes (Type, Size).
# validate='many_to_one' makes pandas raise a MergeError if the right-hand
# table contains duplicate join keys, which would otherwise silently
# duplicate test rows in the left join.
final_test = test_df.merge(
    features,
    on=['Store', 'Date', 'IsHoliday'],
    how='left',
    validate='many_to_one',
)
final_test = final_test.merge(
    stores,
    on=['Store'],
    how='left',
    validate='many_to_one',
)
final_test.head()


Unnamed: 0,Store,Dept,Date,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size
0,1,1,2012-11-02,False,55.32,3.386,6766.44,5147.7,50.82,3639.9,2737.42,223.462779,6.573,A,151315
1,1,1,2012-11-09,False,61.24,3.314,11421.32,3370.89,40.28,4646.79,6154.16,223.481307,6.573,A,151315
2,1,1,2012-11-16,False,52.92,3.252,9696.28,292.1,103.78,1133.15,6612.69,223.512911,6.573,A,151315
3,1,1,2012-11-23,True,56.23,3.211,883.59,4.17,74910.32,209.91,303.32,223.561947,6.573,A,151315
4,1,1,2012-11-30,False,52.34,3.207,2460.03,,3838.35,150.57,6966.34,223.610984,6.573,A,151315


In [38]:
# Shape of the subset of merged training rows where MarkDown1 is missing.
final_train.loc[final_train['MarkDown1'].isna()].shape

(270889, 16)

In [45]:
# Share of missing values per column of the merged training frame, in percent.
train_null_perc = final_train.isna().mean() * 100

# One bar per column; bar height is that column's null percentage.
fig = px.bar(
    x=train_null_perc.index,
    y=train_null_perc.values,
    title='Percentage of Null Values in Each TRAIN Column',
    labels={'x': 'Column', 'y': 'Percentage of Nulls'},
)
fig.show()

In [46]:
# Share of missing values per column of the merged test frame, in percent.
test_null_perc = final_test.isna().mean() * 100

# One bar per column; bar height is that column's null percentage.
fig = px.bar(
    x=test_null_perc.index,
    y=test_null_perc.values,
    title='Percentage of Null Values in Each TEST Column',
    labels={'x': 'Column', 'y': 'Percentage of Nulls'},
)
fig.show()