# Data cleaning

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load the data

In [4]:
# Read the working dataset; the "date" column is parsed as datetimes and
# becomes the index of the frame.
df = pd.read_csv(
    "../data/work.csv",
    index_col="date",
    parse_dates=["date"],
)
df

Unnamed: 0_level_0,state_id,store_id,dept_id,cat_id,item_id,wm_yr_wk,d,sales,sell_price,year,month,wday,weekday,event_name_1,event_type_1,event_name_2,event_type_2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2013-01-01,CA,CA_3,FOODS_3,FOODS,FOODS_3_090,11249,d_704,0,1.25,2013,1,4,Tuesday,NewYear,National,,
2013-01-01,CA,CA_3,FOODS_3,FOODS,FOODS_3_120,11249,d_704,0,4.98,2013,1,4,Tuesday,NewYear,National,,
2013-01-01,CA,CA_3,FOODS_3,FOODS,FOODS_3_202,11249,d_704,20,4.28,2013,1,4,Tuesday,NewYear,National,,
2013-01-01,CA,CA_3,FOODS_3,FOODS,FOODS_3_252,11249,d_704,34,1.48,2013,1,4,Tuesday,NewYear,National,,
2013-01-01,CA,CA_3,FOODS_3,FOODS,FOODS_3_288,11249,d_704,0,,2013,1,4,Tuesday,NewYear,National,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-11-30,CA,CA_4,FOODS_3,FOODS,FOODS_3_329,11544,d_1767,5,2.98,2015,11,3,Monday,,,,
2015-11-30,CA,CA_4,FOODS_3,FOODS,FOODS_3_555,11544,d_1767,4,1.68,2015,11,3,Monday,,,,
2015-11-30,CA,CA_4,FOODS_3,FOODS,FOODS_3_586,11544,d_1767,9,1.68,2015,11,3,Monday,,,,
2015-11-30,CA,CA_4,FOODS_3,FOODS,FOODS_3_587,11544,d_1767,13,2.48,2015,11,3,Monday,,,,


In [6]:
# Summarise index range, column dtypes and non-null counts in one view.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 21280 entries, 2013-01-01 to 2015-11-30
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   state_id      21280 non-null  object 
 1   store_id      21280 non-null  object 
 2   dept_id       21280 non-null  object 
 3   cat_id        21280 non-null  object 
 4   item_id       21280 non-null  object 
 5   wm_yr_wk      21280 non-null  int64  
 6   d             21280 non-null  object 
 7   sales         21280 non-null  int64  
 8   sell_price    21209 non-null  float64
 9   year          21280 non-null  int64  
 10  month         21280 non-null  int64  
 11  wday          21280 non-null  int64  
 12  weekday       21280 non-null  object 
 13  event_name_1  1700 non-null   object 
 14  event_type_1  1700 non-null   object 
 15  event_name_2  60 non-null     object 
 16  event_type_2  60 non-null     object 
dtypes: float64(1), int64(5), object(11)
memory usage: 2.

#### Conclusion:

- year, month and wday should be categorical data, not numerical

In [8]:
# The calendar fields are labels, not quantities: cast them to object dtype
# so they are handled as categorical data from here on.
calendar_cols = ["year", "month", "wday"]
df = df.astype({col: "O" for col in calendar_cols})
df.dtypes

state_id         object
store_id         object
dept_id          object
cat_id           object
item_id          object
wm_yr_wk          int64
d                object
sales             int64
sell_price      float64
year             object
month            object
wday             object
weekday          object
event_name_1     object
event_type_1     object
event_name_2     object
event_type_2     object
dtype: object

## Unique values

In [9]:
# Number of distinct values per column, smallest first, so that constant
# columns show up at the top of the listing.
unique_counts = df.nunique()
unique_counts.sort_values()

state_id           1
cat_id             1
dept_id            1
event_type_2       2
store_id           2
event_name_2       3
year               3
event_type_1       4
wday               7
weekday            7
item_id           10
month             12
sell_price        23
event_name_1      30
wm_yr_wk         153
sales            301
d               1064
dtype: int64

#### state_id, cat_id and dept_id only have one value. We can remove these features from the dataframe.

In [10]:
# state_id, cat_id and dept_id each hold a single distinct value, so they
# carry no information and are removed.
# The frame is reassigned instead of mutated in place, and columns that are
# already absent are ignored, so re-running this cell does not raise KeyError.
df = df.drop(columns=["state_id", "cat_id", "dept_id"], errors="ignore")

## Duplicate values

In [12]:
# Count rows whose column values exactly repeat an earlier row.
# Note: DataFrame.duplicated compares column values only; the date index is
# not part of the comparison.
df.duplicated().sum()

0

## Separate categorical from numerical data

In [13]:
# Independent copy of every non-numeric column (object dtype, including the
# year/month/wday columns cast earlier), so edits here do not touch df.
cat = df.select_dtypes(exclude = 'number').copy()
cat

Unnamed: 0_level_0,store_id,item_id,d,year,month,wday,weekday,event_name_1,event_type_1,event_name_2,event_type_2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2013-01-01,CA_3,FOODS_3_090,d_704,2013,1,4,Tuesday,NewYear,National,,
2013-01-01,CA_3,FOODS_3_120,d_704,2013,1,4,Tuesday,NewYear,National,,
2013-01-01,CA_3,FOODS_3_202,d_704,2013,1,4,Tuesday,NewYear,National,,
2013-01-01,CA_3,FOODS_3_252,d_704,2013,1,4,Tuesday,NewYear,National,,
2013-01-01,CA_3,FOODS_3_288,d_704,2013,1,4,Tuesday,NewYear,National,,
...,...,...,...,...,...,...,...,...,...,...,...
2015-11-30,CA_4,FOODS_3_329,d_1767,2015,11,3,Monday,,,,
2015-11-30,CA_4,FOODS_3_555,d_1767,2015,11,3,Monday,,,,
2015-11-30,CA_4,FOODS_3_586,d_1767,2015,11,3,Monday,,,,
2015-11-30,CA_4,FOODS_3_587,d_1767,2015,11,3,Monday,,,,


In [14]:
# Independent copy of the numeric columns only, so edits here do not touch df.
num = df.select_dtypes(include='number').copy()
num

Unnamed: 0_level_0,wm_yr_wk,sales,sell_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-01-01,11249,0,1.25
2013-01-01,11249,0,4.98
2013-01-01,11249,20,4.28
2013-01-01,11249,34,1.48
2013-01-01,11249,0,
...,...,...,...
2015-11-30,11544,5,2.98
2015-11-30,11544,4,1.68
2015-11-30,11544,9,1.68
2015-11-30,11544,13,2.48


## Categorical data

### Nulls

In [16]:
# Missing values per categorical column, worst offenders first.
cat_null_counts = cat.isna().sum()
cat_null_counts.sort_values(ascending=False)

event_name_2    21220
event_type_2    21220
event_name_1    19580
event_type_1    19580
store_id            0
item_id             0
d                   0
year                0
month               0
wday                0
weekday             0
dtype: int64

#### The event_name and event_type columns are almost entirely null. Let's look at their values.

In [18]:
# Frequency of each primary event name; dropna=False keeps the NaN bucket
# so rows without an event are counted too.
cat["event_name_1"].value_counts(dropna=False)

NaN                    19580
NewYear                   60
Mother's day              60
Thanksgiving              60
VeteransDay               60
Halloween                 60
EidAlAdha                 60
ColumbusDay               60
LaborDay                  60
Eid al-Fitr               60
Ramadan starts            60
IndependenceDay           60
NBAFinalsEnd              60
NBAFinalsStart            60
MemorialDay               60
Pesach End                60
Easter                    60
StPatricksDay             60
Purim End                 60
LentWeek2                 60
PresidentsDay             60
ValentinesDay             60
LentStart                 60
SuperBowl                 60
MartinLutherKingDay       60
OrthodoxChristmas         60
OrthodoxEaster            40
Father's day              40
Chanukah End              40
Christmas                 40
Cinco De Mayo             40
Name: event_name_1, dtype: int64

In [19]:
# Frequency of each secondary event name, with the NaN bucket included.
cat["event_name_2"].value_counts(dropna=False)

NaN               21220
Cinco De Mayo        20
OrthodoxEaster       20
Father's day         20
Name: event_name_2, dtype: int64

In [20]:
# Frequency of each primary event type, with the NaN bucket included.
cat["event_type_1"].value_counts(dropna=False)

NaN          19580
National       580
Religious      560
Cultural       380
Sporting       180
Name: event_type_1, dtype: int64

In [21]:
# Frequency of each secondary event type, with the NaN bucket included.
cat["event_type_2"].value_counts(dropna=False)

NaN          21220
Cultural        40
Religious       20
Name: event_type_2, dtype: int64

In [22]:
# Cross-tabulate primary against secondary event names to see which pairs
# occur on the same rows. Rows where either column is NaN are left out of
# the table by pd.crosstab's default dropna=True.
pd.crosstab(index=cat['event_name_1'], columns=cat['event_name_2'])

event_name_2,Cinco De Mayo,Father's day,OrthodoxEaster
event_name_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Easter,0,0,20
NBAFinalsEnd,0,20,0
OrthodoxEaster,20,0,0


#### Conclusion:

- event_name_2 contains events that also appear in event_name_1
- the crosstab shows that the two columns do not correspond consistently: the same event (e.g. OrthodoxEaster) appears in event_name_1 on some rows and in event_name_2 on others
- we don't know why this happens, so we are going to drop event_name_2 and event_type_2
- the null values in event_name_1 and event_type_1 are days without an event, so we are going to fill them with "no_event"

In [23]:
# Remove the secondary event columns, which are almost entirely null.
# The frame is reassigned instead of mutated in place, and columns that are
# already absent are ignored, so re-running this cell does not raise KeyError.
cat = cat.drop(columns=["event_name_2", "event_type_2"], errors="ignore")

In [26]:
# A missing primary event means the day had no event: label it explicitly.
event_cols = ["event_name_1", "event_type_1"]
cat[event_cols] = cat[event_cols].fillna("no_event")

#### Check the nulls are fixed

In [27]:
# Re-count missing values per column; every count should now be zero.
cat.isna().sum()

store_id        0
item_id         0
d               0
year            0
month           0
wday            0
weekday         0
event_name_1    0
event_type_1    0
dtype: int64

## Outliers

#### Low frequency values

In [30]:
# Print the frequency table of every categorical column (NaN included) so
# that rarely occurring levels are easy to spot.
for column_name, column_values in cat.items():
    print(column_name + "\n")
    frequencies = column_values.value_counts(dropna=False)
    print(frequencies)
    print("\n")

store_id

CA_3    10640
CA_4    10640
Name: store_id, dtype: int64


item_id

FOODS_3_090    2128
FOODS_3_120    2128
FOODS_3_202    2128
FOODS_3_252    2128
FOODS_3_288    2128
FOODS_3_329    2128
FOODS_3_555    2128
FOODS_3_586    2128
FOODS_3_587    2128
FOODS_3_714    2128
Name: item_id, dtype: int64


d

d_704     20
d_1418    20
d_1404    20
d_1405    20
d_1406    20
          ..
d_1064    20
d_1065    20
d_1066    20
d_1067    20
d_1767    20
Name: d, Length: 1064, dtype: int64


year

2013    7300
2014    7300
2015    6680
Name: year, dtype: int64


month

1     1860
3     1860
5     1860
7     1860
8     1860
10    1860
4     1800
6     1800
9     1800
11    1800
2     1680
12    1240
Name: month, dtype: int64


wday

4    3040
5    3040
6    3040
7    3040
1    3040
2    3040
3    3040
Name: wday, dtype: int64


weekday

Tuesday      3040
Wednesday    3040
Thursday     3040
Friday       3040
Saturday     3040
Sunday       3040
Monday       3040
Name: weekday, dtype: int64


e

#### Conclusion:

- There are no rare categories. There is less data for 2015, but that is only because the validation dataset was removed in the previous notebook.

## Numerical data

### Basic statistics

In [31]:
# Summary statistics for the numeric columns, transposed so that each
# variable is a row and each statistic a column.
num.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
wm_yr_wk,21280.0,11415.022556,82.874752,11249.0,11335.0,11420.0,11506.0,11544.0
sales,21280.0,28.874577,38.988147,0.0,6.0,18.0,37.0,763.0
sell_price,21209.0,2.397254,1.237024,1.0,1.5,1.58,2.98,4.98


### Nulls

In [32]:
# Missing values per numeric column.
num.isna().sum()

wm_yr_wk       0
sales          0
sell_price    71
dtype: int64