In [1]:
import numpy as np
import pandas as pd

***

#### Hypothesis: item-store weeks without a price correspond to weeks without sales (q == 0) for that item-store.

***

In [2]:
# Read the sales table; per the summary below it holds the day columns d_1 ... d_1913.
sales_train = pd.read_csv(filepath_or_buffer="../input/sales_train_validation.csv")
sales_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1919 entries, id to d_1913
dtypes: int64(1913), object(6)
memory usage: 446.4+ MB


In [3]:
# Read the calendar table, parsing the "date" column as datetimes.
calendar = pd.read_csv(
    filepath_or_buffer="../input/calendar.csv",
    parse_dates=["date"],
)
calendar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          1969 non-null   datetime64[ns]
 1   wm_yr_wk      1969 non-null   int64         
 2   weekday       1969 non-null   object        
 3   wday          1969 non-null   int64         
 4   month         1969 non-null   int64         
 5   year          1969 non-null   int64         
 6   d             1969 non-null   object        
 7   event_name_1  162 non-null    object        
 8   event_type_1  162 non-null    object        
 9   event_name_2  5 non-null      object        
 10  event_type_2  5 non-null      object        
 11  snap_CA       1969 non-null   int64         
 12  snap_TX       1969 non-null   int64         
 13  snap_WI       1969 non-null   int64         
dtypes: datetime64[ns](1), int64(7), object(6)
memory usage: 215.5+ KB


In [4]:
# Read the price table (store_id, item_id, wm_yr_wk, sell_price).
sell_prices = pd.read_csv(filepath_or_buffer="../input/sell_prices.csv")
sell_prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6841121 entries, 0 to 6841120
Data columns (total 4 columns):
 #   Column      Dtype  
---  ------      -----  
 0   store_id    object 
 1   item_id     object 
 2   wm_yr_wk    int64  
 3   sell_price  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 208.8+ MB


***

In [5]:
# Reshape the wide sales table to long format: one row per (item_id, store_id, d)
# with the sold quantity in "q".
# The day columns are discovered from the frame itself (every column whose name
# starts with "d_") instead of hard-coding d_1..d_1913, so the cell keeps working
# when the input file covers a different number of days.
data = sales_train.melt(
    id_vars=["item_id", "store_id"],
    value_vars=[col for col in sales_train.columns if col.startswith("d_")],
    var_name="d",
    value_name="q",
)

In [6]:
# Preview the first rows of the long-format frame.
data.head(n=5)

Unnamed: 0,item_id,store_id,d,q
0,HOBBIES_1_001,CA_1,d_1,0
1,HOBBIES_1_002,CA_1,d_1,0
2,HOBBIES_1_003,CA_1,d_1,0
3,HOBBIES_1_004,CA_1,d_1,0
4,HOBBIES_1_005,CA_1,d_1,0


In [7]:
# Build one row per (wm_yr_wk, item_id, store_id) with the summed quantity "q"
# and the matching sell_price (NaN when sell_prices has no row for that key).
mrg = (
    data
    # Attach the week identifier of each day column.
    .merge(calendar.loc[:, ["d", "wm_yr_wk"]], how="left", on="d")
    # Total quantity per week / item / store.
    .groupby(["wm_yr_wk", "item_id", "store_id"])["q"]
    .sum()
    .reset_index()
    # Join keys are spelled out rather than relying on the implicit
    # "all shared column names" default, so an extra shared column in either
    # frame cannot silently change the join.
    .merge(sell_prices, how="left", on=["wm_yr_wk", "item_id", "store_id"])
)

In [8]:
# Preview the first rows of the weekly sales/price frame.
mrg.head(n=5)

Unnamed: 0,wm_yr_wk,item_id,store_id,q,sell_price
0,11101,FOODS_1_001,CA_1,10,2.0
1,11101,FOODS_1_001,CA_2,11,2.0
2,11101,FOODS_1_001,CA_3,8,2.0
3,11101,FOODS_1_001,CA_4,5,2.0
4,11101,FOODS_1_001,TX_1,2,2.0


***

In [9]:
# Keep only the rows whose sell_price is missing.
mrg_when_price_null = mrg.loc[mrg["sell_price"].isna()]

In [10]:
# Share of rows without a price that also have zero quantity.
mrg_when_price_null["q"].eq(0).sum() / mrg_when_price_null.shape[0]

1.0

#### Result: a week without a price implies a week without sales (q == 0).

In [11]:
# Keep only the rows with zero quantity.
mrg_when_q_zero = mrg.query(expr="q == 0")

In [12]:
# Share of zero-quantity rows that have no price.
mrg_when_q_zero["sell_price"].isna().sum() / mrg_when_q_zero.shape[0]

0.5245658445360853

#### A week without sales (q == 0) **does not imply** a week without a price: only about 52% of zero-sales weeks have no price.

***