In [1]:
import pandas as pd
from category_encoders.ordinal import OrdinalEncoder

# local modules
import sys
sys.path.append("../lib/")
from utils import reduce_mem_usage

  import pandas.util.testing as tm


***

In [2]:
# Raw input tables; `date` in the calendar is parsed to datetime on load.
calendar = pd.read_csv("../input/calendar.csv", parse_dates=["date"])
sell_prices = pd.read_csv("../input/sell_prices.csv")
sales_train = pd.read_csv("../input/sales_train_evaluation.csv")

# Reference weights, loaded only to validate against.
weights_validation = pd.read_csv("../input/weights_validation.csv")

In [3]:
# Strip the split suffix from the series ids. The table is read from the
# *evaluation* file, whose ids presumably end in "_evaluation" (TODO confirm),
# so removing only "_validation" would leave them untouched; either suffix is
# removed here, and only when it is at the end of the id.
sales_train["id"] = sales_train["id"].str.replace(
    r"_(?:validation|evaluation)$", "", regex=True
)

# One row per distinct combination of the hierarchy keys.
hierarchy = (
    sales_train[["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]]
    .drop_duplicates()
)

In [4]:
# hierarchy encoders
# One OrdinalEncoder is fitted per hierarchy key on the de-duplicated
# `hierarchy` table; the fitted encoders are reused by later cells to map the
# same string labels in other tables to integer codes.
#
# Columns are written with `frame[col] = ...` rather than
# `frame.loc[:, col] = ...`: on pandas >= 2.0 the `.loc[:, col]` form writes
# into the existing column in place and keeps its object dtype, whereas plain
# column assignment replaces the column with the encoder's output. This also
# matches the assignment style used by the weights cells in this notebook.
id_encoder = OrdinalEncoder()
id_encoder.fit(hierarchy[["id"]])
# the string `id` is kept; its code is stored in a new `ts_id` column
hierarchy["ts_id"] = id_encoder.transform(hierarchy[["id"]])

item_encoder = OrdinalEncoder()
item_encoder.fit(hierarchy[["item_id"]])
hierarchy["item_id"] = item_encoder.transform(hierarchy[["item_id"]])

dept_encoder = OrdinalEncoder()
dept_encoder.fit(hierarchy[["dept_id"]])
hierarchy["dept_id"] = dept_encoder.transform(hierarchy[["dept_id"]])

cat_encoder = OrdinalEncoder()
cat_encoder.fit(hierarchy[["cat_id"]])
hierarchy["cat_id"] = cat_encoder.transform(hierarchy[["cat_id"]])

store_encoder = OrdinalEncoder()
store_encoder.fit(hierarchy[["store_id"]])
hierarchy["store_id"] = store_encoder.transform(hierarchy[["store_id"]])

state_encoder = OrdinalEncoder()
state_encoder.fit(hierarchy[["state_id"]])
hierarchy["state_id"] = state_encoder.transform(hierarchy[["state_id"]])

In [5]:
# Encode the sales table with the encoders fitted on `hierarchy`.
# Plain column assignment is used instead of `.loc[:, col] = ...` so each
# column is replaced by the encoder output (on pandas >= 2.0 the `.loc` form
# writes in place and would keep the original object dtype).
sales_train["ts_id"] = id_encoder.transform(sales_train[["id"]])
sales_train["item_id"] = item_encoder.transform(sales_train[["item_id"]])
sales_train["dept_id"] = dept_encoder.transform(sales_train[["dept_id"]])
sales_train["cat_id"] = cat_encoder.transform(sales_train[["cat_id"]])
sales_train["store_id"] = store_encoder.transform(sales_train[["store_id"]])
sales_train["state_id"] = state_encoder.transform(sales_train[["state_id"]])

In [6]:
# Encode the price table's join keys with the same encoders as the sales
# table so the two can be merged on the integer codes. Plain column assignment
# replaces the column (on pandas >= 2.0 `.loc[:, col] = ...` writes in place
# and would keep the original object dtype).
sell_prices["store_id"] = store_encoder.transform(sell_prices[["store_id"]])
sell_prices["item_id"] = item_encoder.transform(sell_prices[["item_id"]])

In [7]:
# Build the long-format sales-value table:
#   1. unpivot the d_1 .. d_1941 columns into (d, q) rows,
#   2. attach the calendar date and week key for each day,
#   3. attach the weekly sell price per (store, item),
#   4. drop rows with any missing value (e.g. no price matched),
#   5. compute sales = q * sell_price and keep only keys, date (`ds`), sales.
data = (
    sales_train
    .melt(
        id_vars=["ts_id","item_id","dept_id","cat_id","store_id","state_id"],
        value_vars=[f"d_{i}" for i in range(1,1942)],
        var_name="d",
        value_name="q",
    )
    .merge(calendar[["d","date","wm_yr_wk"]], how="left", on="d")
    .merge(sell_prices, how="left", on=["store_id","item_id","wm_yr_wk"])
    .dropna()
    .assign(sales=lambda frame: frame.eval("q*sell_price"))
    .drop(["d", "wm_yr_wk","q","sell_price"], axis=1)
    .rename({"date":"ds"}, axis=1)
    .reset_index(drop=True)
)
data = reduce_mem_usage(data)

In [8]:
# Persist the long-format sales table; the row index is not written.
data.to_parquet(
    "../input/weighting_input.parquet",
    index=False,
)

***
### weights for level 12: item_id, store_id

In [112]:
# Level 12 rows of the reference weights, keyed by item_id and store_id.
weights_level12 = (
    weights_validation
    .loc[weights_validation["Level_id"] == "Level12"]
    .drop(columns="Level_id")
    .rename(columns={"Agg_Level_1":"item_id", "Agg_Level_2":"store_id", "Weight":"weight"})
)
weights_level12.head()

Unnamed: 0,item_id,store_id,weight
12350,FOODS_1_001,CA_1,1.97e-05
12351,FOODS_1_001,CA_2,1.85e-05
12352,FOODS_1_001,CA_3,1.43e-05
12353,FOODS_1_001,CA_4,5.38e-06
12354,FOODS_1_001,TX_1,5.98e-07


In [113]:
# Replace the string labels with the integer codes of the fitted encoders.
weights_level12["item_id"] = item_encoder.transform(weights_level12[["item_id"]])
weights_level12["store_id"] = store_encoder.transform(weights_level12[["store_id"]])
weights_level12.head()

Unnamed: 0,item_id,store_id,weight
12350,1613,1,1.97e-05
12351,1613,2,1.85e-05
12352,1613,3,1.43e-05
12353,1613,4,5.38e-06
12354,1613,5,5.98e-07


In [114]:
# Persist the encoded level-12 weights; the row index is not written.
weights_level12.to_parquet(
    "../input/weights_level12.parquet",
    index=False,
)

***
### weights for level 11: item_id, state_id

In [115]:
# Level 11 rows of the reference weights, keyed by state_id and item_id.
weights_level11 = (
    weights_validation
    .loc[weights_validation["Level_id"] == "Level11"]
    .drop(columns="Level_id")
    .rename(columns={"Agg_Level_1":"state_id", "Agg_Level_2":"item_id", "Weight":"weight"})
)
weights_level11.head()

Unnamed: 0,state_id,item_id,weight
3203,CA,FOODS_1_001,5.8e-05
3204,CA,FOODS_1_002,0.000121
3205,CA,FOODS_1_003,7.8e-05
3206,CA,FOODS_1_004,0.0
3207,CA,FOODS_1_005,0.000296


In [116]:
# Replace the string labels with the integer codes of the fitted encoders.
weights_level11["item_id"] = item_encoder.transform(weights_level11[["item_id"]])
weights_level11["state_id"] = state_encoder.transform(weights_level11[["state_id"]])
weights_level11.head()

Unnamed: 0,state_id,item_id,weight
3203,1,1613,5.8e-05
3204,1,1614,0.000121
3205,1,1615,7.8e-05
3206,1,1616,0.0
3207,1,1617,0.000296


In [117]:
# Persist the encoded level-11 weights; the row index is not written.
weights_level11.to_parquet(
    "../input/weights_level11.parquet",
    index=False,
)

***
### weights for level 10: item_id

In [118]:
# Level 10 rows of the reference weights, keyed by item_id only; the second
# aggregation column is dropped.
weights_level10 = (
    weights_validation
    .loc[weights_validation["Level_id"] == "Level10"]
    .drop(columns=["Level_id", "Agg_Level_2"])
    .rename(columns={"Agg_Level_1":"item_id", "Weight":"weight"})
)
weights_level10.head()

Unnamed: 0,item_id,weight
154,FOODS_1_001,9.6e-05
155,FOODS_1_002,0.000273
156,FOODS_1_003,0.000124
157,FOODS_1_004,0.0
158,FOODS_1_005,0.00052


In [119]:
# Replace the item labels with the integer codes of the fitted encoder.
weights_level10["item_id"] = item_encoder.transform(weights_level10[["item_id"]])
weights_level10.head()

Unnamed: 0,item_id,weight
154,1613,9.6e-05
155,1614,0.000273
156,1615,0.000124
157,1616,0.0
158,1617,0.00052


In [120]:
# Persist the encoded level-10 weights; the row index is not written.
weights_level10.to_parquet(
    "../input/weights_level10.parquet",
    index=False,
)

***
### weights for level 9: store_id, dept_id

In [121]:
# Level 9 rows of the reference weights, keyed by store_id and dept_id.
weights_level9 = (
    weights_validation
    .loc[weights_validation["Level_id"] == "Level9"]
    .drop(columns="Level_id")
    .rename(columns={"Agg_Level_1":"store_id", "Agg_Level_2":"dept_id", "Weight":"weight"})
)
weights_level9.head()

Unnamed: 0,store_id,dept_id,weight
84,CA_1,FOODS_1,0.005208
85,CA_1,FOODS_2,0.014867
86,CA_1,FOODS_3,0.041885
87,CA_1,HOBBIES_1,0.017208
88,CA_1,HOBBIES_2,0.000602


In [122]:
# Replace the string labels with the integer codes of the fitted encoders.
weights_level9["store_id"] = store_encoder.transform(weights_level9[["store_id"]])
weights_level9["dept_id"] = dept_encoder.transform(weights_level9[["dept_id"]])
weights_level9.head()

Unnamed: 0,store_id,dept_id,weight
84,1,5,0.005208
85,1,6,0.014867
86,1,7,0.041885
87,1,1,0.017208
88,1,2,0.000602


In [123]:
# Persist the encoded level-9 weights; the row index is not written.
weights_level9.to_parquet(
    "../input/weights_level9.parquet",
    index=False,
)

***
### weights for level 8: store_id, cat_id

In [124]:
# Level 8 rows of the reference weights, keyed by store_id and cat_id.
weights_level8 = (
    weights_validation
    .loc[weights_validation["Level_id"] == "Level8"]
    .drop(columns="Level_id")
    .rename(columns={"Agg_Level_1":"store_id", "Agg_Level_2":"cat_id", "Weight":"weight"})
)
weights_level8.head()

Unnamed: 0,store_id,cat_id,weight
54,CA_1,FOODS,0.061961
55,CA_1,HOBBIES,0.01781
56,CA_1,HOUSEHOLD,0.031118
57,CA_2,FOODS,0.060341
58,CA_2,HOBBIES,0.012275


In [125]:
# Replace the string labels with the integer codes of the fitted encoders.
weights_level8["store_id"] = store_encoder.transform(weights_level8[["store_id"]])
weights_level8["cat_id"] = cat_encoder.transform(weights_level8[["cat_id"]])
weights_level8.head()

Unnamed: 0,store_id,cat_id,weight
54,1,3,0.061961
55,1,1,0.01781
56,1,2,0.031118
57,2,3,0.060341
58,2,1,0.012275


In [126]:
# Persist the encoded level-8 weights; the row index is not written.
weights_level8.to_parquet(
    "../input/weights_level8.parquet",
    index=False,
)

***
### weights for level 7: state_id, dept_id

In [127]:
# Level 7 rows of the reference weights, keyed by state_id and dept_id.
weights_level7 = (
    weights_validation
    .loc[weights_validation["Level_id"] == "Level7"]
    .drop(columns="Level_id")
    .rename(columns={"Agg_Level_1":"state_id", "Agg_Level_2":"dept_id", "Weight":"weight"})
)
weights_level7.head()

Unnamed: 0,state_id,dept_id,weight
33,CA,FOODS_1,0.027055
34,CA,FOODS_2,0.057655
35,CA,FOODS_3,0.156035
36,CA,HOBBIES_1,0.056463
37,CA,HOBBIES_2,0.002391


In [128]:
# Replace the string labels with the integer codes of the fitted encoders.
weights_level7["state_id"] = state_encoder.transform(weights_level7[["state_id"]])
weights_level7["dept_id"] = dept_encoder.transform(weights_level7[["dept_id"]])
weights_level7.head()

Unnamed: 0,state_id,dept_id,weight
33,1,5,0.027055
34,1,6,0.057655
35,1,7,0.156035
36,1,1,0.056463
37,1,2,0.002391


In [129]:
# Persist the encoded level-7 weights; the row index is not written.
weights_level7.to_parquet(
    "../input/weights_level7.parquet",
    index=False,
)

***
### weights for level 6: state_id, cat_id

In [130]:
# Level 6 rows of the reference weights, keyed by state_id and cat_id.
weights_level6 = (
    weights_validation
    .loc[weights_validation["Level_id"] == "Level6"]
    .drop(columns="Level_id")
    .rename(columns={"Agg_Level_1":"state_id", "Agg_Level_2":"cat_id", "Weight":"weight"})
)
weights_level6.head()

Unnamed: 0,state_id,cat_id,weight
24,CA,FOODS,0.240745
25,CA,HOBBIES,0.058855
26,CA,HOUSEHOLD,0.142772
27,TX,FOODS,0.141583
28,TX,HOBBIES,0.041293


In [131]:
# Replace the string labels with the integer codes of the fitted encoders.
weights_level6["state_id"] = state_encoder.transform(weights_level6[["state_id"]])
weights_level6["cat_id"] = cat_encoder.transform(weights_level6[["cat_id"]])
weights_level6.head()

Unnamed: 0,state_id,cat_id,weight
24,1,3,0.240745
25,1,1,0.058855
26,1,2,0.142772
27,2,3,0.141583
28,2,1,0.041293


In [132]:
# Persist the encoded level-6 weights; the row index is not written.
weights_level6.to_parquet(
    "../input/weights_level6.parquet",
    index=False,
)

***
### weights for level 5: dept_id

In [133]:
# Peek at the raw level-5 rows before reshaping them.
weights_validation.loc[weights_validation["Level_id"] == "Level5"].head(10)

Unnamed: 0,Level_id,Agg_Level_1,Agg_Level_2,Weight
17,Level5,FOODS_1,X,0.062625
18,Level5,FOODS_2,X,0.154642
19,Level5,FOODS_3,X,0.351318
20,Level5,HOBBIES_1,X,0.122088
21,Level5,HOBBIES_2,X,0.005991
22,Level5,HOUSEHOLD_1,X,0.229594
23,Level5,HOUSEHOLD_2,X,0.073741


In [134]:
# Level 5 rows of the reference weights, keyed by dept_id only; the second
# aggregation column is dropped.
weights_level5 = (
    weights_validation
    .loc[weights_validation["Level_id"] == "Level5"]
    .drop(columns=["Level_id","Agg_Level_2"])
    .rename(columns={"Agg_Level_1":"dept_id", "Weight":"weight"})
)
weights_level5.head(10)

Unnamed: 0,dept_id,weight
17,FOODS_1,0.062625
18,FOODS_2,0.154642
19,FOODS_3,0.351318
20,HOBBIES_1,0.122088
21,HOBBIES_2,0.005991
22,HOUSEHOLD_1,0.229594
23,HOUSEHOLD_2,0.073741


In [135]:
# Replace the department labels with the integer codes of the fitted encoder.
weights_level5["dept_id"] = dept_encoder.transform(weights_level5[["dept_id"]])
weights_level5.head()

Unnamed: 0,dept_id,weight
17,5,0.062625
18,6,0.154642
19,7,0.351318
20,1,0.122088
21,2,0.005991


In [136]:
# Persist the encoded level-5 weights; the row index is not written.
weights_level5.to_parquet(
    "../input/weights_level5.parquet",
    index=False,
)

***
### weights for level 4: cat_id

In [137]:
# Level 4 rows of the reference weights, keyed by cat_id only; the second
# aggregation column is dropped.
weights_level4 = (
    weights_validation
    .loc[weights_validation["Level_id"] == "Level4"]
    .drop(columns=["Level_id","Agg_Level_2"])
    .rename(columns={"Agg_Level_1":"cat_id", "Weight":"weight"})
)
weights_level4.head()

Unnamed: 0,cat_id,weight
14,FOODS,0.568586
15,HOBBIES,0.128079
16,HOUSEHOLD,0.303335


In [138]:
# Replace the category labels with the integer codes of the fitted encoder.
weights_level4["cat_id"] = cat_encoder.transform(weights_level4[["cat_id"]])
weights_level4.head()

Unnamed: 0,cat_id,weight
14,3,0.568586
15,1,0.128079
16,2,0.303335


In [139]:
# Persist the encoded level-4 weights; the row index is not written.
weights_level4.to_parquet(
    "../input/weights_level4.parquet",
    index=False,
)

***
### weights for level 3: store_id

In [140]:
# Level 3 rows of the reference weights, keyed by store_id only; the second
# aggregation column is dropped.
weights_level3 = (
    weights_validation
    .loc[weights_validation["Level_id"] == "Level3"]
    .drop(columns=["Level_id","Agg_Level_2"])
    .rename(columns={"Agg_Level_1":"store_id", "Weight":"weight"})
)
weights_level3.head()

Unnamed: 0,store_id,weight
4,CA_1,0.110888
5,CA_2,0.110247
6,CA_3,0.155628
7,CA_4,0.065608
8,TX_1,0.077561


In [141]:
# Replace the store labels with the integer codes of the fitted encoder.
weights_level3["store_id"] = store_encoder.transform(weights_level3[["store_id"]])
weights_level3.head()

Unnamed: 0,store_id,weight
4,1,0.110888
5,2,0.110247
6,3,0.155628
7,4,0.065608
8,5,0.077561


In [142]:
# Persist the encoded level-3 weights; the row index is not written.
weights_level3.to_parquet(
    "../input/weights_level3.parquet",
    index=False,
)

***
### weights for level 2: state_id

In [143]:
# Level 2 rows of the reference weights, keyed by state_id only; the second
# aggregation column is dropped.
weights_level2 = (
    weights_validation
    .loc[weights_validation["Level_id"] == "Level2"]
    .drop(columns=["Level_id","Agg_Level_2"])
    .rename(columns={"Agg_Level_1":"state_id", "Weight":"weight"})
)
weights_level2.head()

Unnamed: 0,state_id,weight
1,CA,0.442371
2,TX,0.269297
3,WI,0.288332


In [144]:
# Replace the state labels with the integer codes of the fitted encoder.
weights_level2["state_id"] = state_encoder.transform(weights_level2[["state_id"]])
weights_level2.head()

Unnamed: 0,state_id,weight
1,1,0.442371
2,2,0.269297
3,3,0.288332


In [145]:
# Persist the encoded level-2 weights; the row index is not written.
weights_level2.to_parquet(
    "../input/weights_level2.parquet",
    index=False,
)

***