In [1]:
import pandas as pd
from category_encoders.ordinal import OrdinalEncoder

In [2]:
# Load the two input tables used by the rest of the notebook:
#  - sales_train: source of the item/dept/cat/store/state id columns
#  - weights_validation: one row per (Level_id, Agg_Level_1, Agg_Level_2) with a Weight
sales_train = pd.read_csv("../input/sales_train_validation.csv")
weights_validation = pd.read_csv("../input/weights_validation.csv")

***

In [3]:
# Distinct combinations of the five id columns found in the sales data.
hierarchy = sales_train[
    ["item_id", "dept_id", "cat_id", "store_id", "state_id"]
].drop_duplicates()

In [4]:
# Fit one OrdinalEncoder per id column of `hierarchy` and replace the string
# labels with their integer codes. The fitted encoders are reused below to
# encode the id columns of every weights table.
encoders_by_column = {}
for column in ["item_id", "dept_id", "cat_id", "store_id", "state_id"]:
    # Fitting on an already-encoded column would yield an encoder that no
    # longer knows the string labels, so fail loudly if this cell is re-run
    # without rebuilding `hierarchy` first.
    if pd.api.types.is_numeric_dtype(hierarchy[column]):
        raise RuntimeError(
            f"hierarchy[{column!r}] is already numeric; re-run the cell that "
            "builds `hierarchy` before fitting the encoders again"
        )

    encoder = OrdinalEncoder()
    encoder.fit(hierarchy.loc[:, [column]])
    codes = encoder.transform(hierarchy.loc[:, [column]])[column]

    # assign() swaps in the whole column, so it takes the dtype of the codes
    # instead of writing them into the existing string column.
    hierarchy = hierarchy.assign(**{column: codes})
    encoders_by_column[column] = encoder

# Keep the individual names that the rest of the notebook refers to.
item_encoder = encoders_by_column["item_id"]
dept_encoder = encoders_by_column["dept_id"]
cat_encoder = encoders_by_column["cat_id"]
store_encoder = encoders_by_column["store_id"]
state_encoder = encoders_by_column["state_id"]

***
### weights for level 12: item_id, store_id

In [5]:
# Level 12 rows: Agg_Level_1 becomes item_id and Agg_Level_2 becomes store_id.
weights_level12 = (
    weights_validation
    .query("Level_id == 'Level12'")
    .rename(columns={"Agg_Level_1": "item_id", "Agg_Level_2": "store_id", "Weight": "weight"})
    .drop(columns="Level_id")
)

In [6]:
# Inspect the level-12 weights while the ids are still string labels.
weights_level12

Unnamed: 0,item_id,store_id,weight
12350,FOODS_1_001,CA_1,1.970000e-05
12351,FOODS_1_001,CA_2,1.850000e-05
12352,FOODS_1_001,CA_3,1.430000e-05
12353,FOODS_1_001,CA_4,5.380000e-06
12354,FOODS_1_001,TX_1,5.980000e-07
...,...,...,...
42835,HOUSEHOLD_2_516,TX_2,1.270000e-05
42836,HOUSEHOLD_2_516,TX_3,7.920000e-06
42837,HOUSEHOLD_2_516,WI_1,1.580000e-06
42838,HOUSEHOLD_2_516,WI_2,1.580000e-06


In [7]:
# Encode the string ids with the encoders fitted on the sales hierarchy.
# Codes are computed and validated before anything is written back, so a
# failed check leaves weights_level12 untouched.
item_codes = item_encoder.transform(weights_level12.loc[:, ["item_id"]])["item_id"]
store_codes = store_encoder.transform(weights_level12.loc[:, ["store_id"]])["store_id"]

# OrdinalEncoder maps labels it did not see during fit to a negative sentinel
# instead of raising (default handle_unknown="value"); the same happens when
# this cell is re-run on ids that are already encoded.
assert (item_codes > 0).all(), "unknown item_id in weights_level12 (or cell re-run on encoded ids)"
assert (store_codes > 0).all(), "unknown store_id in weights_level12 (or cell re-run on encoded ids)"

weights_level12["item_id"] = item_codes
weights_level12["store_id"] = store_codes

In [8]:
# Persist the encoded level-12 weights; index=False leaves the row index out of the file.
weights_level12.to_parquet("../input/weights_level12.parquet", index=False)

***
### weights for level 9: store_id, dept_id

In [9]:
# Peek at the raw level-9 rows to see what Agg_Level_1 / Agg_Level_2 hold.
weights_validation.query("Level_id == 'Level9'").head()

Unnamed: 0,Level_id,Agg_Level_1,Agg_Level_2,Weight
84,Level9,CA_1,FOODS_1,0.005208
85,Level9,CA_1,FOODS_2,0.014867
86,Level9,CA_1,FOODS_3,0.041885
87,Level9,CA_1,HOBBIES_1,0.017208
88,Level9,CA_1,HOBBIES_2,0.000602


In [10]:
# Level 9 rows: Agg_Level_1 becomes store_id and Agg_Level_2 becomes dept_id.
weights_level9 = (
    weights_validation
    .query("Level_id == 'Level9'")
    .rename(columns={"Agg_Level_1": "store_id", "Agg_Level_2": "dept_id", "Weight": "weight"})
    .drop(columns="Level_id")
)

In [11]:
# Check the renamed columns while the ids are still string labels.
weights_level9.head()

Unnamed: 0,store_id,dept_id,weight
84,CA_1,FOODS_1,0.005208
85,CA_1,FOODS_2,0.014867
86,CA_1,FOODS_3,0.041885
87,CA_1,HOBBIES_1,0.017208
88,CA_1,HOBBIES_2,0.000602


In [12]:
# Encode the string ids with the encoders fitted on the sales hierarchy.
# Codes are computed and validated before anything is written back, so a
# failed check leaves weights_level9 untouched.
store_codes = store_encoder.transform(weights_level9.loc[:, ["store_id"]])["store_id"]
dept_codes = dept_encoder.transform(weights_level9.loc[:, ["dept_id"]])["dept_id"]

# OrdinalEncoder maps labels it did not see during fit to a negative sentinel
# instead of raising (default handle_unknown="value"); the same happens when
# this cell is re-run on ids that are already encoded.
assert (store_codes > 0).all(), "unknown store_id in weights_level9 (or cell re-run on encoded ids)"
assert (dept_codes > 0).all(), "unknown dept_id in weights_level9 (or cell re-run on encoded ids)"

weights_level9["store_id"] = store_codes
weights_level9["dept_id"] = dept_codes

In [13]:
# Persist the encoded level-9 weights; index=False leaves the row index out of the file.
weights_level9.to_parquet("../input/weights_level9.parquet", index=False)

***
### weights for level 8: store_id,cat_id

In [18]:
# Peek at the raw level-8 rows to see what Agg_Level_1 / Agg_Level_2 hold.
weights_validation.query("Level_id == 'Level8'").head()

Unnamed: 0,Level_id,Agg_Level_1,Agg_Level_2,Weight
54,Level8,CA_1,FOODS,0.061961
55,Level8,CA_1,HOBBIES,0.01781
56,Level8,CA_1,HOUSEHOLD,0.031118
57,Level8,CA_2,FOODS,0.060341
58,Level8,CA_2,HOBBIES,0.012275


In [19]:
# Level 8 rows: Agg_Level_1 becomes store_id and Agg_Level_2 becomes cat_id.
weights_level8 = (
    weights_validation
    .query("Level_id == 'Level8'")
    .rename(columns={"Agg_Level_1": "store_id", "Agg_Level_2": "cat_id", "Weight": "weight"})
    .drop(columns="Level_id")
)
weights_level8.head()

Unnamed: 0,store_id,cat_id,weight
54,CA_1,FOODS,0.061961
55,CA_1,HOBBIES,0.01781
56,CA_1,HOUSEHOLD,0.031118
57,CA_2,FOODS,0.060341
58,CA_2,HOBBIES,0.012275


In [20]:
# Encode the string ids with the encoders fitted on the sales hierarchy.
# Codes are computed and validated before anything is written back, so a
# failed check leaves weights_level8 untouched.
store_codes = store_encoder.transform(weights_level8.loc[:, ["store_id"]])["store_id"]
cat_codes = cat_encoder.transform(weights_level8.loc[:, ["cat_id"]])["cat_id"]

# OrdinalEncoder maps labels it did not see during fit to a negative sentinel
# instead of raising (default handle_unknown="value"); the same happens when
# this cell is re-run on ids that are already encoded.
assert (store_codes > 0).all(), "unknown store_id in weights_level8 (or cell re-run on encoded ids)"
assert (cat_codes > 0).all(), "unknown cat_id in weights_level8 (or cell re-run on encoded ids)"

weights_level8["store_id"] = store_codes
weights_level8["cat_id"] = cat_codes
weights_level8.head()

Unnamed: 0,store_id,cat_id,weight
54,1,3,0.061961
55,1,1,0.01781
56,1,2,0.031118
57,2,3,0.060341
58,2,1,0.012275


In [21]:
# Persist the encoded level-8 weights; index=False leaves the row index out of the file.
weights_level8.to_parquet("../input/weights_level8.parquet", index=False)

***
### weights for level 5: dept_id

In [35]:
# Peek at the raw level-5 rows; Agg_Level_2 is the placeholder "X" at this level.
weights_validation.query("Level_id == 'Level5'").head(10)

Unnamed: 0,Level_id,Agg_Level_1,Agg_Level_2,Weight
17,Level5,FOODS_1,X,0.062625
18,Level5,FOODS_2,X,0.154642
19,Level5,FOODS_3,X,0.351318
20,Level5,HOBBIES_1,X,0.122088
21,Level5,HOBBIES_2,X,0.005991
22,Level5,HOUSEHOLD_1,X,0.229594
23,Level5,HOUSEHOLD_2,X,0.073741


In [36]:
# Level 5 rows: Agg_Level_1 becomes dept_id; the unused Agg_Level_2 is dropped.
weights_level5 = (
    weights_validation
    .query("Level_id == 'Level5'")
    .rename(columns={"Agg_Level_1": "dept_id", "Weight": "weight"})
    .drop(columns=["Level_id", "Agg_Level_2"])
)
weights_level5.head(10)

Unnamed: 0,dept_id,weight
17,FOODS_1,0.062625
18,FOODS_2,0.154642
19,FOODS_3,0.351318
20,HOBBIES_1,0.122088
21,HOBBIES_2,0.005991
22,HOUSEHOLD_1,0.229594
23,HOUSEHOLD_2,0.073741


In [37]:
# Encode the string ids with the encoder fitted on the sales hierarchy.
# Codes are computed and validated before anything is written back, so a
# failed check leaves weights_level5 untouched.
dept_codes = dept_encoder.transform(weights_level5.loc[:, ["dept_id"]])["dept_id"]

# OrdinalEncoder maps labels it did not see during fit to a negative sentinel
# instead of raising (default handle_unknown="value"); the same happens when
# this cell is re-run on ids that are already encoded.
assert (dept_codes > 0).all(), "unknown dept_id in weights_level5 (or cell re-run on encoded ids)"

weights_level5["dept_id"] = dept_codes
weights_level5.head()

Unnamed: 0,dept_id,weight
17,5,0.062625
18,6,0.154642
19,7,0.351318
20,1,0.122088
21,2,0.005991


In [38]:
# Persist the encoded level-5 weights; index=False leaves the row index out of the file.
weights_level5.to_parquet("../input/weights_level5.parquet", index=False)

***
### weights for level 4: cat_id

In [22]:
# Peek at the raw level-4 rows; Agg_Level_2 is the placeholder "X" at this level.
weights_validation.query("Level_id == 'Level4'").head()

Unnamed: 0,Level_id,Agg_Level_1,Agg_Level_2,Weight
14,Level4,FOODS,X,0.568586
15,Level4,HOBBIES,X,0.128079
16,Level4,HOUSEHOLD,X,0.303335


In [23]:
# Level 4 rows: Agg_Level_1 becomes cat_id; the unused Agg_Level_2 is dropped.
weights_level4 = (
    weights_validation
    .query("Level_id == 'Level4'")
    .rename(columns={"Agg_Level_1": "cat_id", "Weight": "weight"})
    .drop(columns=["Level_id", "Agg_Level_2"])
)
weights_level4.head()

Unnamed: 0,cat_id,weight
14,FOODS,0.568586
15,HOBBIES,0.128079
16,HOUSEHOLD,0.303335


In [24]:
# Encode the string ids with the encoder fitted on the sales hierarchy.
# Codes are computed and validated before anything is written back, so a
# failed check leaves weights_level4 untouched.
cat_codes = cat_encoder.transform(weights_level4.loc[:, ["cat_id"]])["cat_id"]

# OrdinalEncoder maps labels it did not see during fit to a negative sentinel
# instead of raising (default handle_unknown="value"); the same happens when
# this cell is re-run on ids that are already encoded.
assert (cat_codes > 0).all(), "unknown cat_id in weights_level4 (or cell re-run on encoded ids)"

weights_level4["cat_id"] = cat_codes
weights_level4.head()

Unnamed: 0,cat_id,weight
14,3,0.568586
15,1,0.128079
16,2,0.303335


In [27]:
# Persist the encoded level-4 weights; index=False leaves the row index out of the file.
weights_level4.to_parquet("../input/weights_level4.parquet", index=False)

***
### weights for level 3: store_id

In [19]:
# Peek at the raw level-3 rows; Agg_Level_2 is the placeholder "X" at this level.
weights_validation.query("Level_id == 'Level3'").head()

Unnamed: 0,Level_id,Agg_Level_1,Agg_Level_2,Weight
4,Level3,CA_1,X,0.110888
5,Level3,CA_2,X,0.110247
6,Level3,CA_3,X,0.155628
7,Level3,CA_4,X,0.065608
8,Level3,TX_1,X,0.077561


In [20]:
# Level 3 rows: Agg_Level_1 becomes store_id; the unused Agg_Level_2 is dropped.
weights_level3 = (
    weights_validation
    .query("Level_id == 'Level3'")
    .rename(columns={"Agg_Level_1": "store_id", "Weight": "weight"})
    .drop(columns=["Level_id", "Agg_Level_2"])
)
weights_level3.head()

Unnamed: 0,store_id,weight
4,CA_1,0.110888
5,CA_2,0.110247
6,CA_3,0.155628
7,CA_4,0.065608
8,TX_1,0.077561


In [21]:
# Encode the string ids with the encoder fitted on the sales hierarchy.
# Codes are computed and validated before anything is written back, so a
# failed check leaves weights_level3 untouched.
store_codes = store_encoder.transform(weights_level3.loc[:, ["store_id"]])["store_id"]

# OrdinalEncoder maps labels it did not see during fit to a negative sentinel
# instead of raising (default handle_unknown="value"); the same happens when
# this cell is re-run on ids that are already encoded.
assert (store_codes > 0).all(), "unknown store_id in weights_level3 (or cell re-run on encoded ids)"

weights_level3["store_id"] = store_codes
weights_level3.head()

Unnamed: 0,store_id,weight
4,1,0.110888
5,2,0.110247
6,3,0.155628
7,4,0.065608
8,5,0.077561


In [22]:
# Persist the encoded level-3 weights; index=False leaves the row index out of the file.
weights_level3.to_parquet("../input/weights_level3.parquet", index=False)

***
### weights for level 2: state_id

In [15]:
# Peek at the raw level-2 rows; Agg_Level_2 is the placeholder "X" at this level.
weights_validation.query("Level_id == 'Level2'").head()

Unnamed: 0,Level_id,Agg_Level_1,Agg_Level_2,Weight
1,Level2,CA,X,0.442371
2,Level2,TX,X,0.269297
3,Level2,WI,X,0.288332


In [18]:
# Level 2 rows: Agg_Level_1 becomes state_id; the unused Agg_Level_2 is dropped.
weights_level2 = (
    weights_validation
    .query("Level_id == 'Level2'")
    .rename(columns={"Agg_Level_1": "state_id", "Weight": "weight"})
    .drop(columns=["Level_id", "Agg_Level_2"])
)
weights_level2.head()

Unnamed: 0,state_id,weight
1,CA,0.442371
2,TX,0.269297
3,WI,0.288332


In [19]:
# Encode the string ids with the encoder fitted on the sales hierarchy.
# Codes are computed and validated before anything is written back, so a
# failed check leaves weights_level2 untouched.
state_codes = state_encoder.transform(weights_level2.loc[:, ["state_id"]])["state_id"]

# OrdinalEncoder maps labels it did not see during fit to a negative sentinel
# instead of raising (default handle_unknown="value"); the same happens when
# this cell is re-run on ids that are already encoded.
assert (state_codes > 0).all(), "unknown state_id in weights_level2 (or cell re-run on encoded ids)"

weights_level2["state_id"] = state_codes
weights_level2.head()

Unnamed: 0,state_id,weight
1,1,0.442371
2,2,0.269297
3,3,0.288332


In [20]:
# Persist the encoded level-2 weights; index=False leaves the row index out of the file.
weights_level2.to_parquet("../input/weights_level2.parquet", index=False)

***