## **Preparing the dataset LEGO catalog for merging**

 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

### **INDEX**
1. Setup Notebook 
2. Explore data set
3. Query table
4. Export data

 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

### **1. SETUP NOTEBOOK**

In [7]:
# import libraries
import pandas as pd
import numpy as np
import os
from pathlib import Path

In [8]:
# get the current working directory as a Path object
path = os.getcwd()

# update the path to parent folder
path = os.path.abspath(os.path.join(path, os.pardir))

In [9]:
# import data set LEGO catalog
df_col = pd.read_csv(os.path.join(path, '2 Data', 'Original Data', 'colors.csv'))
df_elm = pd.read_csv(os.path.join(path, '2 Data', 'Original Data', 'elements.csv'))
df_inv = pd.read_csv(os.path.join(path, '2 Data', 'Original Data', 'inventories.csv'))
df_invparts = pd.read_csv(os.path.join(path, '2 Data', 'Original Data', 'inventory_parts.csv'))
df_invsets = pd.read_csv(os.path.join(path, '2 Data', 'Original Data', 'inventory_sets.csv'))
df_mf = pd.read_csv(os.path.join(path, '2 Data', 'Original Data', 'minifigs.csv'))
df_partcat = pd.read_csv(os.path.join(path, '2 Data', 'Original Data', 'part_categories.csv'))
df_partrel = pd.read_csv(os.path.join(path, '2 Data', 'Original Data', 'part_relationships.csv'))
df_parts = pd.read_csv(os.path.join(path, '2 Data', 'Original Data', 'parts.csv'))
df_sets = pd.read_csv(os.path.join(path, '2 Data', 'Original Data', 'sets.csv'))
df_themes = pd.read_csv(os.path.join(path, '2 Data', 'Original Data', 'themes.csv'))

 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

### **2. EXPLORE DATA SET LEGO CATALOG**

In [12]:
df_sets.shape

(23969, 6)

In [13]:
df_invparts.shape

(1287021, 6)

In [14]:
df_parts.head()

Unnamed: 0,part_num,name,part_cat_id,part_material
0,3381,Sticker Sheet for Set 663-1,58,Plastic
1,3383,"Sticker Sheet for Sets 618-1, 628-2",58,Plastic
2,3402,"Sticker Sheet for Sets 310-3, 311-1, 312-3",58,Plastic
3,3429,Sticker Sheet for Set 1550-1,58,Plastic
4,3432,"Sticker Sheet for Sets 357-1, 355-1, 940-1",58,Plastic


In [15]:
df_sets.head()

Unnamed: 0,set_num,name,year,theme_id,num_parts,img_url
0,0003977811-1,Ninjago: Book of Adventures,2022,761,1,https://cdn.rebrickable.com/media/sets/0003977...
1,001-1,Gears,1965,756,43,https://cdn.rebrickable.com/media/sets/001-1.jpg
2,0011-2,Town Mini-Figures,1979,67,12,https://cdn.rebrickable.com/media/sets/0011-2.jpg
3,0011-3,Castle 2 for 1 Bonus Offer,1987,199,0,https://cdn.rebrickable.com/media/sets/0011-3.jpg
4,0012-1,Space Mini-Figures,1979,143,12,https://cdn.rebrickable.com/media/sets/0012-1.jpg


In [16]:
df_invsets.head()

Unnamed: 0,inventory_id,set_num,quantity
0,35,75911-1,1
1,35,75912-1,1
2,39,75048-1,1
3,39,75053-1,1
4,50,4515-1,1


In [17]:
df_invparts.tail()

Unnamed: 0,inventory_id,part_num,color_id,quantity,is_spare,img_url
1287016,242825,63868,72,2,f,https://cdn.rebrickable.com/media/parts/elemen...
1287017,242825,65578,71,2,f,https://cdn.rebrickable.com/media/parts/elemen...
1287018,242825,66906,0,1,f,https://cdn.rebrickable.com/media/parts/elemen...
1287019,242825,73562,5,1,f,https://cdn.rebrickable.com/media/parts/elemen...
1287020,242825,98138pr0370,191,2,f,https://cdn.rebrickable.com/media/parts/elemen...


In [18]:
df_partcat[df_partcat['id'] == 21]

Unnamed: 0,id,name
18,21,Plates Round Curved and Dishes


In [19]:
df_parts['part_material'].unique()

array(['Plastic', 'Rubber', 'Cardboard/Paper', 'Cloth',
       'Flexible Plastic', 'Metal', 'Foam'], dtype=object)

 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

### **3.QUERY TABLE**

**Preprare sets to become baseline table and clean it to keep lego sets only**

In [23]:
#perfom mandatory duplicated and missing value check
df_sets.duplicated().sum()

0

In [24]:
df_sets.isnull().sum()

set_num      0
name         0
year         0
theme_id     0
num_parts    0
img_url      0
dtype: int64

In [25]:
#considering  set that contains 0 parts as non-set LEGO product. 
df_sets_pos = df_sets[df_sets['num_parts'] > 0]

In [26]:
#check new number of rows
f"Number of rows in sets: {df_sets.shape[0]} and number of rows on filtered sets {df_sets_pos.shape[0]}"

'Number of rows in sets: 23969 and number of rows on filtered sets 18012'

In [27]:
#confirm num_parts 0 starts at 1
df_sets_pos.sort_values(by='num_parts', ascending=True).head()

Unnamed: 0,set_num,name,year,theme_id,num_parts,img_url
0,0003977811-1,Ninjago: Book of Adventures,2022,761,1,https://cdn.rebrickable.com/media/sets/0003977...
13025,620-3,Blue Building Plate 32 x 32,2010,35,1,https://cdn.rebrickable.com/media/sets/620-3.jpg
13039,6213-1,Replacement Gearbox for Electric Motor,1985,456,1,https://cdn.rebrickable.com/media/sets/6213-1.jpg
13115,626-1,"Baseplate, Green",1996,473,1,https://cdn.rebrickable.com/media/sets/626-1.jpg
13129,627-1,"Baseplate, Blue",1996,473,1,https://cdn.rebrickable.com/media/sets/627-1.jpg


In [28]:
#identify categories that are not sets and clean them from sets df
df_themes[['id', 'name']].drop_duplicates().sort_values('id')

Unnamed: 0,id,name
0,1,Technic
1,3,Competition
2,4,Expert Builder
3,16,RoboRiders
4,17,Speed Slammers
...,...,...
461,764,The Legend of Zelda
462,765,Wicked
463,766,Fortnite
464,767,Dungeons & Dragons


> drop theme_id: 440, 441, 501, 502, 503, 504, 516, 523, 526, 734, 735, 736, 737, 739, 740, 741, 742, 756, 757, 758, 759, 760, 761

In [30]:
#create collection of theme_id for simple drop
dropped_themes = 440, 441, 501, 502, 503, 504, 516, 523, 526, 734, 735, 736, 737, 739, 740, 741, 742, 756, 757, 758, 759, 760, 761

In [31]:
df_setsonly= df_sets_pos[~df_sets_pos['theme_id'].isin(dropped_themes)]

In [32]:
df_setsonly

Unnamed: 0,set_num,name,year,theme_id,num_parts,img_url
2,0011-2,Town Mini-Figures,1979,67,12,https://cdn.rebrickable.com/media/sets/0011-2.jpg
4,0012-1,Space Mini-Figures,1979,143,12,https://cdn.rebrickable.com/media/sets/0012-1.jpg
5,0013-1,Space Mini-Figures,1979,143,12,https://cdn.rebrickable.com/media/sets/0013-1.jpg
6,0014-1,Space Mini-Figures,1979,143,2,https://cdn.rebrickable.com/media/sets/0014-1.jpg
7,0015-1,Space Mini-Figures,1979,143,18,https://cdn.rebrickable.com/media/sets/0015-1.jpg
...,...,...,...,...,...,...
23952,WIZARD-PORTRAITS-1,Wizard Portraits Tiles,2024,746,14,https://cdn.rebrickable.com/media/sets/wizard-...
23961,XMASTREE-1,Christmas Tree,2019,410,26,https://cdn.rebrickable.com/media/sets/xmastre...
23962,XWING-1,Mini X-Wing Fighter,2019,158,60,https://cdn.rebrickable.com/media/sets/xwing-1...
23963,XWING-2,X-Wing Trench Run,2019,158,52,https://cdn.rebrickable.com/media/sets/xwing-2...


In [33]:
#check new number of rows
f"Number of rows in sets: {df_sets_pos.shape[0]} and number of rows on filtered sets {df_setsonly.shape}"

'Number of rows in sets: 18012 and number of rows on filtered sets (16493, 6)'

 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

**CREATE AGGREGATED VALUES**

In [36]:
#count the total number of True values of is_spare to get the sum of spare parts per inventory
#convert data type
df_invparts['is_spare'] = df_invparts['is_spare'].astype(bool)

#count true values of data type summed by inventory
df_spares_count = df_invparts.groupby('inventory_id').agg(sum_spares=('is_spare', 'sum')).reset_index()

In [37]:
df_unique_colors = df_invparts.groupby('inventory_id').agg(num_colors=('color_id', 'nunique')).reset_index()

In [38]:
#merging tables to build connection between part_num and part_material
df_unique_materials = pd.merge(df_invparts, df_parts, on='part_num', how='left')

In [39]:
df_tot_materials = df_unique_materials.groupby('inventory_id').agg(num_materials=('part_material', 'nunique')).reset_index()

 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

In [41]:
#create table for aggregated columns
df_agg_columns = pd.merge(df_unique_colors, df_tot_materials, on='inventory_id', how='left')
df_agg_columns = pd.merge(df_agg_columns, df_spares_count, on='inventory_id', how='left')

In [42]:
df_agg_columns

Unnamed: 0,inventory_id,num_colors,num_materials,sum_spares
0,1,5,1,5
1,3,9,1,25
2,4,4,1,12
3,15,1,1,2
4,16,2,1,4
...,...,...,...,...
34378,242776,2,1,8
34379,242813,14,2,67
34380,242822,6,1,22
34381,242823,9,1,28


In [43]:
#creating base table for set_num and inventory_id 
df_base = pd.merge(df_inv, df_setsonly, on='set_num', how='left')

In [44]:
#baseline for set_num 
df_setsonly.shape

(16493, 6)

In [45]:
df_base

Unnamed: 0,id,version,set_num,name,year,theme_id,num_parts,img_url
0,1,1,7922-1,McDonald's Sports Set Number 6 - Orange Vest S...,2004.0,460.0,5.0,https://cdn.rebrickable.com/media/sets/7922-1.jpg
1,3,1,3931-1,Emma's Splash Pool,2012.0,494.0,43.0,https://cdn.rebrickable.com/media/sets/3931-1.jpg
2,4,1,6942-1,Zo Weevil,1999.0,134.0,20.0,https://cdn.rebrickable.com/media/sets/6942-1.jpg
3,15,1,5158-1,"T-Junction, Circle Plates",1990.0,443.0,2.0,https://cdn.rebrickable.com/media/sets/5158-1.jpg
4,16,1,903-1,Train Wheels and Couplers,1969.0,371.0,10.0,https://cdn.rebrickable.com/media/sets/903-1.jpg
...,...,...,...,...,...,...,...,...
40559,242813,3,9786-1,Robo Technology Set [USB Cable],2003.0,520.0,221.0,https://cdn.rebrickable.com/media/sets/9786-1.jpg
40560,242822,1,fig-015530,,,,,
40561,242823,1,fig-015531,,,,,
40562,242825,1,fig-015532,,,,,


In [46]:
#dropping columns
df_base = df_base.drop(columns=['version', 'name', 'year', 'img_url'])

In [47]:
#rename id for clarity to inventory_id
df_base = df_base.rename(columns={'id': 'inventory_id'})

In [48]:
#check data types for merge
df_base.dtypes

inventory_id      int64
set_num          object
theme_id        float64
num_parts       float64
dtype: object

In [49]:
df_agg_columns.dtypes

inventory_id     int64
num_colors       int64
num_materials    int64
sum_spares       int64
dtype: object

In [50]:
df_base.shape

(40564, 4)

In [51]:
df_agg_columns.shape

(34383, 4)

In [52]:
#merging filtered variables with aggregated variables
df_catalog_sub = pd.merge(df_base, df_agg_columns, on='inventory_id', how='left')

In [53]:
#check total number of rows
df_catalog_sub.shape

(40564, 7)

In [54]:
#check for duplicates
df_catalog_sub.duplicated().sum()

0

In [55]:
#check for missing values
df_catalog_sub.isnull().sum()

inventory_id         0
set_num              0
theme_id         22414
num_parts        22414
num_colors        6181
num_materials     6181
sum_spares        6181
dtype: int64

> I will keep the rows with NaNs and reassess after merging, since the LEGO sales table is the primary table and might already filter out the missing values. 

 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

### **4. EXPORT DATA**

In [59]:
#export to csv
df_catalog_sub.to_csv(os.path.join(path, '2 Data','Prepared data', 'lego_catalog_sub.csv'))