# Lego Inventory Creation Dataset
<div style="
    border: 5px solid purple;
    border-radius: 8px;
    padding: 0px;
    margin: 10px 0;
    background-color: inherit;
    color: inherit;
">
</div>

In [1]:
import pandas as pd
import numpy as np

### Importing data
<div style="
    border: 2px solid orange;
    border-radius: 8px;
    padding: 0px;
    margin: 10px 0;
    background-color: inherit;
    color: inherit;
">
</div>

In [2]:
# Source workbook: a Google Sheets document requested through its xlsx export URL.
# NOTE(review): the export is fetched on every run, so the data presumably changes
# whenever the sheet is edited - confirm if a pinned snapshot is needed.
url = "https://docs.google.com/spreadsheets/d/17o2TJJ3_pmrFsFNIdhxyPW3PFO0zyksoSVbrWzrDJoU/export?format=xlsx"

# Open the workbook inside a `with` block so the underlying handle is closed
# as soon as every sheet has been parsed.
with pd.ExcelFile(url, engine='openpyxl') as dataset:
    # Parse all sheets into a dictionary keyed by sheet name
    sheets = {sheet: dataset.parse(sheet) for sheet in dataset.sheet_names}

# Merge all sheets into one DataFrame; ignore_index=True renumbers the rows 0..n-1
lego_dataset = pd.concat(sheets.values(), ignore_index=True)

# Display the combined frame
lego_dataset

Unnamed: 0,id,color,is duplo?,size type,base shape,base dimensions,number of studs,has slope?,slope degree,in stock,transparent
0,,Dark Blue,True,Brick,Rectangle,2 x 4,8,False,,1,
1,,Green,True,Brick,Rectangle,2 x 4,8,False,,1,
2,,Coral,True,Brick,Square,2 x 2,4,False,,1,
3,,Red,True,Brick,Square,2 x 2,4,False,,1,
4,,White,True,Brick,Square,2 x 2,4,False,,1,
...,...,...,...,...,...,...,...,...,...,...,...
199,,Transparent orange,False,Plate,Square,1*1,0,False,0.0,1,
200,,Blck,False,Plate,Trapezium,2*2,4,False,0.0,1,
201,,Yellow,False,Plate,Rectangle,1*2,0,False,0.0,1,
202,,Transparent sky blue,False,Plate,Round,1*1,0,False,0.0,1,


### Data Cleaning
<div style="
    border: 2px solid orange;
    border-radius: 8px;
    padding: 0px;
    margin: 10px 0;
    background-color: inherit;
    color: inherit;
">
</div>

In [3]:
# Discard the 'transparent' column; every other column is kept unchanged.
lego_dataset = lego_dataset.drop(columns=['transparent'])
lego_dataset

Unnamed: 0,id,color,is duplo?,size type,base shape,base dimensions,number of studs,has slope?,slope degree,in stock
0,,Dark Blue,True,Brick,Rectangle,2 x 4,8,False,,1
1,,Green,True,Brick,Rectangle,2 x 4,8,False,,1
2,,Coral,True,Brick,Square,2 x 2,4,False,,1
3,,Red,True,Brick,Square,2 x 2,4,False,,1
4,,White,True,Brick,Square,2 x 2,4,False,,1
...,...,...,...,...,...,...,...,...,...,...
199,,Transparent orange,False,Plate,Square,1*1,0,False,0.0,1
200,,Blck,False,Plate,Trapezium,2*2,4,False,0.0,1
201,,Yellow,False,Plate,Rectangle,1*2,0,False,0.0,1
202,,Transparent sky blue,False,Plate,Round,1*1,0,False,0.0,1


In [4]:
# Fill the 'id' column with sequential ids 1..n, one per row in the current row order.
lego_dataset['id'] = range(1, lego_dataset.shape[0] + 1)
lego_dataset

Unnamed: 0,id,color,is duplo?,size type,base shape,base dimensions,number of studs,has slope?,slope degree,in stock
0,1,Dark Blue,True,Brick,Rectangle,2 x 4,8,False,,1
1,2,Green,True,Brick,Rectangle,2 x 4,8,False,,1
2,3,Coral,True,Brick,Square,2 x 2,4,False,,1
3,4,Red,True,Brick,Square,2 x 2,4,False,,1
4,5,White,True,Brick,Square,2 x 2,4,False,,1
...,...,...,...,...,...,...,...,...,...,...
199,200,Transparent orange,False,Plate,Square,1*1,0,False,0.0,1
200,201,Blck,False,Plate,Trapezium,2*2,4,False,0.0,1
201,202,Yellow,False,Plate,Rectangle,1*2,0,False,0.0,1
202,203,Transparent sky blue,False,Plate,Round,1*1,0,False,0.0,1


In [5]:
# Normalise the separator in "base dimensions": any of x, X, + or *, together with
# the whitespace around it, is replaced by a bare "*" (e.g. "2 x 4" -> "2*4").
lego_dataset["base dimensions"] = lego_dataset["base dimensions"].str.replace(r'\s*([xX\+*])\s*', '*', regex=True)
# Spot-check a random sample of rows. The fixed random_state keeps the displayed
# rows identical on every Restart & Run All instead of changing per execution.
lego_dataset.sample(20, random_state=42)

Unnamed: 0,id,color,is duplo?,size type,base shape,base dimensions,number of studs,has slope?,slope degree,in stock
118,119,cream,no,plate,rectangle,2*1,2,no,0.0,1
175,176,Blue,False,Brick,Rectangle,2*6,12,False,0.0,1
59,60,lightpink,no,brick,rectangle,1*2,2,no,0.0,yes
104,105,bright yellow,no,plate,rectangle,2*1,2,no,0.0,1
65,66,black,no,plate,square,1*2,2,no,0.0,yes
15,16,Lilac,False,Brick,Square,1*2,2,True,45.0,1
128,129,blue,1,brick,rectangle,2*4,8,0,0.0,1
35,36,Olive Green,False,Plate,Rectangle,1*2,2,False,,1
172,173,White,False,Plate,Rectangle,2*4,8,False,0.0,1
32,33,Ice Blue,False,Brick,Rectangle,1*4,4,False,,1


In [8]:
# Convert the yes/no style columns to real booleans.
# Each value is turned into a string, stripped and lower-cased before the lookup,
# so True, "Yes", "no", 1 and 0 all resolve to a key of this map.
bool_map = {'true': True, 'yes': True, '1': True, 'false': False, 'no': False, '0': False}

cols = ["is duplo?", "has slope?"]
for col in cols:
    # strip() guards against padded cells such as "yes ", which would otherwise
    # miss every key and silently become NaN. Values with no key still map to NaN.
    lego_dataset[col] = lego_dataset[col].astype(str).str.strip().str.lower().map(bool_map)
# Slope degree: a value of 0 is replaced by NaN (missing)
lego_dataset["slope degree"] = lego_dataset["slope degree"].replace(0, np.nan)
# NOTE(review): "in stock" also mixes 1 and "yes" and is left untouched here -
# confirm whether it is a flag or a quantity before normalising it.

lego_dataset.head()

Unnamed: 0,id,color,is duplo?,size type,base shape,base dimensions,number of studs,has slope?,slope degree,in stock
0,1,Dark Blue,True,Brick,Rectangle,2*4,8,False,,1
1,2,Green,True,Brick,Rectangle,2*4,8,False,,1
2,3,Coral,True,Brick,Square,2*2,4,False,,1
3,4,Red,True,Brick,Square,2*2,4,False,,1
4,5,White,True,Brick,Square,2*2,4,False,,1


In [9]:
# Lower-case the free-text columns so differently capitalised spellings compare equal.
# NOTE(review): only the case is changed; misspellings such as "blck" are not corrected.
columns = ["color", "size type", "base shape"]
lego_dataset[columns] = lego_dataset[columns].apply(lambda text: text.str.lower())
lego_dataset.tail()

Unnamed: 0,id,color,is duplo?,size type,base shape,base dimensions,number of studs,has slope?,slope degree,in stock
199,200,transparent orange,False,plate,square,1*1,0,False,,1
200,201,blck,False,plate,trapezium,2*2,4,False,,1
201,202,yellow,False,plate,rectangle,1*2,0,False,,1
202,203,transparent sky blue,False,plate,round,1*1,0,False,,1
203,204,transparent yellow,False,plate,round,1*1,0,False,,1


In [10]:
# Write the cleaned frame to an Excel workbook in the working directory,
# leaving out the DataFrame index column.
lego_dataset.to_excel(
    excel_writer="Lego-Inventory-Creation-Clean.xlsx",
    index=False,
)