# Lego Inventory Creation Dataset
<div style="
    border: 5px solid purple;
    border-radius: 8px;
    padding: 0px;
    margin: 10px 0;
    background-color: inherit;
    color: inherit;
">
</div>

In [1]:
import pandas as pd
import numpy as np

### Importing data
<div style="
    border: 2px solid orange;
    border-radius: 8px;
    padding: 0px;
    margin: 10px 0;
    background-color: inherit;
    color: inherit;
">
</div>

In [2]:
# Source workbook: a Google Sheets document exported in .xlsx format.
url = "https://docs.google.com/spreadsheets/d/17o2TJJ3_pmrFsFNIdhxyPW3PFO0zyksoSVbrWzrDJoU/export?format=xlsx"

# Open the workbook inside a context manager so that its underlying handle is
# closed as soon as every sheet has been parsed (it was previously left open).
with pd.ExcelFile(url, engine='openpyxl') as dataset:
    # Parse all sheets into a dictionary keyed by sheet name
    sheets = {sheet: dataset.parse(sheet) for sheet in dataset.sheet_names}

# Merge all sheets into one DataFrame; ignore_index=True discards the per-sheet
# row labels and gives the combined frame a fresh 0..n-1 index.
lego_dataset = pd.concat(sheets.values(), ignore_index=True)

# Display the combined frame
lego_dataset

Unnamed: 0,id,color,is duplo?,size type,base shape,base dimensions,number of studs,has slope?,slope degree,in stock,transparent
0,,Dark Blue,True,Brick,Rectangle,2 x 4,8,False,,1,
1,,Green,True,Brick,Rectangle,2 x 4,8,False,,1,
2,,Coral,True,Brick,Square,2 x 2,4,False,,1,
3,,Red,True,Brick,Square,2 x 2,4,False,,1,
4,,White,True,Brick,Square,2 x 2,4,False,,1,
...,...,...,...,...,...,...,...,...,...,...,...
199,,Transparent orange,False,Plate,Square,1*1,0,False,0.0,1,
200,,Blck,False,Plate,Trapezium,2*2,4,False,0.0,1,
201,,Yellow,False,Plate,Rectangle,1*2,0,False,0.0,1,
202,,Transparent sky blue,False,Plate,Round,1*1,0,False,0.0,1,


### Data Cleaning
<div style="
    border: 2px solid orange;
    border-radius: 8px;
    padding: 0px;
    margin: 10px 0;
    background-color: inherit;
    color: inherit;
">
</div>

In [3]:
# Remove the 'transparent' column.
# errors='ignore' keeps this cell re-runnable: the result is bound back to the
# same name, so a second execution would otherwise raise KeyError because the
# column has already been dropped.
lego_dataset = lego_dataset.drop(columns='transparent', errors='ignore')
lego_dataset

Unnamed: 0,id,color,is duplo?,size type,base shape,base dimensions,number of studs,has slope?,slope degree,in stock
0,,Dark Blue,True,Brick,Rectangle,2 x 4,8,False,,1
1,,Green,True,Brick,Rectangle,2 x 4,8,False,,1
2,,Coral,True,Brick,Square,2 x 2,4,False,,1
3,,Red,True,Brick,Square,2 x 2,4,False,,1
4,,White,True,Brick,Square,2 x 2,4,False,,1
...,...,...,...,...,...,...,...,...,...,...
199,,Transparent orange,False,Plate,Square,1*1,0,False,0.0,1
200,,Blck,False,Plate,Trapezium,2*2,4,False,0.0,1
201,,Yellow,False,Plate,Rectangle,1*2,0,False,0.0,1
202,,Transparent sky blue,False,Plate,Round,1*1,0,False,0.0,1


In [4]:
# Fill the 'id' column with a 1-based running number, one value per row
row_count = len(lego_dataset)
lego_dataset['id'] = range(1, row_count + 1)
lego_dataset

Unnamed: 0,id,color,is duplo?,size type,base shape,base dimensions,number of studs,has slope?,slope degree,in stock
0,1,Dark Blue,True,Brick,Rectangle,2 x 4,8,False,,1
1,2,Green,True,Brick,Rectangle,2 x 4,8,False,,1
2,3,Coral,True,Brick,Square,2 x 2,4,False,,1
3,4,Red,True,Brick,Square,2 x 2,4,False,,1
4,5,White,True,Brick,Square,2 x 2,4,False,,1
...,...,...,...,...,...,...,...,...,...,...
199,200,Transparent orange,False,Plate,Square,1*1,0,False,0.0,1
200,201,Blck,False,Plate,Trapezium,2*2,4,False,0.0,1
201,202,Yellow,False,Plate,Rectangle,1*2,0,False,0.0,1
202,203,Transparent sky blue,False,Plate,Round,1*1,0,False,0.0,1


In [5]:
# Unify the separator used in "base dimensions": any one of x, X, + or *,
# together with the whitespace around it, is rewritten as a bare '*'
# (for example "2 x 4" becomes "2*4").
separator_pattern = r'\s*([xX\+*])\s*'
lego_dataset["base dimensions"] = lego_dataset["base dimensions"].str.replace(
    separator_pattern, '*', regex=True
)
lego_dataset.sample(20)

Unnamed: 0,id,color,is duplo?,size type,base shape,base dimensions,number of studs,has slope?,slope degree,in stock
107,108,white,no,plate,circle,4*4,12,no,0.0,1
176,177,Black,False,Plate,Square,2*2,2,False,0.0,1
2,3,Coral,True,Brick,Square,2*2,4,False,,1
60,61,white,no,plate,rectangle,2*8,16,no,0.0,yes
168,169,Light Green,False,Plate,Rectangle,2*8,16,False,0.0,1
158,159,hot magent,0,tile,circle,1*1,0,0,0.0,1
161,162,white,0,brick,square,2*2,4,1,45.0,1
44,45,lightpink,yes,brick,square,2*2,4,no,0.0,yes
123,124,coral,1,plate,rectangle,2*4,8,0,0.0,1
31,32,Denim Blue,False,Plate,Rectangle,0*0,0,False,,1


In [7]:
# Converting values: bring the boolean-like columns, the slope degree and the
# stock column to one consistent representation each.

# Spellings accepted as booleans. Lookups are done on the lower-cased,
# whitespace-stripped string form of every cell.
bool_map = {'true': True, 'yes': True, '1': True, 'false': False, 'no': False, '0': False}

cols = ["is duplo?", "has slope?"]
for col in cols:
    raw = lego_dataset[col]
    mapped = raw.astype(str).str.strip().str.lower().map(bool_map)
    # Series.map yields NaN for any key missing from bool_map. Fail loudly on
    # such values instead of silently turning real data into NaN; cells that
    # were already missing in the input are left as missing.
    unrecognised = raw[mapped.isna() & raw.notna()]
    if not unrecognised.empty:
        raise ValueError(
            f"Unrecognised boolean values in {col!r}: "
            f"{sorted(unrecognised.astype(str).unique())}"
        )
    lego_dataset[col] = mapped

# Slope degree: store a slope of 0 as a missing value
lego_dataset["slope degree"] = lego_dataset["slope degree"].replace(0, np.nan)

# In stock: 'yes' entries become 1.
# The numeric conversion is explicit (pd.to_numeric) rather than relying on
# Series.replace silently downcasting the object column to integers; that
# downcasting is deprecated in pandas 2.2 and the column would stay object
# dtype once it is removed. pd.to_numeric raises ValueError if a value that
# cannot be parsed as a number is still present.
in_stock = lego_dataset["in stock"].map(lambda value: 1 if value == 'yes' else value)
lego_dataset["in stock"] = pd.to_numeric(in_stock)

lego_dataset.sample(20)

Unnamed: 0,id,color,is duplo?,size type,base shape,base dimensions,number of studs,has slope?,slope degree,in stock
60,61,white,False,plate,rectangle,2*8,16,False,,1
9,10,Yellow,False,Brick,Rectangle,2*4,8,False,,1
108,109,black,False,plate,square,2*2,2,False,,1
15,16,Lilac,False,Brick,Square,1*2,2,True,45.0,1
62,63,grey,False,plate,triangle,1*4,4,True,15.0,1
111,112,navy blue,False,plate,trapezoid,4*2,4,False,,1
187,188,Light Green,False,Brick,Square,1*2,2,True,45.0,1
197,198,Transparent yellow,False,Plate,Rectangle,1*2,2,False,,1
52,53,darkpink,False,brick,rectangle,1*4,4,False,,1
72,73,darkblue,False,plate,square,1*1,1,False,,1


In [8]:
# Lower-case the free-text columns so that values differing only in
# capitalisation end up as the same category.
columns = ["color", "size type", "base shape"]
lego_dataset[columns] = lego_dataset[columns].apply(lambda text_col: text_col.str.lower())
lego_dataset.tail()

Unnamed: 0,id,color,is duplo?,size type,base shape,base dimensions,number of studs,has slope?,slope degree,in stock
199,200,transparent orange,False,plate,square,1*1,0,False,,1
200,201,blck,False,plate,trapezium,2*2,4,False,,1
201,202,yellow,False,plate,rectangle,1*2,0,False,,1
202,203,transparent sky blue,False,plate,round,1*1,0,False,,1
203,204,transparent yellow,False,plate,round,1*1,0,False,,1


In [9]:
# Save the cleaned inventory as an Excel workbook; index=False leaves the
# DataFrame's row index out of the file.
clean_workbook_path = "Lego-Inventory-Creation-Clean.xlsx"
lego_dataset.to_excel(clean_workbook_path, index=False)

### Dataset Overview
<div style="
    border: 2px solid orange;
    border-radius: 8px;
    padding: 0px;
    margin: 10px 0;
    background-color: inherit;
    color: inherit;
">
</div>

In [10]:
# Report the size of the dataset; DataFrame.shape is a (rows, columns) tuple.
# (A bare `lego_dataset.shape` line used to precede the print; it was not the
# last expression of the cell, so its value was never displayed.)
print(f"Dataset consist of {lego_dataset.shape[0]} rows and {lego_dataset.shape[1]} columns.")

Dataset consist of 204 rows and 10 columns.


### Dataset Structure
<div style="
    border: 2px solid orange;
    border-radius: 8px;
    padding: 0px;
    margin: 10px 0;
    background-color: inherit;
    color: inherit;
">
</div>

In [11]:
# Print a structural summary: each column's name, non-null count and dtype,
# followed by the dtype totals and the frame's memory usage.
lego_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               204 non-null    int64  
 1   color            204 non-null    object 
 2   is duplo?        204 non-null    bool   
 3   size type        204 non-null    object 
 4   base shape       204 non-null    object 
 5   base dimensions  204 non-null    object 
 6   number of studs  204 non-null    int64  
 7   has slope?       204 non-null    bool   
 8   slope degree     24 non-null     float64
 9   in stock         204 non-null    int64  
dtypes: bool(2), float64(1), int64(3), object(4)
memory usage: 13.3+ KB


### Descriptive Statistics
<div style="
    border: 2px solid orange;
    border-radius: 8px;
    padding: 0px;
    margin: 10px 0;
    background-color: inherit;
    color: inherit;
">
</div>

In [12]:
# Numeric Columns
# describe() reports count, mean, std, min, quartiles and max for every
# numeric column. 'id' is included as well, although it is only the running
# row number assigned during cleaning.
lego_dataset.describe()

Unnamed: 0,id,number of studs,slope degree,in stock
count,204.0,204.0,24.0,204.0
mean,102.5,4.906863,43.125,1.0
std,59.033889,4.996171,6.726408,0.0
min,1.0,0.0,15.0,1.0
25%,51.75,2.0,45.0,1.0
50%,102.5,4.0,45.0,1.0
75%,153.25,6.0,45.0,1.0
max,204.0,24.0,45.0,1.0


In [13]:
# Statistics on categorical/object columns: start by finding which columns
# hold object (text) data and report how many there are.
object_frame = lego_dataset.select_dtypes(include='object')
obj_cols = object_frame.columns
print(f"This dataset has {len(obj_cols)} object columns:", list(obj_cols))

This dataset has 4 object columns: ['color', 'size type', 'base shape', 'base dimensions']


In [14]:
# Draw 20 random rows for a spot check of the cleaned data. No random_state
# is passed, so a different set of rows is drawn on every run.
lego_dataset.sample(20)

Unnamed: 0,id,color,is duplo?,size type,base shape,base dimensions,number of studs,has slope?,slope degree,in stock
73,74,darkpink,False,plate,square,1*1,1,False,,1
74,75,neon orange,False,plate,rectangle,1*2,2,False,,1
71,72,yellow,False,plate,square,2*2,4,False,,1
14,15,purple,False,plate,rectangle,4*6,24,False,,1
64,65,white,False,plate,rectangle,2*3,6,False,,1
33,34,capri,False,brick,rectangle,1*2,2,False,,1
42,43,lightbrown,True,plate,rectangle,2*8,16,False,,1
81,82,black,False,brick,square,1*1,1,True,45.0,1
147,148,blue,False,plate,trapezoid,2*2,4,False,,1
102,103,yellow,False,plate,rectangle,3*2,6,False,,1


In [16]:
# Detailed statistics on object data.
# For every object-dtype column print the non-null count, the number of
# distinct values, and the most and least frequent value with its frequency.
for col in lego_dataset.select_dtypes(include='object'):
    series = lego_dataset[col]
    # Compute the frequency table once and reuse it for all four frequency
    # lines below instead of recomputing it for each print.
    counts = series.value_counts()

    print(f"\nColumn: {col}")
    print("Count of non-null values:", series.count())
    print("Number of unique values:", series.nunique())

    if counts.empty:
        # A column without any non-null value has an empty frequency table,
        # on which idxmax()/idxmin() would raise ValueError.
        print("No non-null values; frequency statistics skipped.")
        continue

    print("Most frequent value:", counts.idxmax())
    print("Most frequent value (frequency):", counts.max())
    print("Least frequent value:", counts.idxmin())
    print("Least frequent value (frequency):", counts.min())


Column: color
Count of non-null values: 204
Number of unique values: 68
Most frequent value: yellow
Most frequent value (frequency): 16
Least frequent value: lime green
Least frequent value (frequency): 1

Column: size type
Count of non-null values: 204
Number of unique values: 3
Most frequent value: plate
Most frequent value (frequency): 104
Least frequent value: tile
Least frequent value (frequency): 4

Column: base shape
Count of non-null values: 204
Number of unique values: 8
Most frequent value: rectangle
Most frequent value (frequency): 109
Least frequent value: wadge
Least frequent value (frequency): 1

Column: base dimensions
Count of non-null values: 204
Number of unique values: 22
Most frequent value: 2*2
Most frequent value (frequency): 47
Least frequent value: 6*2
Least frequent value (frequency): 1
