# Lego Analysis

Author: M. Tosic
Date: 01.2022
This notebook is part of my capstone project for a data science course.

## Import Libraries

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
#pd.options.display.float_format = '{:,.2f}'.format

# import necessary libraries for batch import csv:
import os
import glob

## Import Data

In [23]:
df_sets = pd.read_csv('data/rebrickable-sets.csv')
df_themes = pd.read_csv('data/rebrickable-themes.csv')

In [24]:
def import_multiple_csv_files_2_df (relative_path):
    """ Function uses os and glob packages to import multiple csv files into one dataframe. 
    The current working directory should be the one where this notebook is located.
    INPUT: 
    Relative path to the files e.g. "./data/Kurac*.csv"
    OUTPUT: 
    One dataframe containting all csv files concatenated together over axis = 0.
    """
    path = os.getcwd()
    files = glob.glob(os.path.join(path, relative_path))
    
    print('Glob search with parameters:', relative_path)
   # print('Ingested files:')
    li = []
    for file in files:
        df_temp = pd.read_csv(file, index_col = None, header = 0)
        li.append(df_temp)
        #print(file)
    try:    
        df = pd.concat(li, axis=0, ignore_index=True)
        print('Done.')

    except:
        print('Something went wrong the concatenation of the files, returning None. Is the relative_path correctly set?')
        return(None)
    
    return (df)

In [25]:
df = import_multiple_csv_files_2_df("./data/Brickset*.csv")

Glob search with parameters: ./data/Brickset*.csv
Done.


In [27]:
#Droping unnessecary columns:
df.drop(['Qty owned', 'RRP (GBP)','UPC','RRP (EUR)','Qty owned new', 
         'Qty owned used', 'EAN','Priority','Wanted', 'Height', 'Depth', 'Weight', 'Width', 
         'Notes','Qty wanted','RRP (CAD)','Flag 1 not used', 'Flag 2 not used', 'Flag 3 not used',
         'Flag 4 not used', 'Flag 5 not used', 'Flag 6 not used','Flag 7 not used', 'Flag 8 not used'], axis=1, inplace=True)

In [28]:
#Adapting columns names to be able to user dot notation and more confortable coding (e.g. price instead of rrp)
df.rename(columns = lambda x : x.replace(' ', '_').replace('(','').replace(')','').lower().strip(), inplace = True)
df.rename(columns={'set':'set_name','rrp_usd': 'price_usd', 'value_new_usd': 'value_new', 'value_used_usd':'value_used'})

Unnamed: 0,number,theme,subtheme,year,set_name,minifigs,pieces,price_usd,value_new,value_used,launch_date,exit_date
0,10278-1,Creator Expert,Modular Buildings Collection,2021,Police Station,5.0,2923.0,199.99,175.03,166.22,01/01/2021,31/12/2024
1,10279-1,Creator Expert,Vehicles,2021,Volkswagen T2 Camper Van,,2207.0,179.99,177.20,,01/08/2021,31/12/2022
2,10280-1,Creator Expert,Botanical Collection,2021,Flower Bouquet,,756.0,49.99,54.81,45.00,01/01/2021,31/12/2024
3,10281-1,Creator Expert,Botanical Collection,2021,Bonsai Tree,,878.0,49.99,51.62,41.73,01/01/2021,31/12/2024
4,10282-1,Creator Expert,Adidas,2021,Adidas Originals Superstar,,731.0,79.99,88.45,,01/07/2021,31/12/2023
...,...,...,...,...,...,...,...,...,...,...,...,...
13889,LMG009-1,Promotional,Miscellaneous,2004,Snail,,8.0,,15.40,,,
13890,P1802-1,Gear,Pens,2004,Clikits Heart Pen,,,,,,,
13891,P3103-1,Gear,Pens,2004,Harry Potter Pen,,,,,,,
13892,P3112-1,Gear,Pens,2004,Santa Pen,,,,,,,


In [33]:
#Parse dates
df['launch_date'] = pd.to_datetime(df['launch_date'])
df['exit_date'] = pd.to_datetime(df['exit_date'])

In [36]:
#checking types per column
df.dtypes

number                    object
theme                     object
subtheme                  object
year                       int64
set_name                  object
minifigs                 float64
pieces                   float64
rrp_usd                  float64
value_new_usd            float64
value_used_usd           float64
launch_date       datetime64[ns]
exit_date         datetime64[ns]
dtype: object