<a href="https://colab.research.google.com/github/mokshiz/lego-plotly-demo/blob/main/plotly_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [57]:
#dataset updated on Oct. 20, 2022, 6:04 p.m. from https://rebrickable.com/downloads/
#references: Plotly documentation - https://plotly.com/python/
#code from Mokshita Gupta

import csv
import plotly.express as px
import pandas as pd
import random

#analyze how the production changed from 1950-2022
#is there any pattern in the number of parts per set over the years?

def set_and_parts(file):
  """Plot the average number of parts per set for each year in a sets CSV.

  The first row of ``file`` is skipped as a header. For every remaining row,
  column index 2 is parsed as the year and column index 4 as that set's part
  count. A scatter plot of the per-year averages is shown and the Pearson
  correlation matrix of (year, average) is printed.

  Args:
    file: path of the CSV file to read. It is opened as UTF-8
      (assumed encoding of the export - TODO confirm).

  Returns:
    None. The results are the displayed figure and the printed matrix.

  Raises:
    ValueError: if a year or part-count cell is not an integer.
    IndexError: if a non-empty row has fewer than five columns.
  """
  # year -> [number of sets, total parts]
  # Kept local on purpose: a module-level accumulator would carry counts over
  # from one call to the next (cell re-runs, or a call with another file).
  totals = {}

  # newline='' is what the csv module asks for so quoted line breaks survive
  with open(file, 'r', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader, None) #skip header
    for row in reader:
      if not row:
        #blank line in the file, nothing to count
        continue
      year = int(row[2])
      per_year = totals.setdefault(year, [0, 0])
      per_year[0] += 1            #one more set released this year
      per_year[1] += int(row[4])  #plus the pieces that set has

  #average number of parts per set per year, in first-seen year order
  avgs = {year: parts / sets for year, (sets, parts) in totals.items()}

  #convert to dataframe for easy plotting!
  data_p = pd.DataFrame({'Year':list(avgs.keys()),
                         'Avg. parts per set':list(avgs.values())
                         })

  #plot
  fig = px.scatter(data_frame=data_p, x='Year', y='Avg. parts per set',
                   title="Average parts per set from 1950-2022")
  fig.show()

  #print the pearson correlation matrix
  print(data_p.corr(method='pearson'))

# Run the parts-per-set analysis for this cell; "sets.csv" is a relative path,
# so the file has to be in the current working directory.
set_and_parts("sets.csv")

                        Year  Avg. parts per set
Year                1.000000            0.809165
Avg. parts per set  0.809165            1.000000


In [58]:
#how many sets of a particular theme?

#Comparator function
def take_second(elem):
    """Return the item at index 1 of ``elem``; used as a ``key=`` function for sorting."""
    second = elem[1]
    return second


def set_themes(file1,file2, how_many, bar_count=50):
  """Chart the themes with the most sets as a bubble chart and a bar chart.

  Args:
    file1: path of the themes CSV. The first row is skipped as a header;
      column index 0 is read as the theme id and column index 1 as the theme
      name. When an id appears more than once, the first name is kept.
    file2: path of the sets CSV. The first row is skipped as a header;
      column index 3 is read as the theme id of each set.
    how_many: number of top-ranked theme ids considered for the bubble
      chart. Ids that share a theme name are collapsed into one bubble, so
      the chart can show fewer than ``how_many`` bubbles.
    bar_count: number of top-ranked theme ids shown in the bar chart.
      Defaults to 50.

  Returns:
    None. Both figures are displayed with ``show()``.

  Both files are opened as UTF-8 (assumed encoding of the exports - TODO
  confirm).
  """
  #theme id -> theme name
  names = {}
  with open(file1, 'r', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader, None) #skip header
    for row in reader:
      if row:
        #first name seen for an id wins
        names.setdefault(row[0], row[1])

  #theme id -> number of sets
  counts = {}
  with open(file2, 'r', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader, None) #skip header
    for row in reader:
      if not row:
        continue
      theme_id = row[3]
      # Skip sets whose theme id has no known name. This is a membership test
      # rather than a numeric upper bound because the ids are strings: max()
      # over them is lexicographic ("99" > "150"), which would wrongly discard
      # every theme with a larger numeric id.
      if theme_id not in names:
        continue
      counts[theme_id] = counts.get(theme_id, 0) + 1

  #(theme id, number of sets) pairs, most sets first; ties keep file order
  ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

  #bubble chart data: theme name -> number of sets
  # Duplicated names are resolved before shuffling, while the list is still
  # ranked, so the kept entry is always the one with the most sets instead of
  # whichever happens to come first after the shuffle.
  bubbles = {}
  for theme_id, n_sets in ranked[:how_many]:
    bubbles.setdefault(names[theme_id], n_sets)
  bubble_rows = list(bubbles.items())

  #shuffling for bubble chart
  random.shuffle(bubble_rows)

  #bar chart data, kept in ranked order
  bar_rows = ranked[:bar_count]
  bar_names = [names[theme_id] for theme_id, _ in bar_rows]
  bar_counts = [n_sets for _, n_sets in bar_rows]

  #plotting code
  data_frame = pd.DataFrame({'Theme Name':[name for name, _ in bubble_rows],
                             'Number of sets':[n_sets for _, n_sets in bubble_rows]})
  fig = px.scatter(data_frame, x="Theme Name", y="Number of sets", size="Number of sets", color="Theme Name",
           hover_name="Theme Name", size_max=100,
           title=f"{how_many} Most popular set themes")
  fig.show()

  data_frame2 = pd.DataFrame({'Theme Name':bar_names,'Number of sets':bar_counts})
  fig2 = px.bar(data_frame2, x="Theme Name", y="Number of sets",
                title=f"{bar_count} Most popular set themes")

  fig2.show()

# Draw the theme charts; 20 is how_many, the number of top-ranked themes used
# for the bubble chart. Both CSV paths are relative to the working directory.
set_themes("themes.csv","sets.csv",20)

In [59]:



# make a heatmap of how many times a popular part appears in a set
# exercise: how many times do popular parts appear in popular themes - do popular themes use more rare parts than random sets?

def gen_heatmap(file, how_many):
  """Show a heatmap of how often the most-used parts occur in random sets.

  The first row of ``file`` is skipped as a header. For every remaining row,
  column index 0 is used as the set id, column index 1 as the part id and
  column index 3 (parsed as an integer) as the quantity.
  NOTE(review): the code and axis label treat column 0 as a set id - confirm
  against the schema of the file that is passed in.

  Args:
    file: path of the CSV file to read. It is opened as UTF-8
      (assumed encoding of the export - TODO confirm).
    how_many: number of top parts (heatmap columns) and the number of
      randomly sampled sets (heatmap rows). If the file holds fewer distinct
      set ids than this, all of them are used.

  Returns:
    None. The figure is displayed with ``show()``.

  Raises:
    ValueError: if the file has no data rows, or a quantity is not an integer.
  """
  part_totals = {}  #part id -> total quantity over the whole file
  per_set = {}      #set id -> {part id: quantity in that set}

  with open(file, 'r', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader, None) #skip header
    for row in reader:
      if not row:
        continue
      set_id, part_id, quantity = row[0], row[1], int(row[3])
      part_totals[part_id] = part_totals.get(part_id, 0) + quantity
      #the same part can be listed several times for one set, so add up
      in_set = per_set.setdefault(set_id, {})
      in_set[part_id] = in_set.get(part_id, 0) + quantity

  if not per_set:
    raise ValueError(f"{file!r} contains no data rows")

  #top x most occurring parts, most used first
  ranked = sorted(part_totals.items(), key=lambda pair: pair[1], reverse=True)
  top_parts = [part_id for part_id, _ in ranked[:how_many]]

  # generate random sample of lego set ids
  # random.sample raises ValueError when asked for more items than the
  # population holds, so the sample size is capped at the number of sets
  sample_size = min(how_many, len(per_set))
  list_of_random_sets = random.sample(list(per_set), sample_size)

  #2d list: one row per sampled set, one column per top part (0 = not used)
  data_to_graph = [
      [per_set[set_id].get(part_id, 0) for part_id in top_parts]
      for set_id in list_of_random_sets
  ]

  #plot a heatmap
  fig = px.imshow(data_to_graph, x=top_parts, y=list_of_random_sets,
                  labels=dict(x="Part ID", y="Set ID", color="Frequency of the part in the set"))
  fig.show()


# Draw the heatmap with how_many=10; "inventory_parts.csv" is a relative path,
# so the file has to be in the current working directory.
gen_heatmap("inventory_parts.csv",10)
