# Aprendizaje Multietiqueta de Patrones Geométricos en Objetos de Herencia Cultural
# Labels normalization
## Seminario de Tesis II, Primavera 2022
### Master of Data Science. Universidad de Chile.
#### Prof. guía: Benjamín Bustos - Prof. coguía: Iván Sipirán
#### Autor: Matías Vergara

In [13]:
# Imports
import json
import csv
import pandas as pd
import numpy as np
from scipy import sparse
import nltk
from nltk.stem import WordNetLemmatizer
import os
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\m_jvs\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\m_jvs\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [47]:
# Mounting Google Drive
# On Google Colab the Drive is mounted at /content/drive and root_dir points
# into it; when the Colab package is not importable the notebook falls back
# to the parent directory.
try:
    from google.colab import drive
except ImportError:
    # Not running on Colab: work relative to the local checkout.
    root_dir = '../'
else:
    # Only the import is guarded: a failing mount on Colab must surface
    # instead of silently switching to a path that does not exist there.
    drive.mount('/content/drive')
    root_dir = 'drive/MyDrive/TesisMV/'

In [48]:
# Load the original label table from <root_dir>/labels/original_df.json.
# orient='index' tells pandas that the top-level JSON keys are the row index.
labels_path = os.path.join(root_dir, "labels", "original_df.json")
labels_df = pd.read_json(labels_path, orient='index')

In [49]:
# The row index is kept as `patterns`; the column names are copied into the
# plain list `labels`.
patterns, labels = labels_df.index, [*labels_df.columns]

In [50]:
# Build a dict of {pattern: [related, labels]}
# A label is related to a pattern when its cell is strictly greater than 0.

patterns_dict = {}
column_names = labels_df.columns.values
is_positive = labels_df.gt(0.0).values  # boolean matrix, one row per pattern
labels_per_row = [column_names[row_mask].tolist() for row_mask in is_positive]
assert len(labels_per_row) == len(patterns)
patterns_dict.update(
    (patterns[position], row_labels)
    for position, row_labels in enumerate(labels_per_row)
)

In [51]:
# Get a clean version of each label.
# The resulting list `labels` stays index-aligned with labels_df.columns.

labels = []
for raw_label in labels_df.columns:
    cleaned = raw_label.replace('/', ' ')  # case of labels like "latticed/dotted/stripped"
    cleaned = cleaned.lower()
    cleaned = cleaned.replace('"', "")  # remove quotes
    cleaned = ''.join(ch for ch in cleaned if not ch.isdigit())  # remove numbers
    cleaned = cleaned.replace('(', "").replace(')', "")  # remove parentheses
    # Normalize whitespace last. Stripping before the removals above left
    # stray blanks behind (e.g. "zigzag (2)" became "zigzag "); split/join
    # trims both ends and collapses inner runs of whitespace.
    cleaned = ' '.join(cleaned.split())
    labels.append(cleaned)

In [52]:
# Build a dict of {original label: clean label}
# `labels` is expected to hold one cleaned string per column, in column order.

labels_dict = {}
assert len(labels) == len(labels_df.columns)
labels_dict.update(zip(labels_df.columns, labels))

In [53]:
# Remove stop words, split every label into single-word tokens and lemmatize
# them. Afterwards labels_dict maps {original label: [lemma, ...]}.
from gensim.parsing.preprocessing import remove_stopwords

lemmatizer = WordNetLemmatizer()

for position, clean_label in enumerate(labels):
    without_stopwords = remove_stopwords(clean_label)
    if without_stopwords != clean_label:
        # Something was removed. If nothing is left, the empty string is
        # kept as the only token.
        tokens = without_stopwords.split() if without_stopwords else [without_stopwords]
    elif " " in clean_label:
        # Untouched multi-word label: one token per word.
        tokens = clean_label.split()
    else:
        # Untouched single word. NOTE: an empty label ends up as the token ''.
        tokens = [clean_label]
    # The lemmatizer returns the word itself when it has no other lemma, so
    # every token can simply be replaced by its lemma.
    labels_dict[labels_df.columns[position]] = [
        lemmatizer.lemmatize(token) for token in tokens
    ]

In [54]:
# Replace, in place, every original label of a pattern by the tokens that
# labels_dict holds for it. The per-label token lists are concatenated in
# order and duplicates are kept.
# NOTE(review): the lookup uses the original labels as keys, so this cell
# presumably cannot be re-run on an already converted patterns_dict; confirm.
for pattern in patterns_dict:
    patterns_dict[pattern] = [
        token
        for old_label in patterns_dict[pattern]
        for token in labels_dict[old_label]
    ]

In [55]:
# Collect the distinct normalized labels in order of first appearance.
# dict.fromkeys de-duplicates while keeping insertion order, which replaces
# the quadratic `not in list` scan. The generator also keeps its loop names
# local, so the global `labels` list built by the cleaning cell is no longer
# overwritten by this cell.
new_labels = list(dict.fromkeys(
    token
    for pattern_labels in patterns_dict.values()
    for token in pattern_labels
))
print(len(new_labels))

339


In [56]:
# Create an all-zero df with the appropriate index (patterns) and column
# names (new labels)
new_df = pd.DataFrame(0, index=list(patterns_dict), columns=new_labels)
new_df

Unnamed: 0,pendent,teardrop,horizontal,panel,group,vertical,bar,floating,enclosing,shorter,...,light,body,bird,striped,worm,angular,raised,head,bird-seed,long
1a,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1b,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1c,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1d,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1e,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96e,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96g,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96h,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
# Fill new df with 1's where appropriate
for pattern, pattern_labels in patterns_dict.items():
    for token in pattern_labels:
        new_df.at[pattern, token] = 1


def _show_pattern_row(frame, pattern):
    """Display the single row `pattern` of `frame` without truncation."""
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None,
                           'display.precision', 3,
                           ):
        display(frame.loc[[pattern]])


print("Version sin ajustes manuales:")
_show_pattern_row(new_df, '34d')

# Problems detected by inspecting the generated columns:
#   - empty label
#   - st.
#   - latticed/dotted/stacked ?
#   - fig.
#   - b (single letters)
#   - "meandre" is badly encoded
#   - "hachurée" is badly encoded
#   - b:
#   - net. -> net?
#   - v's
#   - m
#   - s's
#   - s
# Only the columns listed below are dropped; the badly encoded labels and
# 'net.' are not in the list and therefore stay in the table.
bad_remaining_labels = ['', 'st.', 'fig.', 'b', 'b:', "v's", 'm', "s's", 's']

new_df = new_df.drop(columns=bad_remaining_labels)

print("Version final:")
_show_pattern_row(new_df, '34d')

Unnamed: 0,pendent,teardrop,horizontal,panel,group,vertical,bar,floating,enclosing,shorter,Unnamed: 11,line,alternately,framed,pannel,filling,ornament,alternating,simple,st.,andrew's,cross,chevron,inserted,crossing,composition,comb-like,pattern,free,design,metopal,diagonally,right,left,inclined,stack,oblique,half-lines,opposed,diagonal,hatched,triangle,interlocked,apex,large,small,stacked,field,latticing,separated,solid,cross-hatching,obliquely,disposed,wavy,band,double,triple,multiple,steep,wave,half,moon,single,hook,lightning,pair,parallel,rectangle,innermost,concentric,set,coffer,square,george's,dotted,quarter,cross-bar,cross-hatched,checkerboard,centre,cruciform,latticed,double-decker,metope,wolftooth,eight-pointed,star,hatching,simplified,hour-glass,neck,spandrel,differently,checkerboarded,saltire,shoulder,dot,doubled,horizontally,t-shaped,quatrefoil,central,circle,outline,stalk,reserved,background,quartered,lozenge,strong,surrounded,half-moon,double-halfmoon,hexafoil,octofoil,octofail,battlement,zigzag-filled,checked,interspaces,link,high,slim,rectilinear,attached,arrangement,frgt.,gear-pattern,formed,turning,meander,angle,chain,separate,fig.,reversed,inverted,b,enclosed,intertwined,quadruple,step,additional,downward,turn,detached,wind,swastika,connected,t-meander,beam,remaining,space,meander-like,z-shaped,continuous,repetition,standing,alternatively,meander-type,trident,sape,two-sided,'comb',type,mã©andre,hachurã©,panneau,tree,outlined,acute-angled,hooked,arm,two-tiered,net,careless,row,pointing,outwards,b:,spaced,slack,leaf,outside,three-tiered,metopes,four-tiered,triple-outline,quatered,side,cross-hathed,tangential,blob,linked,check,tapestry,net.,massed,like,underlying,schematized,lozenge-cross,upwards,arrow,overlapping,shape,intersecting,version,previous,upright,form,zigzag,border,form-square,schoulder,filled,intertwinded,linear,rhodian,root,stock,dogtooth,downwards,accompanied,antithetic,ray,opposite,direction,hour-glasses,ax,axe,superimposed,windmill,dis
similar,white,dark,massive,spacious,obtuse,metope-triglyph-frieze,e,cut,fringe,ear,corn,v-chevrons,v's,fishbone,facing,three-limbed,sigma,turned,four-limbed,m-chevrons,limbed,scribble,degeneration,column,variously,unframed,tadpole,dash,rectangular,rosette,flower,spiked,eight-armed,floor,stalked,sixteen-pointed,point,lightning-wheel,semicircle,three-quarter,disposition,confronted,quarter-circle,sound-waves,scale,losenges,eleven-pointed,forming,lined,branch,dif.,m,triple-line,maltese,three-winged,fan,wing,midrib,connecting,arc,near,rim,v-shaped,four-leaved,twelve-leaved,seven-leaved,four-spoked,wheel,circular,sunburst,flanking,tangent,tagential,tangets,elongated,crossed,concentrique,cable,doted,s's,spiral,inner,looped,s,running,half-circles,pothook,tongue,plant,figure,volute,palm-tree,fish,serpent,light,body,bird,striped,worm,angular,raised,head,bird-seed,long
34d,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,pendent,teardrop,horizontal,panel,group,vertical,bar,floating,enclosing,shorter,line,alternately,framed,pannel,filling,ornament,alternating,simple,andrew's,cross,chevron,inserted,crossing,composition,comb-like,pattern,free,design,metopal,diagonally,right,left,inclined,stack,oblique,half-lines,opposed,diagonal,hatched,triangle,interlocked,apex,large,small,stacked,field,latticing,separated,solid,cross-hatching,obliquely,disposed,wavy,band,double,triple,multiple,steep,wave,half,moon,single,hook,lightning,pair,parallel,rectangle,innermost,concentric,set,coffer,square,george's,dotted,quarter,cross-bar,cross-hatched,checkerboard,centre,cruciform,latticed,double-decker,metope,wolftooth,eight-pointed,star,hatching,simplified,hour-glass,neck,spandrel,differently,checkerboarded,saltire,shoulder,dot,doubled,horizontally,t-shaped,quatrefoil,central,circle,outline,stalk,reserved,background,quartered,lozenge,strong,surrounded,half-moon,double-halfmoon,hexafoil,octofoil,octofail,battlement,zigzag-filled,checked,interspaces,link,high,slim,rectilinear,attached,arrangement,frgt.,gear-pattern,formed,turning,meander,angle,chain,separate,reversed,inverted,enclosed,intertwined,quadruple,step,additional,downward,turn,detached,wind,swastika,connected,t-meander,beam,remaining,space,meander-like,z-shaped,continuous,repetition,standing,alternatively,meander-type,trident,sape,two-sided,'comb',type,mã©andre,hachurã©,panneau,tree,outlined,acute-angled,hooked,arm,two-tiered,net,careless,row,pointing,outwards,spaced,slack,leaf,outside,three-tiered,metopes,four-tiered,triple-outline,quatered,side,cross-hathed,tangential,blob,linked,check,tapestry,net.,massed,like,underlying,schematized,lozenge-cross,upwards,arrow,overlapping,shape,intersecting,version,previous,upright,form,zigzag,border,form-square,schoulder,filled,intertwinded,linear,rhodian,root,stock,dogtooth,downwards,accompanied,antithetic,ray,opposite,direction,hour-glasses,ax,axe,superimposed,windmill,dissimilar,white,dark,massive
,spacious,obtuse,metope-triglyph-frieze,e,cut,fringe,ear,corn,v-chevrons,fishbone,facing,three-limbed,sigma,turned,four-limbed,m-chevrons,limbed,scribble,degeneration,column,variously,unframed,tadpole,dash,rectangular,rosette,flower,spiked,eight-armed,floor,stalked,sixteen-pointed,point,lightning-wheel,semicircle,three-quarter,disposition,confronted,quarter-circle,sound-waves,scale,losenges,eleven-pointed,forming,lined,branch,dif.,triple-line,maltese,three-winged,fan,wing,midrib,connecting,arc,near,rim,v-shaped,four-leaved,twelve-leaved,seven-leaved,four-spoked,wheel,circular,sunburst,flanking,tangent,tagential,tangets,elongated,crossed,concentrique,cable,doted,spiral,inner,looped,running,half-circles,pothook,tongue,plant,figure,volute,palm-tree,fish,serpent,light,body,bird,striped,worm,angular,raised,head,bird-seed,long
34d,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [59]:
# Save the normalized labels to <root_dir>/labels/normalized_df.json,
# keyed by row index (orient='index').
normalized_path = os.path.join(root_dir, "labels", "normalized_df.json")
new_df.to_json(normalized_path, orient='index')