# Aprendizaje Multietiqueta de Patrones Geométricos en Objetos de Herencia Cultural
# Labels normalization
## Seminario de Tesis II, Primavera 2022
### Master of Data Science. Universidad de Chile.
#### Prof. guía: Benjamín Bustos - Prof. coguía: Iván Sipirán
#### Autor: Matías Vergara
El objetivo de este notebook es, dado un archivo con las etiquetas originales, generar un nuevo archivo correspondiente a la aplicación de lemmatization y stopword removal.

## Imports

In [10]:
# Base directory (relative to the working directory) under which the
# "labels" folder is looked up when reading and writing the label files.
root_dir = '../'    

In [11]:
import json
import csv
import pandas as pd
import numpy as np
from scipy import sparse
import nltk
from nltk.stem import WordNetLemmatizer
import os
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\m_jvs\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\m_jvs\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Configuración de datos

In [12]:
# Load the original label matrix: the JSON file is read with orient='index',
# so its top-level keys become the row index (patterns) and the nested keys
# become the columns (labels).
original_labels_path = os.path.join(root_dir, "labels", "original_df.json")
labels_df = pd.read_json(original_labels_path, orient='index')

patterns = labels_df.index
labels = list(labels_df.columns)

## Normalización

In [13]:
# Map every pattern to the list of original labels whose value is > 0.

cols = labels_df.columns.values
mask = labels_df.gt(0.0).values  # one boolean row per pattern
# Boolean-index the column names with each row of the mask.
out = [cols[row_mask].tolist() for row_mask in mask]
assert len(out) == len(patterns)
patterns_dict = dict(zip(patterns, out))

In [14]:
# Produce a cleaned version of every original label, in column order.

def _clean_label(raw_label):
    """Return `raw_label` lower-cased with separators and noise characters removed.

    Steps, in order: '/' becomes a space (labels such as
    "latticed/dotted/stripped"), surrounding whitespace is stripped, the text
    is lower-cased, and finally double quotes, digits and parentheses are
    deleted. Whitespace exposed by that final deletion is not stripped again.
    """
    spaced = raw_label.replace('/', ' ').strip().lower()
    return ''.join(
        ch for ch in spaced
        if not ch.isdigit() and ch not in '"()'
    )


labels = [_clean_label(column) for column in labels_df.columns]

In [15]:
# Map each original label (column name) to its cleaned counterpart.
# Both sequences are positionally aligned, hence the length check.

labels_dict = {}
assert len(labels) == len(labels_df.columns)
labels_dict.update(zip(labels_df.columns, labels))

In [16]:
# Remove stop words, split each clean label into tokens and lemmatize them.
from gensim.parsing.preprocessing import remove_stopwords

lemmatizer = WordNetLemmatizer()


def _label_tokens(label):
    """Return the list of tokens of `label` once stop words are removed.

    When stop-word removal changes the label, the filtered text is split on
    whitespace; if nothing is left, the result is a single empty-string token.
    When nothing is removed, the label is split only if it contains a space,
    otherwise it is returned as a one-element list (also for an empty label).
    """
    filtered = remove_stopwords(label)
    if filtered != label:
        return filtered.split() if len(filtered) > 0 else [filtered]
    return label.split() if " " in label else [label]


# Overwrite the {original label: clean label} entries with
# {original label: [lemmatized tokens]}.
# NOTE: empty / all-stop-word labels yield [''], i.e. an empty-string label;
# it is listed among the tokens dropped by hand further down the notebook.
for original_label, label in zip(labels_df.columns, labels):
    labels_dict[original_label] = [
        lemmatizer.lemmatize(token) for token in _label_tokens(label)
    ]

In [17]:
# Replace each pattern's original labels by their normalized tokens,
# flattened in the original order (duplicates are kept).
# NOTE: patterns_dict is overwritten in place, so running this cell a second
# time would look up already-normalized tokens in labels_dict, which is keyed
# by the original labels; rebuild patterns_dict before re-running.
for pattern, old_labels in patterns_dict.items():
    patterns_dict[pattern] = [
        token
        for old_label in old_labels
        for token in labels_dict[old_label]
    ]

In [18]:
# Collect the distinct normalized labels in first-seen order.
#
# dict.fromkeys de-duplicates while preserving insertion order, replacing the
# quadratic `label not in new_labels` list scan with hash lookups.
# The iteration variables live inside the generator expression, so the
# notebook-level `labels` list (the clean labels used by earlier cells) is no
# longer overwritten by this cell; previously re-running those cells after
# this one operated on the wrong `labels` value.
new_labels = list(dict.fromkeys(
    label
    for pattern_labels in patterns_dict.values()
    for label in pattern_labels
))
print("Cantidad de etiquetas resultantes: {}".format(len(new_labels)))

Cantidad de etiquetas resultantes: 339


In [19]:
# Create an all-zero df: one row per pattern, one column per new label.
# A scalar passed as data is broadcast to every (index, column) cell.
new_df = pd.DataFrame(0, index=list(patterns_dict.keys()), columns=new_labels)
display(new_df)

Unnamed: 0,pendent,teardrop,horizontal,panel,group,vertical,bar,floating,enclosing,shorter,...,light,body,bird,striped,worm,angular,raised,head,bird-seed,long
1a,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1b,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1c,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1d,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1e,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96e,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96g,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96h,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Limpieza manual de casos puntuales

In [21]:
# Fill new df with 1's where appropriate: mark every (pattern, label) pair
# present in patterns_dict.
for pattern, pattern_labels in patterns_dict.items():
    for label in pattern_labels:
        new_df.at[pattern, label] = 1

# Residual tokens detected by inspection that are not meaningful labels and
# are removed by hand:
#   ''            empty label
#   'st.', 'fig.' abbreviations
#   'b', 'm', 's' single letters
#   'b:'          letter followed by a colon
#   "v's", "s's"  possessive fragments
bad_remaining_labels = ['', 'st.', 'fig.', 'b', 'b:', "v's", 'm', "s's", 's']

# errors='ignore' keeps this cell from raising KeyError when one of the
# hard-coded tokens does not occur as a column (e.g. a different or updated
# input file); tokens that are present are dropped exactly as before.
new_df = new_df.drop(columns=bad_remaining_labels, errors='ignore')

## Guardado 

In [59]:
# Persist the normalized label matrix next to the original one, using the
# same orient='index' layout that was used to read the input.
normalized_labels_path = os.path.join(root_dir, "labels", "normalized_df.json")
new_df.to_json(normalized_labels_path, orient='index')