# Aprendizaje Multietiqueta de Patrones Geométricos en Objetos de Herencia Cultural
# Labels normalization
## Seminario de Tesis II, Primavera 2022
### Master of Data Science. Universidad de Chile.
#### Prof. guía: Benjamín Bustos - Prof. coguía: Iván Sipirán
#### Autor: Matías Vergara

In [None]:
# Imports
import json
import csv
import pandas as pd
import numpy as np
from scipy import sparse
import nltk
from nltk.stem import WordNetLemmatizer
# Download the WordNet corpus (WordNetLemmatizer is instantiated later in this notebook)
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# Resolve the labels folder: Google Drive when running on Colab, a local
# relative path otherwise.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    folder_path = 'drive/MyDrive/TesisMV/labels/'
except Exception:
    # Not on Colab (the import failed) or the mount failed: fall back to the
    # local labels folder. `Exception` is caught instead of using a bare
    # `except` so that KeyboardInterrupt / SystemExit still propagate.
    folder_path = '../labels/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the original label table from JSON (stored with orient='index')
original_json = folder_path + "original_df.json"
labels_df = pd.read_json(original_json, orient='index')

In [None]:
# Column names are the raw labels; the row index holds the patterns
labels = labels_df.columns.tolist()
patterns = labels_df.index

In [None]:
# Map every pattern to the list of labels whose cell value is strictly positive:
# {pattern: [related, labels]}

label_names = labels_df.columns.values
is_positive = labels_df.gt(0.0).values
# Boolean-mask the column names once per row
labels_per_pattern = [label_names[row].tolist() for row in is_positive]
assert len(labels_per_pattern) == len(patterns)
patterns_dict = dict(zip(patterns, labels_per_pattern))

In [None]:
# Clean every column name: trim surrounding whitespace, lowercase, then drop
# double quotes, digits and parentheses.
# TODO: replace this processing with a regular expression

dropped_chars = '"()'
labels = [
    ''.join(
        ch
        for ch in raw_label.strip().lower()
        if not ch.isdigit() and ch not in dropped_chars
    )
    for raw_label in labels_df.columns
]

In [None]:
# Pair each original column name with its cleaned version:
# {original label: clean label}

labels_dict = {}
assert len(labels) == len(labels_df.columns)
labels_dict.update(zip(labels_df.columns, labels))

In [None]:
# Remove stop words and lemmatize every cleaned label, storing the result as
# {original label: [normalized tokens]} in labels_dict.
from gensim.parsing.preprocessing import remove_stopwords

lemmatizer = WordNetLemmatizer()

for original_label, label in zip(labels_df.columns, labels):
    # Splitting the stop-word-filtered label on whitespace gives one token per
    # remaining word. A label that is empty, or that consists only of stop
    # words, produces an empty list here, so no empty-string token (and hence
    # no empty-named column downstream) is recorded for it.
    tokens = remove_stopwords(label).split()
    # The lemma is stored whether or not it differs from the token.
    labels_dict[original_label] = [
        lemmatizer.lemmatize(token) for token in tokens
    ]

In [None]:
# Replace each pattern's original labels with the concatenation of the
# normalized tokens that labels_dict holds for them.
for pattern in patterns_dict:
    patterns_dict[pattern] = [
        token
        for original_label in patterns_dict[pattern]
        for token in labels_dict[original_label]
    ]

In [None]:
# Collect the distinct normalized labels in order of first appearance.
# dict.fromkeys de-duplicates while preserving insertion order, avoiding a
# linear `in` scan of a list for every token. The generator's variables stay
# local, so the module-level `labels` list (the cleaned labels built earlier)
# is no longer overwritten by a loop variable.
new_labels = list(dict.fromkeys(
    token
    for pattern_tokens in patterns_dict.values()
    for token in pattern_tokens
))
print(len(new_labels))

340


In [None]:
# Build an all-zero frame with one row per pattern and one column per new label
zero_rows = {pattern: [0] * len(new_labels) for pattern in patterns_dict}
new_df = pd.DataFrame.from_dict(zero_rows, orient='index', columns=new_labels)
new_df

Unnamed: 0,pendent,teardrop,horizontal,panel,group,vertical,bar,floating,enclosing,shorter,...,light,body,bird,striped,worm,angular,raised,head,bird-seed,long
1a,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1b,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1c,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1d,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1e,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96e,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96g,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96h,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Set to 1 every (pattern, label) cell listed in patterns_dict
for pattern, pattern_labels in patterns_dict.items():
    for token in pattern_labels:
        new_df.at[pattern, token] = 1

new_df

Unnamed: 0,pendent,teardrop,horizontal,panel,group,vertical,bar,floating,enclosing,shorter,...,light,body,bird,striped,worm,angular,raised,head,bird-seed,long
1a,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1b,1,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1c,0,0,1,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1d,0,0,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1e,0,0,1,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96e,0,0,1,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
96f,0,0,0,1,0,1,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
96g,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
96h,0,0,1,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [None]:
# Write the normalized label matrix to JSON (orient='index')
normalized_json = folder_path + "normalized_df.json"
new_df.to_json(normalized_json, orient='index')