# Aprendizaje Multietiqueta de Patrones Geométricos en Objetos de Herencia Cultural
# Export DATA
## Seminario de Tesis II, Primavera 2022 
### Master of Data Science, Universidad de Chile.
#### Supervisor: Prof. Benjamín Bustos, Prof. Iván Sipirán
#### Author: Matías Vergara

### References:
- [SLEEC Homepage](http://manikvarma.org/code/SLEEC/download.html)
- [SLEEC Paper: Sparse Local Embeddings for Extreme Multi-label Classification](https://papers.nips.cc/paper/2015/hash/35051070e572e47d2c26c241ab88307f-Abstract.html)
- [The Emerging Trends of Multi-Label Learning](https://arxiv.org/abs/2011.11197)
- [GitHub: C2AE Multilabel Classification](https://github.com/dhruvramani/C2AE-Multilabel-Classification)
- "Learning Deep Latent Spaces for Multi-Label Classification", published in AAAI 2017

Este notebook tiene por finalidad exportar un conjunto de datos (a seleccionar con las celdas de selección habituales) al formato requerido por implementaciones oficiales de los modelos presentados en el paper de Emerging Trends of Multi-Label Learning. Hasta el momento se ha experimentado con:
- SLEEC, sin mayor éxito (requiere una instalación particular de MEX que viene con Matlab 2017b, a la cual no logré acceder)
- C2AE, con resultados mediocres (no hay aprendizaje).


## Mounting Google Drive

In [2]:
# Select the data root: Google Drive when running on Colab, the relative
# path '../' otherwise.
try:
    from google.colab import drive
except ImportError:
    # google.colab is not importable, so this is not a Colab runtime.
    folder_path = '../'
else:
    # Only the import is guarded: a failed mount is reported instead of
    # silently falling back to a local path that would not hold the data.
    drive.mount('/content/drive')
    folder_path = 'drive/MyDrive/TesisMV/'

## Imports

In [3]:
from IPython.display import display
import os
import math
import random
import shutil

# Data treatment
import pandas as pd
import numpy as np
from scipy import sparse
from scipy.io import savemat
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import multilabel_confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

## Dataset and model selection

In [4]:
# Number of labels kept in the exported label matrices; it is passed to
# filter_labels() as the exact number of most frequent labels to select.
LABELS_IN_STUDY = 25  # top N labels will be exported

In [5]:
# Selects the features folder prefix: 'resnet50_' when True, 'resnet18_' otherwise.
USE_RN50 = False
# When True, data is read from the 'subchapters/' subfolder of each data directory.
SUBCHAPTERS = False
# Data-treatment flags of the dataset to export; joined with '_' they name the
# data folder.
DS_FLAGS = ['base']
    # Other flag names, with the transformations each one presumably stands
    # for (the transformations themselves are not defined in this notebook):
    # 'ref': [invertX, invertY],
    # 'rot': [rotate90, rotate180, rotate270],
    # 'crop': [crop] * CROP_TIMES,
    # 'blur': [blur],
    # 'emboss': [emboss],
    # 'randaug': [randaug],
    # 'rain': [rain],
    # 'elastic': [elastic]
# Repetition counts appended to the 'crop', 'randaug' and 'elastic' flag names
# when building the folder name.
CROP_TIMES = 1
RANDOM_TIMES = 1
ELASTIC_TIMES = 1

In [6]:
# This cell builds the data_flags variable, which maps the requested data
# treatment to the folders holding patterns, labels and features.

# Transformations whose flag carries a repetition count in the folder name.
MULTIPLE_TRANSF = ['crop', 'randaug', 'elastic']
MAP_TIMES = dict(zip(MULTIPLE_TRANSF, (CROP_TIMES, RANDOM_TIMES, ELASTIC_TIMES)))

DS_FLAGS = sorted(DS_FLAGS)
COPY_FLAGS = DS_FLAGS.copy()

# Repeated transformations are moved to the end of the flag list with their
# repetition count appended (e.g. 'crop' -> 'crop1').
for t in MULTIPLE_TRANSF:
    if t not in DS_FLAGS:
        continue
    COPY_FLAGS.remove(t)
    COPY_FLAGS.append(f'{t}{MAP_TIMES[t]}')

# An empty flag list falls back to the 'base' dataset.
data_flags = '_'.join(COPY_FLAGS) if COPY_FLAGS else 'base'

subchapter_str = 'subchapters/' if SUBCHAPTERS else ''
patterns_path = f'{folder_path}patterns/{subchapter_str}{data_flags}/'
labels_path = f'{folder_path}labels/{subchapter_str}{data_flags}/'
# From here on data_flags also carries the feature-extractor prefix.
data_flags = ('resnet50_' if USE_RN50 else 'resnet18_') + data_flags
features_path = f'{folder_path}features/{subchapter_str}{data_flags}/'

# Alternative configuration kept by the author (features saved per epoch):
#rn = 18
#ep = 65
#labels_path = folder_path + 'labels/' +  subchapter_str + data_flags + "/"
#data_flags = f'resnet50_{data_flags}_e{ep}' if USE_RN50 else f'resnet18_{data_flags}_e{ep}'
#features_path = folder_path + f"features/resnet{rn}_blur_each5/resnet{rn}_blur_e{ep}/"

print(features_path)
print(labels_path)
# Both directories must exist; the raised message ends with the pair
# (features directory exists, labels directory exists).
if not os.path.isdir(features_path) or not os.path.isdir(labels_path):
    raise FileNotFoundError("No existen directorios de datos para el conjunto de flags seleccionado. Verifique que el dataset exista y, de lo contrario, llame a Split and Augmentation {}".format(
        (os.path.isdir(features_path), os.path.isdir(labels_path))))
print("Features set encontrado en {}".format(features_path))
print("Labels set encontrado en {}".format(labels_path))

../features/resnet18_base/
../labels/base/
Features set encontrado en ../features/resnet18_base/
Labels set encontrado en ../labels/base/


In [7]:
# File names of the three data splits; the same names are appended to both
# features_path and labels_path when the data is loaded.
train_filename = "augmented_train_df.json"
val_filename = "val_df.json"
test_filename = "test_df.json"

## Exporting Data

In [8]:
# Load the feature dataframes of the train, validation and test splits (in
# that order), each sorted by its index.
features_train, features_val, features_test = (
    pd.read_json(features_path + split_file, orient='index').sort_index()
    for split_file in (train_filename, val_filename, test_filename)
)

# Load the label dataframes of the same splits, sorted the same way.
labels_train, labels_val, labels_test = (
    pd.read_json(labels_path + split_file, orient='index').sort_index()
    for split_file in (train_filename, val_filename, test_filename)
)

In [9]:
# Merge the test and validation splits (test rows first) into a single set.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; original indexes are kept, as append did.
features_test_val = pd.concat([features_test, features_val])
labels_test_val = pd.concat([labels_test, labels_val])

  features_test_val = pd.DataFrame.append(features_test, features_val )
  labels_test_val = pd.DataFrame.append(labels_test, labels_val)


In [10]:
def filter_labels(labels_df, freq, number_labels = None):
  """Select the most frequent labels of a label dataframe.

  Without number_labels, every label whose number of events (column sum) is
  strictly greater than freq is kept. With number_labels, the threshold
  starts at freq and is moved one unit at a time until exactly number_labels
  labels exceed it.

  Parameters:
  labels_df (DataFrame): dataframe of labels, one column per label.
  freq (int): threshold frequency (starting value of the search when
    number_labels is given). Labels whose count is not greater than the
    threshold are filtered out.
  number_labels (int, optional): exact number of labels to keep.

  Returns:
  (Series, int): counts of the kept labels sorted in descending order, and
    the threshold found by the search (always 0 when number_labels is not
    given).

  Raises:
  ValueError: if number_labels is negative, or if no threshold selects
    exactly number_labels labels (tied counts, or fewer labels available
    than requested).

  """
  # Column sums do not depend on the threshold, so compute them only once.
  label_counts = labels_df.sum(axis=0)

  if not number_labels:
    filtered_df = labels_df.loc[:, label_counts > freq]
    top_labels = filtered_df.sum().sort_values(ascending=False)
    return top_labels, 0

  if number_labels < 0:
    raise ValueError("number_labels must be non-negative, got {}".format(number_labels))

  pivot = freq
  last_step = 0
  while True:
    filtered_df = labels_df.loc[:, label_counts > pivot]
    top_labels = filtered_df.sum().sort_values(ascending=False)
    n_selected = len(top_labels.values)
    print(pivot, n_selected)

    if n_selected == number_labels:
      print("Aplicando threshold {} para trabajar con {} labels".format(pivot, n_selected))
      return top_labels, pivot

    # Too many labels -> raise the threshold; too few -> lower it.
    step = 1 if n_selected > number_labels else -1

    # The number of selected labels never grows as the threshold rises, so a
    # change of direction means the count jumped over number_labels and the
    # search would bounce between two thresholds forever.
    if step == -last_step:
      raise ValueError(
          "No threshold selects exactly {} labels (label counts are tied around threshold {})".format(
              number_labels, pivot))
    # Every label is already selected and there are still too few of them:
    # lowering the threshold further cannot add more.
    if step < 0 and n_selected == len(label_counts):
      raise ValueError(
          "Requested {} labels but only {} are available".format(
              number_labels, n_selected))

    pivot += step
    last_step = step

def filter_dfs(df, top_labels_df):
  """Keep only the columns of df that are named in the index of top_labels_df.

  Parameters:
  df (DataFrame): dataframe whose columns are filtered.
  top_labels_df (Series): its index holds the names of the columns to keep,
    as returned by filter_labels().

  Returns:
  DataFrame: df restricted to the columns present in both.

  """
  kept_columns = df.columns.intersection(top_labels_df.index)
  return df[kept_columns]

def combine_dfs(labels_df, top_labels_df, features_df):
  """Join labels and features on their index, keeping only the top labels.

  Parameters:
  labels_df (DataFrame): dataframe of labels, with pattern names as index.
  top_labels_df (Series): its index holds the names of the most frequent
    labels, as returned by filter_labels().
  features_df (DataFrame): dataframe of features, with pattern names as index.

  Returns:
  DataFrame: the kept label columns followed by the feature columns, merged
    on index.

  """
  # Both dataframes are expected to hold the same number of rows.
  assert len(features_df) == len(labels_df)
  kept_columns = labels_df.columns.intersection(top_labels_df.index)
  return labels_df[kept_columns].merge(features_df, left_index=True, right_index=True)

def split_data(final_df, top_labels_df, test_size):
  """Split the combined dataframe into train and test arrays.

  Parameters:
  final_df (DataFrame): outcome of combine_dfs(): label columns first,
    feature columns after them.
  top_labels_df (Series): most frequent labels. Its length is taken as the
    number of label columns, i.e. the column at which the labels (Y) end and
    the features (X) start. This assumes every top label is a column of
    final_df.
  test_size (float): proportion test/(test+train).

  Returns:
  (np.array, np.array, np.array, np.array): X train, X test, Y train, Y test

  """
  # The boundary between Y and X is the number of label columns, not the
  # number of rows of final_df.
  n_labels = len(top_labels_df)
  X = np.array(final_df.iloc[:, n_labels:], dtype=float)
  Y = np.array(final_df.iloc[:, :n_labels], dtype=float)
  # Fixed random_state keeps the split reproducible between runs.
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
  return X_train, X_test, Y_train, Y_test

In [11]:
# Features are exported as loaded. Validation and test examples go together
# as a single test split (presumably because the target implementations take
# no separate validation set).
X_train, X_test = features_train, features_test_val

# Pick the LABELS_IN_STUDY most frequent labels of the training split; 20 is
# only the starting threshold of the search.
top_labels_df, pivot = filter_labels(labels_train, 20, LABELS_IN_STUDY)

# Reduce the labels of both splits to that same set of labels.
Y_train, Y_test = (
    filter_dfs(split_labels, top_labels_df)
    for split_labels in (labels_train, labels_test_val)
)

20 31
21 31
22 29
23 28
24 28
25 25
Aplicando threshold 25 para trabajar con 25 labels


In [12]:
# Convert the four dataframes to numpy arrays, using the variable names
# passed to savemat in the SLEEC export below.
X, Xt, Y, Yt = (frame.to_numpy() for frame in (X_train, X_test, Y_train, Y_test))

# Report the shapes, one per line: train X, train Y, test X, test Y.
print(X.shape, Y.shape, Xt.shape, Yt.shape, sep='\n')
# TO SLEEC
#savemat(f"{data_flags}.mat", dict(X=X, Xt=Xt, Y=Y, Yt=Yt))

(542, 512)
(542, 25)
(234, 512)
(234, 25)


In [17]:
# TO C2AE
import pickle

# One pickle file per exported array, written to the working directory.
for pkl_name, payload in (
    ('kunisch-train-features.pkl', X),
    ('kunisch-train-labels.pkl', Y),
    ('kunisch-test-features.pkl', Xt),
    ('kunisch-test-labels.pkl', Yt),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(payload, f)

### Results

|    **Modelo**   	| **Tradicional** 	| **C2AE** 	|
|:---------------:	|:---------------:	|:--------:	|
| Precision Micro 	|       0.08      	|   0.14   	|
| Precision Macro 	|       0.08      	|   0.07   	|
|   Recall Micro  	|       0.62      	|   0.65   	|
|   Recall Macro  	|       0.41      	|   0.31   	|
|     F1 Micro    	|       0.15      	|   0.24   	|
|     F1 Macro    	|       0.12      	|   0.11   	|