# Aprendizaje Multietiqueta de Patrones Geométricos en Objetos de Herencia Cultural
# Export DATA
## Seminario de Tesis II, Primavera 2022 
### Master of Data Science, Universidad de Chile.
#### Supervisor: Prof. Benjamín Bustos, Prof. Iván Sipirán
#### Author: Matías Vergara

### References:
- [SLEEC Homepage](http://manikvarma.org/code/SLEEC/download.html)
- [SLEEC Paper: Sparse Local Embeddings for Extreme Multi-label Classification](https://papers.nips.cc/paper/2015/hash/35051070e572e47d2c26c241ab88307f-Abstract.html)
- [The Emerging Trends of Multi-Label Learning](https://arxiv.org/abs/2011.11197)
- [GitHub: C2AE Multilabel Classification](https://github.com/dhruvramani/C2AE-Multilabel-Classification)
- 'Learning Deep Latent Spaces for Multi-Label Classification', published in AAAI 2017

Este notebook tiene por finalidad exportar un conjunto de datos (a seleccionar con las celdas de selección habituales) al formato requerido por implementaciones oficiales de los modelos presentados en el paper de Emerging Trends of Multi-Label Learning. Hasta el momento se ha experimentado con:
- SLEEC, sin mayor éxito (requiere una instalación particular de MeX que viene con Matlab 2017b, al cual no logré acceder)
- C2AE, con resultados mediocres.


## Mounting Google Drive

In [1]:
# Resolve the project root: the mounted Google Drive folder when running on
# Colab, otherwise the parent of the current working directory.
try:
    from google.colab import drive
except ImportError:
    # google.colab is only importable inside Colab; anywhere else fall back
    # to the local repository layout.
    root_dir = '..'
else:
    # Kept outside the try so a failed mount on Colab is reported instead of
    # silently pointing root_dir at a folder that does not hold the data.
    drive.mount('/content/drive')
    root_dir = 'drive/MyDrive/TesisMV/'

## Imports

In [2]:
from IPython.display import display
import os
import math
import random
import shutil
import pickle

# Data treatment
import pandas as pd
import numpy as np
from scipy import sparse
from scipy.io import savemat
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import multilabel_confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

from utils import KunischPruner
from utils import KunischMetrics

## Dataset and model selection

In [3]:
# Number of labels handed to KunischPruner and used in the name of the
# top-labels pickle file.
LABELS_IN_STUDY = 26 # top N labels will be exported to Matlab
# Used as the last path component of the patterns/labels folders and as the
# "_K<n>" suffix of the features folder (presumably a fold/split index --
# TODO confirm).
K = 0

In [5]:
# True selects the 'resnet50_K<n>' features folder, False 'resnet18_K<n>'.
USE_RN50 = False
# True prefixes the patterns/labels dataset folder name with 'subchapters'.
SUBCHAPTERS = False
# Flag names describing the data treatment of the dataset to export.
# An empty list selects the 'base' dataset folder.
DS_FLAGS = []
    # Flag names listed by the author, with the transformations they stand for:
    # 'ref': [invertX, invertY],
    # 'rot': [rotate90, rotate180, rotate270],
    # 'crop': [crop] * CROP_TIMES,
    # 'blur': [blur],
    # 'emboss': [emboss],
    # 'randaug': [randaug],
    # 'rain': [rain],
    # 'elastic': [elastic]
# Repetition counts appended to the 'crop', 'randaug' and 'elastic' flag
# names when the dataset folder name is built.
CROP_TIMES = 1
RANDOM_TIMES = 1
ELASTIC_TIMES = 1

In [6]:
# Translate the requested data treatment (DS_FLAGS) into the folder names
# where patterns, labels and features are stored, then check they exist.
MAP_TIMES = {
    'crop': CROP_TIMES,
    'randaug': RANDOM_TIMES,
    'elastic': ELASTIC_TIMES,
}
MULTIPLE_TRANSF = ['crop', 'randaug', 'elastic']

DS_FLAGS = sorted(DS_FLAGS)
COPY_FLAGS = DS_FLAGS.copy()
data_flags = '_'.join(DS_FLAGS) if DS_FLAGS else 'base'

# Repeatable transformations carry their repetition count in the folder
# name: the plain flag is removed and re-appended at the end with its count.
for t in MULTIPLE_TRANSF:
    if t not in DS_FLAGS:
        continue
    COPY_FLAGS.remove(t)
    COPY_FLAGS.append(f'{t}{MAP_TIMES[t]}')
    data_flags = '_'.join(COPY_FLAGS)

subchapter_str = 'subchapters' if SUBCHAPTERS else ''
dataset_folder = subchapter_str + data_flags
k_folder = str(K)
patterns_dir = os.path.join(root_dir, 'patterns', dataset_folder, k_folder)
labels_dir = os.path.join(root_dir, 'labels', dataset_folder, k_folder)

# Features live under features/resnet/<flags>/<backbone>_K<K>.
backbone = 'resnet50' if USE_RN50 else 'resnet18'
features_dir = os.path.join(root_dir, 'features', 'resnet', data_flags,
                            f'{backbone}_K{K}')

print(features_dir)
print(labels_dir)

# Stop early when either folder is missing; the message reports which of
# (features, labels) was found.
features_found = os.path.isdir(features_dir)
labels_found = os.path.isdir(labels_dir)
if not features_found or not labels_found:
    raise FileNotFoundError("No existen directorios de datos para el conjunto de flags seleccionado. Verifique que el dataset exista y, de lo contrario, llame a Split and Augmentation {}".format(
        (features_found, labels_found)))
print("Features set encontrado en {}".format(features_dir))
print("Labels set encontrado en {}".format(labels_dir))

..\features\resnet\base\resnet18_K0
..\labels\base\0
Features set encontrado en ..\features\resnet\base\resnet18_K0
Labels set encontrado en ..\labels\base\0


In [7]:
# File names of the train/validation/test tables; the same names are used
# inside both the features folder and the labels folder.
train_filename = "augmented_train_df.json"
val_filename = "val_df.json"
test_filename = "test_df.json"

In [8]:
# Load the feature and label tables of every split. The JSON files are
# index-oriented (one top-level key per row).
split_filenames = (train_filename, val_filename, test_filename)

features_train, features_val, features_test = [
    pd.read_json(os.path.join(features_dir, name), orient='index')
    for name in split_filenames
]

labels_train, labels_val, labels_test = [
    pd.read_json(os.path.join(labels_dir, name), orient='index')
    for name in split_filenames
]

In [9]:
# Build the label pruner and hand it the pickled top-labels object.
pruner = KunischPruner(LABELS_IN_STUDY)

# Resolve the pickle from root_dir, like every other data path in this
# notebook, so the cell also works when the project lives on Google Drive.
top_labels_path = os.path.join(root_dir, 'labels', f'top_{LABELS_IN_STUDY}L.pickle')

# NOTE: pickle.load can execute arbitrary code; only load this file from a
# trusted copy of the project data.
with open(top_labels_path, 'rb') as f:
    top_labels = pickle.load(f)
pruner.set_top_labels(top_labels)

## Exporting Data with Features

In [40]:
# Features are exported exactly as loaded; every label table is passed
# through the pruner (train, then val, then test).
X_train, X_val, X_test = features_train, features_val, features_test
Y_train, Y_val, Y_test = (
    pruner.filter_df(labels)
    for labels in (labels_train, labels_val, labels_test)
)

20 26
Aplicando threshold 20 para trabajar con 26 labels


In [41]:
# Convert the train/test tables to plain arrays (the validation split is not
# part of this export) and show their shapes, one per line.
X, Y = X_train.to_numpy(), Y_train.to_numpy()
Xt, Yt = X_test.to_numpy(), Y_test.to_numpy()
print(X.shape, Y.shape, Xt.shape, Yt.shape, sep='\n')

# SLEEC export to a .mat file (currently disabled):
#savemat(f"{data_flags}.mat", dict(X=X, Xt=Xt, Y=Y, Yt=Yt))

(504, 512)
(504, 26)
(194, 512)
(194, 26)


In [42]:
# C2AE export: one pickle per array, written to the current working
# directory.
import pickle

c2ae_exports = {
    'kunisch-train-features.pkl': X,
    'kunisch-train-labels.pkl': Y,
    'kunisch-test-features.pkl': Xt,
    'kunisch-test-labels.pkl': Yt,
}
for pkl_name, array in c2ae_exports.items():
    with open(pkl_name, 'wb') as f:
        pickle.dump(array, f)

### Results

|    **Modelo**   	| **Tradicional** 	| **C2AE** 	|
|:---------------:	|:---------------:	|:--------:	|
| Precision Micro 	|       0.08      	|   0.14   	|
| Precision Macro 	|       0.08      	|   0.07   	|
|   Recall Micro  	|       0.62      	|   0.65   	|
|   Recall Macro  	|       0.41      	|   0.31   	|
|     F1 Micro    	|       0.15      	|   0.24   	|
|     F1 Macro    	|       0.12      	|   0.11   	|

## Exporting Data with Patterns as Features

In [10]:
# From here on features_dir is the output folder for the raw-pattern
# features: features/patterns/<flags>/K<K>. It is created if missing.
pattern_features_parts = (root_dir, 'features', 'patterns', data_flags, f'K{K}')
features_dir = os.path.join(*pattern_features_parts)
os.makedirs(features_dir, exist_ok=True)

In [16]:
# Reduce every label table with the pruner.
Y_train = pruner.filter_df(labels_train)
Y_test = pruner.filter_df(labels_test)
Y_val = pruner.filter_df(labels_val)

# One dict per split, mapping image name -> flattened pixel vector.
images_train = {}
images_val = {}
images_test = {}
datasets = {'train': images_train,
            'val': images_val,
            'test': images_test}

# Plan for this and the following cells:
#   1. load the images keyed by their name,
#   2. build a dataframe per split,
#   3. sort labels and images by index so both follow the same order,
#   4. shuffle both with the same permutation,
#   5. save them.
# Expected layout on disk: patterns_dir/<split>/<chapter>/<image file>.
for dataset, images in datasets.items():
    print(dataset)
    split_dir = os.path.join(patterns_dir, dataset)
    for chapter in os.listdir(split_dir):
        chapter_dir = os.path.join(split_dir, chapter)
        for file in os.listdir(chapter_dir):
            img_path = os.path.join(chapter_dir, file)
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread does not raise on unreadable or non-image
                # files; fail here with the path instead of letting
                # cvtColor raise an opaque assertion error.
                raise ValueError(f"No se pudo leer la imagen {img_path}")
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            img = cv2.resize(img, (227, 227))
            # Min-max scale the pixel values to [0, 1] as float64.
            img = cv2.normalize(img, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_64F)
            img = img.flatten()
            # The key is the file name up to its first dot.
            img_name = file.split('.')[0]
            images[img_name] = img

print(len(images_train))

train
val
test
504


In [17]:
# One dataframe per split: rows are indexed by image name, columns hold the
# flattened pixel values.
df_train, df_val, df_test = (
    pd.DataFrame.from_dict(images, orient='index')
    for images in (images_train, images_val, images_test)
)

In [18]:
# Put labels and pixel frames of every split in index order first.
labels_train, labels_val, labels_test = (
    labels.sort_index() for labels in (Y_train, Y_val, Y_test)
)
df_train, df_val, df_test = (
    frame.sort_index() for frame in (df_train, df_val, df_test)
)

# Shuffle each split, applying the same random permutation of the label
# index to both the labels and the pixel frame (train, then val, then test).
# NOTE(review): reindex inserts NaN rows for keys missing from a frame --
# confirm that the image names match the label index.
idx = np.random.permutation(labels_train.index)
labels_train, df_train = labels_train.reindex(idx), df_train.reindex(idx)

idx = np.random.permutation(labels_val.index)
labels_val, df_val = labels_val.reindex(idx), df_val.reindex(idx)

idx = np.random.permutation(labels_test.index)
labels_test, df_test = labels_test.reindex(idx), df_test.reindex(idx)

In [19]:
# Sanity check: display the shape of the training pixel matrix
# (one row per image, one column per pixel of the flattened 227x227 image).
df_train.values.shape

(504, 51529)

In [20]:
# Write the pixel frames of every split to features_dir as index-oriented
# JSON, using the same file names as the other feature sets. Only the
# feature frames are written here (the earlier pickle export of features and
# labels is no longer used).
pattern_exports = (
    (df_train, train_filename),
    (df_val, val_filename),
    (df_test, test_filename),
)
for frame, filename in pattern_exports:
    frame.to_json(os.path.join(features_dir, filename), orient='index')