In [14]:
import re
import nltk
import torch
import fasttext
import unicodedata
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from os import putenv
from tqdm import trange
from torch import Tensor
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from torch.nn.functional import one_hot
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Needed to use AMD GPUs through ROCm: override the GFX version.
# os.environ is assigned instead of calling os.putenv() because putenv() does
# not update os.environ, so the override would be invisible to Python code that
# reads the environment; assigning to os.environ also calls putenv() internally.
import os
os.environ["HSA_OVERRIDE_GFX_VERSION"] = "10.3.0"

# Global experiment configuration.
EXPERIMENTAR = True
NUM_CLASES = 2          # number of target classes (used for one-hot encoding and model outputs)
RANDOM_STATE = 45       # seed shared by numpy, torch and the train/test splits
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

# Spanish stop-word list used by eliminar_stopwords().
nltk.download('stopwords')
STOPWORDS = stopwords.words("spanish")

[nltk_data] Downloading package stopwords to /home/luis-
[nltk_data]     beto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<h1>Arquitectura de capas completamente conectadas</h1>

In [2]:
class FeedForwardNeuralNetwork(nn.Module):
    """Fully connected classifier that outputs raw (unnormalised) logits.

    Args:
        input_size: number of input features.
        hiden_sizes: width of every hidden layer, in order.
        output_size: number of output units (one per class).

    Every linear layer uses Xavier-uniform weights and zero biases. Hidden
    layers are preceded by dropout and followed by ReLU; the output layer has
    neither, because the loss function is expected to work on logits.
    """

    def __init__(self, input_size: int, hiden_sizes: list[int], output_size: int):
        super().__init__()
        self.fcl = nn.ModuleList()
        self.act = nn.ModuleList()
        # The first dropout (p=0.2) is the one applied to the network input.
        self.drop = nn.ModuleList([nn.Dropout(0.2)])

        # Consecutive width pairs define the linear layers, so there is one
        # layer fewer than entries in this list.
        anchos = [input_size] + hiden_sizes
        anchos.append(output_size)
        for n_entrada, n_salida in zip(anchos[:-1], anchos[1:]):
            capa = nn.Linear(n_entrada, n_salida)
            nn.init.xavier_uniform_(capa.weight)
            nn.init.zeros_(capa.bias)
            self.fcl.append(capa)

        # One activation (and one default-probability dropout) per hidden layer;
        # the output layer is left without activation.
        for _ in range(len(anchos) - 2):
            self.act.append(nn.ReLU())
            self.drop.append(nn.Dropout())

    def forward(self, X):
        """Return the logits for the batch ``X``."""
        salida = X
        # zip() stops at the shortest list (the activations), which leaves the
        # last linear layer for the final, activation-free projection.
        for dropout, lineal, activacion in zip(self.drop, self.fcl, self.act):
            salida = activacion(lineal(dropout(salida)))
        return self.fcl[-1](salida)

<h1>Funciones de utilidad</h1>

In [3]:
def normaliza_texto(input_str,
                    punct=False,
                    accents=False,
                    num=False,
                    max_dup=2):
    """
        Clean a text by dropping unwanted characters and squeezing repeats.

        punct=False   removes punctuation and symbols (True keeps them)
        accents=False removes accents, i.e. combining marks (True keeps them)
        num=False     removes digits (True keeps them)
        max_dup=2     maximum run of identical consecutive characters (rrrrr => rr)

        The text is not lower-cased. The result is stripped and every run of
        whitespace is collapsed into a single space.
    """
    # Characters that are dropped outright, depending on the flags.
    descartables = set()
    if not punct:
        descartables.update(";:,.\\-\"'/" + "()[]¿?¡!{}~<>|")
    if not num:
        descartables.update("0123456789")

    conservados = []
    ultimo, repeticiones = '', 0
    # NFKD splits accented letters into base letter + combining mark.
    for caracter in unicodedata.normalize('NFKD', input_str):
        if caracter in descartables:
            continue
        if unicodedata.combining(caracter) and not accents:
            continue
        # Dropped characters do not interrupt a run: only kept characters
        # update `ultimo`.
        if caracter != ultimo:
            repeticiones = 0
        else:
            repeticiones += 1
            if repeticiones >= max_dup:
                continue
        conservados.append(caracter)
        ultimo = caracter

    limpio = unicodedata.normalize('NFKD', "".join(conservados)).strip()
    return re.sub(r'(\s)+', r' ', limpio, flags=re.IGNORECASE)

def eliminar_stopwords(texto: str):
    """Return ``texto`` without the whitespace-separated tokens found in STOPWORDS."""
    conservadas = filter(lambda palabra: palabra not in STOPWORDS, texto.split())
    return ' '.join(conservadas)
    
def aplicar_stemming(texto: str):
    """Return ``texto`` with every whitespace-separated token replaced by its Spanish Snowball stem."""
    raices = map(SnowballStemmer("spanish").stem, texto.split())
    return ' '.join(raices)

def preprocesar(texto: str):
    """Preprocessing hook passed to the TF-IDF vectorizer: applies normaliza_texto with its defaults."""
    return normaliza_texto(texto)

def leer_datos(filename, raw=False):
    """Read a JSON-lines dataset and return ``(texts, labels)``.

    Args:
        filename: path of a JSON-lines file with ``text`` and ``klass`` fields.
        raw: when True the texts are returned as a pandas Series; otherwise
            they are converted to a numpy array.

    Returns:
        Tuple ``(X, Y)`` where ``Y`` is always a numpy array.
    """
    tabla = pd.read_json(filename, lines=True)
    textos = tabla['text'] if raw else tabla['text'].to_numpy()
    return textos, tabla['klass'].to_numpy()

def leer_datos_embeddings(filename, variante: str):
    """Read precomputed embeddings and labels from a JSON-lines dataset.

    Args:
        filename: path of a JSON-lines file with a ``klass`` field and the
            embedding fields ``we_mx``, ``we_es`` and ``we_ft``.
        variante: ``'MX'`` selects ``we_mx``, ``'ES'`` selects ``we_es``; any
            other value selects ``we_ft``.

    Returns:
        Tuple ``(X, Y)``: the embeddings stacked into one numpy array and the
        labels as a numpy array.
    """
    tabla = pd.read_json(filename, lines=True)
    etiquetas = tabla['klass'].to_numpy()

    columna = {'MX': 'we_mx', 'ES': 'we_es'}.get(variante, 'we_ft')
    vectores = tabla[columna].to_numpy()
    return np.array(vectores.tolist()), etiquetas

def vectorizar_TF_IDF(X: list, ngram_range=(1,2), max_features=None):
    """Vectorize several text collections with one shared TF-IDF vocabulary.

    The vectorizer is fitted on ``X[0]`` only; the remaining collections are
    transformed with that fitted vocabulary. ``preprocesar`` is used as the
    vectorizer's preprocessor.

    Returns:
        List with one TF-IDF matrix per collection in ``X``, in the same order.
    """
    vectorizador = TfidfVectorizer(
        analyzer='word',
        preprocessor=preprocesar,
        ngram_range=ngram_range,
        max_features=max_features,
    )
    ajustada = vectorizador.fit_transform(X[0])
    return [ajustada] + [vectorizador.transform(coleccion) for coleccion in X[1:]]

def vectorizar_embeddings(X: list, variante: str):
    """Turn collections of sentences into fastText sentence vectors.

    Args:
        X: list of collections exposing ``.map`` and ``.to_numpy`` (the code
            relies on that interface, e.g. pandas Series of strings).
        variante: which fastText model to load: ``'MX'``, ``'ES'`` or ``'GEN'``.

    Returns:
        List with one stacked numpy matrix (one row per sentence) per
        collection in ``X``.

    Raises:
        ValueError: if ``variante`` is not one of the supported models. The
            original code failed later with an UnboundLocalError instead.
    """
    rutas_modelos = {
        'MX': './fasttext/MX.bin',
        'ES': './fasttext/ES.bin',
        'GEN': './fasttext/cc.es.300.bin',
    }
    # Validate before loading anything: model files are large.
    if variante not in rutas_modelos:
        raise ValueError(
            f"variante desconocida: {variante!r}; se esperaba una de {sorted(rutas_modelos)}"
        )
    ft = fasttext.load_model(rutas_modelos[variante])

    res = []
    for serie in X:
        vectores = serie.map(ft.get_sentence_vector)
        res.append(np.vstack(vectores.to_numpy()))
    return res
    
def split(X, Y, test=0, test_size=0.2, val=0, val_size=0.1):
    """Stratified split of ``(X, Y)`` into train / test / validation subsets.

    Args:
        X, Y: samples and labels.
        test: a value > 0 requests a test subset of proportion ``test_size``.
        val: a value > 0 requests a validation subset of proportion
            ``val_size``. When a test subset is also requested the validation
            subset is taken from the remaining training data, so ``val_size``
            is relative to that remainder.

    Returns:
        - test and val: ``(X_train, Y_train, X_test, Y_test, X_val, Y_val)``
        - test only:    ``(X_train, Y_train, X_test, Y_test)``
        - val only:     ``(X_train, Y_train, X_val, Y_val)``
        - neither:      ``(X, Y)`` unchanged (previously this case silently
          returned ``None``, which broke tuple unpacking in callers).
    """
    if test > 0:
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, stratify=Y, random_state=RANDOM_STATE)
        if val > 0:
            X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=val_size, stratify=Y_train, random_state=RANDOM_STATE)
            return X_train, Y_train, X_test, Y_test, X_val, Y_val
        return X_train, Y_train, X_test, Y_test
    if val > 0:
        X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=val_size, stratify=Y, random_state=RANDOM_STATE)
        return X_train, Y_train, X_val, Y_val
    # No split requested: everything is training data.
    return X, Y

def balancear(X, Y):
    """Oversample the minority class with SMOTE and return the resampled ``(X, Y)``.

    The sampler is seeded with RANDOM_STATE, like the train/test splits in this
    file, so that repeated calls on the same data produce the same synthetic
    samples.
    """
    sampler = SMOTE(sampling_strategy='minority', random_state=RANDOM_STATE)
    return sampler.fit_resample(X, Y)
    
def torchificar(X: list, Y: list, representacion: str, one_hot_encoding=True, decode=False):
    """Convert feature matrices and label arrays into torch tensors.

    Args:
        X: list of feature matrices. For ``'TF-IDF'`` each one must expose
            ``.toarray()`` (sparse matrix); for ``'Embeddings'`` each one must
            be a numpy array.
        Y: list of label arrays.
        representacion: ``'TF-IDF'`` or ``'Embeddings'``.
        one_hot_encoding: when True, ONLY the first label tensor (``Y[0]``) is
            one-hot encoded (as float) with NUM_CLASES classes; the remaining
            label tensors keep their class indices.
        decode: when True, labels are mapped to integer codes with a
            LabelEncoder.

    Returns:
        Tuple ``(X_torch, Y_torch)`` of tensor lists, moved to the GPU when
        CUDA is available. Features are float32.

    Raises:
        ValueError: if ``representacion`` is not a supported value (the
            original code failed with an UnboundLocalError instead).
    """
    if representacion not in ('TF-IDF', 'Embeddings'):
        raise ValueError(
            f"representacion desconocida: {representacion!r}; se esperaba 'TF-IDF' o 'Embeddings'"
        )
    usar_gpu = torch.cuda.is_available()

    X_torch = []
    for x in X:
        if representacion == 'TF-IDF':
            x_np = x.toarray().astype(np.float32)
        else:
            x_np = x.astype(np.float32)
        x_torch = torch.from_numpy(x_np)
        X_torch.append(x_torch.cuda() if usar_gpu else x_torch)

    le = LabelEncoder() if decode else None
    Y_torch = []
    for posicion, y in enumerate(Y):
        if le is None:
            y_ = y
        elif posicion == 0:
            # Fit the label mapping once, on the first label array ...
            y_ = le.fit_transform(y)
        else:
            # ... and reuse it, so every split shares the same label -> code
            # mapping. Re-fitting per array could assign different codes when
            # a split lacks one of the classes.
            y_ = le.transform(y)
        y_torch = torch.from_numpy(y_)
        Y_torch.append(y_torch.cuda() if usar_gpu else y_torch)

    # Guard against an empty Y (e.g. unlabeled data only).
    if one_hot_encoding and Y_torch:
        Y_torch[0] = one_hot(Y_torch[0], num_classes=NUM_CLASES).float()

    return X_torch, Y_torch

def create_minibatches(X, Y, batch_size, X2=None):
    """Build a shuffling DataLoader over ``(X, Y)`` or ``(X, X2, Y)``.

    Args:
        X: feature tensor.
        Y: label tensor.
        batch_size: number of samples per batch.
        X2: optional second feature tensor (fused models). When given, every
            batch yields ``(X, X2, Y)`` instead of ``(X, Y)``.

    Returns:
        A ``DataLoader`` that reshuffles the samples on every iteration.
    """
    # Identity check: `X2 == None` would invoke the tensor's equality operator.
    if X2 is None:
        dataset = TensorDataset(X, Y)
    else:
        dataset = TensorDataset(X, X2, Y)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

def _bucle_entrenamiento(entradas_train, Y_train, modelo, optimizador, funcion_perdida,
    evaluar_f1,
    verbose,
    epocas,
    batch_size,
    error_minimo,
    rango_epocas,
    umbral_mejora
    ):
    """Shared training loop behind entrenar() and entrenar_fused().

    Args:
        entradas_train: tuple with one or two input tensors; they are passed
            to the model positionally, in order.
        evaluar_f1: zero-argument callable returning the validation F1 after
            an epoch, or None to skip the evaluation.

    Returns:
        ``(historial_perdida, historial_f1)``: mean batch loss per epoch and
        validation F1 per epoch (empty when no evaluation is done).
    """
    historial_perdida = []
    historial_f1 = []

    if torch.cuda.is_available():
        modelo.cuda()

    # The loader reshuffles on every iteration, so it is built only once.
    X2 = entradas_train[1] if len(entradas_train) > 1 else None
    dataloader = create_minibatches(entradas_train[0], Y_train, batch_size=batch_size, X2=X2)

    for _ in trange(epocas, desc='Entrenando'):
        modelo.train()
        perdida_acumulada = 0
        for *X_tr, Y_tr in dataloader:
            optimizador.zero_grad()
            loss = funcion_perdida(modelo(*X_tr), Y_tr)
            perdida_acumulada += loss.item()
            loss.backward()
            optimizador.step()

        perdida = perdida_acumulada / len(dataloader)
        historial_perdida.append(perdida)
        if evaluar_f1 is not None:
            f1 = evaluar_f1()
            historial_f1.append(f1)

            if verbose:
                print('Pérdida: ', perdida)
                print('F1-score: ', f1)

        stop = detener_entrenamiento(historial_perdida, error_minimo, rango_epocas, umbral_mejora)
        if stop is not None:
            print(stop)
            break

    return historial_perdida, historial_f1

def entrenar(X_train, Y_train, modelo: nn.Module, optimizador: optim.Optimizer, funcion_perdida,
    evaluar=True,
    X_val = None,
    Y_val = None,
    verbose=False,
    epocas=100,
    batch_size=32,
    error_minimo=0.02,
    rango_epocas=5,
    umbral_mejora=0.003
    ):
    """Train a single-input model with mini-batches and early stopping.

    Args:
        X_train, Y_train: training tensors.
        modelo: network to train (moved to the GPU when CUDA is available).
        optimizador: optimizer already bound to ``modelo``'s parameters.
        funcion_perdida: callable ``(logits, targets) -> loss``.
        evaluar: when True, the macro F1 on ``(X_val, Y_val)`` is recorded
            after every epoch.
        X_val, Y_val: validation tensors; required when ``evaluar`` is True.
        verbose: when True (and evaluating), print loss and F1 per epoch.
        epocas: maximum number of epochs.
        batch_size: mini-batch size.
        error_minimo, rango_epocas, umbral_mejora: early-stopping settings
            forwarded to detener_entrenamiento().

    Returns:
        ``(historial_perdida, historial_f1)``.

    Raises:
        ValueError: if ``evaluar`` is True but validation data is missing.
    """
    evaluar_f1 = None
    if evaluar:
        if X_val is None or Y_val is None:
            raise ValueError("evaluar=True requiere X_val y Y_val")
        evaluar_f1 = lambda: test(X_val, Y_val, modelo)[0]

    return _bucle_entrenamiento(
        (X_train,), Y_train, modelo, optimizador, funcion_perdida,
        evaluar_f1, verbose, epocas, batch_size, error_minimo, rango_epocas, umbral_mejora
        )

def entrenar_fused(X_train_tfidf, X_train_we, Y_train, modelo: nn.Module, optimizador: optim.Optimizer, funcion_perdida,
    evaluar=True,
    X_val_tfidf = None,
    X_val_we = None,
    Y_val = None,
    verbose=False,
    epocas=100,
    batch_size=32,
    error_minimo=0.02,
    rango_epocas=5,
    umbral_mejora=0.003
    ):
    """Train a two-input (TF-IDF + embeddings) model; see entrenar().

    The model is called as ``modelo(X_tfidf, X_we)``. Validation, when
    enabled, uses test_fused() on ``(X_val_tfidf, X_val_we, Y_val)``.

    Returns:
        ``(historial_perdida, historial_f1)``.

    Raises:
        ValueError: if ``evaluar`` is True but validation data is missing.
    """
    evaluar_f1 = None
    if evaluar:
        if X_val_tfidf is None or X_val_we is None or Y_val is None:
            raise ValueError("evaluar=True requiere X_val_tfidf, X_val_we y Y_val")
        evaluar_f1 = lambda: test_fused(X_val_tfidf, X_val_we, Y_val, modelo)[0]

    return _bucle_entrenamiento(
        (X_train_tfidf, X_train_we), Y_train, modelo, optimizador, funcion_perdida,
        evaluar_f1, verbose, epocas, batch_size, error_minimo, rango_epocas, umbral_mejora
        )

def detener_entrenamiento(historial_perdida: list, error_minimo, rango_epocas, umbral_mejora):
    """Decide whether training should stop early.

    Args:
        historial_perdida: loss per epoch so far (must not be empty).
        error_minimo: stop as soon as the latest loss is below this value.
        rango_epocas: size of the two consecutive windows that are compared.
        umbral_mejora: minimum drop in mean loss between the previous window
            and the latest one; a smaller drop stops the training.

    Returns:
        A message describing the stop reason, or None to keep training. The
        window comparison only starts once there are more than
        ``2 * rango_epocas`` recorded epochs.
    """
    if historial_perdida[-1] < error_minimo:
        return 'Paro por error mínimo'
    if len(historial_perdida) <= 2 * rango_epocas:
        return None

    ventana_previa = historial_perdida[-2 * rango_epocas : -rango_epocas]
    ventana_reciente = historial_perdida[-rango_epocas:]
    if np.mean(ventana_previa) - np.mean(ventana_reciente) < umbral_mejora:
        return 'Paro por falta de mejora'
    return None

def test(X, Y, modelo: nn.Module, verbose=False, pred=False):
    modelo.eval()
    with torch.no_grad():
        Y_pred = modelo(X)
        Y_pred = torch.softmax(Y_pred, dim=1)
        Y_pred = torch.argmax(Y_pred, dim=1)
        
        if pred:
            return Y_pred
        else:
            if torch.cuda.is_available():
                Y_ = Y.cpu()
                Y_pred = Y_pred.cpu()
            else:
                Y_ = Y
            
            f1 = f1_score(Y_, Y_pred, average='macro')
            a = accuracy_score(Y_, Y_pred)
            p = precision_score(Y_, Y_pred, average='macro')
            r = recall_score(Y_, Y_pred, average='macro')
            if verbose:
                print("F1-score: ", f1)
            return f1, a, p, r

def test_fused(X_tfidf, X_we, Y, modelo: nn.Module, verbose=False, pred=False):
    modelo.eval()
    with torch.no_grad():
        Y_pred = modelo(X_tfidf, X_we)
        Y_pred = torch.softmax(Y_pred, dim=1)
        Y_pred = torch.argmax(Y_pred, dim=1)
        
        if pred:
            return Y_pred
        else:
            if torch.cuda.is_available():
                Y_ = Y.cpu()
                Y_pred = Y_pred.cpu()
            else:
                Y_ = Y
            
            f1 = f1_score(Y_, Y_pred, average='macro')
            a = accuracy_score(Y_, Y_pred)
            p = precision_score(Y_, Y_pred, average='macro')
            r = recall_score(Y_, Y_pred, average='macro')
            if verbose:
                print("F1-score: ", f1)
            return f1, a, p, r

def guardar_modelo(modelo_state: dict, caracteristicas: dict, filename_modelo: str, 
    filename_caracteristicas: str='./Resultados/DiccionarioModelos.txt', 
    file_mode='a'
    ):
    """Persist a model state and log its description.

    Args:
        modelo_state: object saved with ``torch.save`` (a state dict).
        caracteristicas: description of the model; its text representation is
            written as one line of the log file.
        filename_modelo: destination of the saved state.
        filename_caracteristicas: log file that receives the description.
        file_mode: mode used to open the log file (append by default).
    """
    torch.save(modelo_state, filename_modelo)
    with open(filename_caracteristicas, file_mode) as registro:
        print(caracteristicas, file=registro)

def cargar_modelo(filename, caracteristicas: dict, gpu=True):
    """Rebuild a FeedForwardNeuralNetwork and load its saved weights.

    Args:
        filename: path of a state dict saved with ``torch.save``.
        caracteristicas: dict with the keys ``'Entradas'``, ``'Arquitectura'``
            and ``'Salidas'`` describing the network to rebuild.
        gpu: when True and CUDA is available, the model is moved to the GPU.

    Returns:
        The model with the loaded weights.
    """
    modelo = FeedForwardNeuralNetwork(caracteristicas['Entradas'], caracteristicas['Arquitectura'], caracteristicas['Salidas'])
    # Load onto the CPU first: a checkpoint saved from the GPU cannot be
    # deserialized on a machine without CUDA unless map_location is given.
    estado = torch.load(filename, weights_only=True, map_location='cpu')
    modelo.load_state_dict(estado)
    if torch.cuda.is_available() and gpu:
        modelo.cuda()
    return modelo

def graficar(data, titulo, x_label, y_label, filename, color='aqua'):
    """Save a 16x9 line plot of ``data`` against its index.

    Args:
        data: sequence of values to plot.
        titulo: figure title.
        x_label, y_label: axis labels (``y_label`` also labels the line).
        filename: destination of the saved figure.
        color: line colour.

    The figure is closed after saving.
    """
    plt.figure(figsize=(16, 9))
    indices = np.arange(len(data))
    plt.plot(indices, data, color=color, linestyle='-', linewidth=1, label=y_label)
    plt.title(titulo)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.grid(True)
    plt.savefig(filename)
    plt.close()

def guardar_predicciones(Y: Tensor, filename: str):
    """Write predictions to a CSV file with the columns ``id`` and ``klass``.

    ``id`` numbers the rows starting at 1; ``klass`` holds the values of ``Y``.
    The row index is not written.
    """
    etiquetas = (Y.cpu() if torch.cuda.is_available() else Y).numpy()
    tabla = pd.DataFrame({
        'id': range(1, len(etiquetas) + 1),
        'klass': etiquetas,
    })
    tabla.to_csv(filename, index=False)
    
def graficar_conjunto(listas_modelos, nombres_labels, titulo, filename, epocas, opacidad=0.8, error_max=100_000, desc=''):
    """Plot the loss curves of several groups of models in one figure.

    Args:
        listas_modelos: list of groups; each group is a list of dicts holding
            a ``'HistorialPerdida'`` list.
        nombres_labels: legend label of each group, indexed like the groups.
        titulo: figure title.
        filename: destination of the saved figure.
        epocas: length of the x axis; shorter histories are padded with None.
        opacidad: alpha of every curve.
        error_max: histories whose maximum loss exceeds this value are not
            drawn.
        desc: free text written at the bottom of the figure.

    Each group gets its own colour and line style. The figure is closed after
    saving.
    """
    colores = ['black', 'darkred', 'forestgreen', 'darkviolet', 'darkorange', 'royalblue']
    marcadores = ['solid', (0, (3, 1, 1, 1)), 'dotted', (0, (5, 1)), 'dashed', (0, (5, 10)), (5, (10, 3)), (0, (3, 5, 1, 5))]
    # Groups take the LAST len(listas_modelos) colours of the palette.
    desplazamiento = len(colores) - len(listas_modelos)
    leyenda = []

    plt.figure(figsize=(16, 9))
    plt.title(titulo)
    for i, lista in enumerate(listas_modelos):
        color = colores[i + desplazamiento]
        for diccionario in lista:
            data = diccionario['HistorialPerdida']
            if np.array(data).max() > error_max:
                # Skip runs whose loss exceeded the limit so they do not
                # dominate the y axis.
                continue
            relleno = [None] * (epocas - len(data))
            plt.plot(np.arange(epocas), data + relleno, color=color, linestyle=marcadores[i], linewidth=1, alpha=opacidad)
        leyenda.append(mpatches.Patch(color=color, label=nombres_labels[i]))

    plt.figtext(0.13, 0.03, f'{desc}')
    plt.legend(handles=leyenda)
    plt.xlabel('Época')
    plt.ylabel('Cross Entropy Loss')
    plt.grid(True)
    plt.savefig(filename)
    plt.close()

def promediar_resultados(listas_diccionarios, valores_posibles):
    """Print the mean final F1 of every group and the best-scoring setting.

    Args:
        listas_diccionarios: list of groups; each group is a list of dicts
            holding a ``'HistorialF1'`` list. Only the last F1 of each history
            is used. All groups must have the same number of entries.
        valores_posibles: the setting tried in each group, indexed like the
            groups.
    """
    f1_finales = np.array([
        [diccionario['HistorialF1'][-1] for diccionario in lista]
        for lista in listas_diccionarios
    ])
    avgs = f1_finales.mean(axis=1)
    print(avgs)
    mejor = avgs.argmax()
    print(f'Mejor valor {avgs.max()} -> {valores_posibles[mejor]}')

<h2>Experimentación</h2>

In [None]:
# Batch-size experiment for the feed-forward network: every batch size in
# `batchs` is evaluated with stratified k-fold cross-validation.
epocas = 200
lr = 0.001
batchs = [8, 16, 32, 64, 128]          # batch sizes compared in this experiment
arquitectura = [64, 64, 64, 64, 64]    # hidden layer widths
vec = 'TF-IDF'
features = 5_000                       # max_features of the TF-IDF vectorizer

k_folds = 8
version = '1.1.3'
n_exp = 0                              # counts finished batch-size runs

# NOTE: these names are module-level globals; later cells read some of them
# (e.g. filename_train, filename_test, filename_grafica, listas_diccionarios).
filename_train = './dataset_humor_train.json'
filename_test = './dataset_humor_test.json'
filename_grafica = f'./Resultados/Graficas/Shaw_v{version}'
filename_modelo = f'./Resultados/Modelos/Shaw_v{version}'
filename_prediccion =  f'./Resultados/Predicciones/Shaw_v{version}'
filename_temp = f'./Resultados/Temp/Shaw_v{version}'

diccionarios_k_folds = []   # results of the folds of the batch size being run
listas_diccionarios = []    # one list of fold results per batch size
Y_pred = []

skf = StratifiedKFold(n_splits=k_folds)
X, Y = leer_datos(filename_train)
X_test_original, _ = leer_datos(filename_test)

for batch in batchs:
    for k, (index_train, index_test) in enumerate(skf.split(X, Y), start=1):                                                            # type: ignore
        X_train, Y_train = X[index_train], Y[index_train]                                                                               # type: ignore
        X_val, Y_val = X[index_test], Y[index_test]                                                                                     # type: ignore
        # The TF-IDF vocabulary is fitted on the training fold only (first list element).
        Xs = vectorizar_TF_IDF([X_train, X_val, X_test_original], max_features=features)                                                 # type: ignore
        # Oversampling is applied to the training fold only.
        X_smote, Y_smote = balancear(Xs[0], Y_train)                                                                                    # type: ignore
        # Only the training labels (first label array) are one-hot encoded; validation labels stay as class indices.
        Xs_torch, Ys_torch = torchificar([X_smote, Xs[1], Xs[2]], [Y_smote, Y_val], vec, one_hot_encoding=True)                         # type: ignore
        X_train = Xs_torch[0]
        Y_train = Ys_torch[0]
        X_val = Xs_torch[1]
        Y_val = Ys_torch[1]
        X_test = Xs_torch[2]
        del Xs_torch
        del Ys_torch
        # A fresh model and optimizer are created for every fold.
        modelo = FeedForwardNeuralNetwork(X_train.shape[1], arquitectura, NUM_CLASES)
        funcion_perdida = nn.CrossEntropyLoss()
        optimizador = optim.Adam(modelo.parameters(), lr=lr)
        caracteristicas = {
                        'Version':version,
                        'Vectorizacion':vec,
                        'Entradas':X_train.shape[1],
                        'Arquitectura':arquitectura,
                        'Salidas':NUM_CLASES,
                        'LearningRate':lr,
                        'BatchSize':batch,
                        'CapasOcultas':len(arquitectura),
                        'K-Folds':k_folds,
                    }
        historial_perdida, historial_f1 = entrenar(
            X_train,
            Y_train,
            modelo,
            optimizador=optimizador,
            funcion_perdida=funcion_perdida,
            epocas=epocas,
            batch_size=batch,
            error_minimo=0.0001,
            rango_epocas=15,
            umbral_mejora=0.000001,
            X_val=X_val,
            Y_val=Y_val
            )
        print(f'F1: {historial_f1[-1]}')
        # Store the fold number and the training histories next to the settings.
        caracteristicas.update({
                        'K':k,
                        'HistorialPerdida':historial_perdida,
                        'HistorialF1':historial_f1,
                    })
        diccionarios_k_folds.append(caracteristicas)
    # Close the group of this batch size and start a new one.
    listas_diccionarios.append(diccionarios_k_folds)
    diccionarios_k_folds = []
    n_exp += 1

In [None]:
# Draw every fold's loss curve grouped by batch size, then print the mean final
# F1 per batch size together with the best one.
graficar_conjunto(
    listas_diccionarios,
    batchs,
    'Shaw v1 por batch size',
    f'{filename_grafica}_batchs.png',
    epocas,
)
promediar_resultados(listas_diccionarios=listas_diccionarios, valores_posibles=batchs)

<h2>Predicción</h2>

In [None]:
# Final training on the whole training set (no validation) followed by the
# prediction of the test set.
# NOTE: filename_train and filename_test are not assigned in this cell; they
# must already exist in the kernel (they are assigned in the experimentation
# cells).
epocas = 1000
lr = 0.001
batch = 32
arquitectura = [64, 64, 64]    # hidden layer widths
vec = 'TF-IDF'
features = 5_000               # max_features of the TF-IDF vectorizer

version = '1.0.1'

filename_grafica = f'./Resultados/Graficas/Shaw_v{version}.png'
filename_modelo = f'./Resultados/Modelos/Shaw_v{version}.pth'
filename_prediccion =  f'./Resultados/Predicciones/Shaw_v{version}.csv'

X, Y = leer_datos(filename_train)
X_test_original, _ = leer_datos(filename_test)
# The TF-IDF vocabulary is fitted on the training texts only (first list element).
Xs = vectorizar_TF_IDF([X, X_test_original], max_features=features)                                                                      # type: ignore
# Oversampling is applied to the training data only.
X_smote, Y_smote = balancear(Xs[0], Y)                                                                                        # type: ignore
Xs_torch, Ys_torch = torchificar([X_smote, Xs[1]], [Y_smote], vec, one_hot_encoding=True)                                   # type: ignore
X_train = Xs_torch[0]
Y_train = Ys_torch[0]
X_test = Xs_torch[1]
del Xs_torch
del Ys_torch
modelo = FeedForwardNeuralNetwork(X_train.shape[1], arquitectura, NUM_CLASES)
funcion_perdida = nn.CrossEntropyLoss()
optimizador = optim.Adam(modelo.parameters(), lr=lr)
caracteristicas = {
                'Version':version,
                'Vectorizacion':vec,
                'Entradas':X_train.shape[1],
                'Arquitectura':arquitectura,
                'Salidas':NUM_CLASES,
                'LearningRate':lr,
                'BatchSize':batch,
                'CapasOcultas':len(arquitectura)
            }
# evaluar=False: there is no validation set here, so no F1 history is produced.
historial_perdida, _ = entrenar(
    X_train,
    Y_train,
    modelo,
    evaluar=False,
    optimizador=optimizador,
    funcion_perdida=funcion_perdida,
    epocas=epocas,
    batch_size=batch,
    error_minimo=0.0001,
    rango_epocas=15,
    umbral_mejora=0.000001
    )
# With pred=True the labels argument is not used, hence the empty list.
Y_pred = test(X_test, [], modelo, pred=True) # type: ignore
caracteristicas.update({
                'HistorialPerdida':historial_perdida,
                'Y_pred':Y_pred.cpu().numpy().tolist() # type: ignore
            })
# Save the weights, append the model description to the log and write the predictions CSV.
guardar_modelo(
    modelo.state_dict(), 
    caracteristicas, 
    filename_modelo=filename_modelo, 
    filename_caracteristicas='./Resultados/DiccionarioModelos.txt'
    )
guardar_predicciones(Y_pred, filename_prediccion)   # type: ignore

<h1>Arquitectura convolucional</h1>

In [None]:
class ConvNeuralNetwork(nn.Module):
    """One 1-D convolution followed by five hidden dense layers and a linear output.

    Args:
        input_size: number of input features (length of the 1-D signal).
        hidden_sizes: widths of the dense layers; exactly the first five
            entries are used.
        output_size: number of output units (one per class).
        batch: number of output channels of the convolution.

    The dense layers use Xavier-uniform weights and zero biases; the
    convolution keeps PyTorch's default initialisation. The output is raw
    logits.
    """

    def __init__(self, input_size: int, hidden_sizes: list[int], output_size: int, batch):
        super().__init__()
        self.drop_input = nn.Dropout(0.2)
        self.drop_hidden = nn.Dropout()
        self.relu = nn.ReLU()

        kernel_size, stride, padding, dilation = 3, 1, 0, 1
        self.conv1 = nn.Conv1d(1, out_channels=batch, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation)
        # Output length of the convolution (standard Conv1d length formula).
        alcance_kernel = dilation * (kernel_size - 1) + 1
        L1 = (input_size + 2 * padding - alcance_kernel) // stride + 1

        # Widths of the dense stack: flattened convolution output, the five
        # hidden layers and the output layer.
        anchos = [batch * L1] + [hidden_sizes[i] for i in range(5)] + [output_size]
        self.fcl1 = nn.Linear(anchos[0], anchos[1])
        self.fcl2 = nn.Linear(anchos[1], anchos[2])
        self.fcl3 = nn.Linear(anchos[2], anchos[3])
        self.fcl4 = nn.Linear(anchos[3], anchos[4])
        self.fcl5 = nn.Linear(anchos[4], anchos[5])
        self.output = nn.Linear(anchos[5], anchos[6])

        for capa in (self.fcl1, self.fcl2, self.fcl3, self.fcl4, self.fcl5, self.output):
            nn.init.xavier_uniform_(capa.weight)
            nn.init.zeros_(capa.bias)

    def forward(self, X):
        """Return the logits for a batch ``X`` of shape (samples, input_size)."""
        # Add the channel dimension expected by Conv1d.
        x = self.relu(self.conv1(X.unsqueeze(1)))
        x = x.view(x.size(0), -1)  # flatten channels and positions
        x = self.relu(self.fcl1(self.drop_input(x)))
        for capa in (self.fcl2, self.fcl3, self.fcl4, self.fcl5):
            x = self.relu(capa(self.drop_hidden(x)))
        # No activation or dropout on the output layer.
        return self.output(x)

<h2>Experimentación</h2>

In [None]:
# Cross-validated experiment for the convolutional network, one run per batch
# size in `batchs`.
epocas = 200
lr = 0.001
batchs = [128]
arquitectura = [64, 64, 64, 64, 64]    # hidden layer widths (the network uses five)
vec = 'TF-IDF'
features = 5_000                       # max_features of the TF-IDF vectorizer

k_folds = 8
version = '2'
n_exp = 0                              # counts finished batch-size runs

filename_train = './dataset_humor_train.json'
filename_test = './dataset_humor_test.json'
filename_grafica = f'./Resultados/Graficas/Shaw_v{version}'
filename_modelo = f'./Resultados/Modelos/Shaw_v{version}'
filename_prediccion =  f'./Resultados/Predicciones/Shaw_v{version}'
filename_temp = f'./Resultados/Temp/Shaw_v{version}'

diccionarios_k_folds = []   # results of the folds of the batch size being run
listas_diccionarios = []    # one list of fold results per batch size
Y_pred = []

skf = StratifiedKFold(n_splits=k_folds)
X, Y = leer_datos(filename_train)
X_test_original, _ = leer_datos(filename_test)

for batch in batchs:
    for k, (index_train, index_test) in enumerate(skf.split(X, Y), start=1):  # type: ignore
        X_train, Y_train = X[index_train], Y[index_train]  # type: ignore
        X_val, Y_val = X[index_test], Y[index_test]  # type: ignore
        # The TF-IDF vocabulary is fitted on the training fold only (first list element).
        Xs = vectorizar_TF_IDF([X_train, X_val, X_test_original], max_features=features)  # type: ignore
        # Oversampling is applied to the training fold only.
        X_smote, Y_smote = balancear(Xs[0], Y_train)  # type: ignore
        # Only the training labels (first label array) are one-hot encoded.
        Xs_torch, Ys_torch = torchificar([X_smote, Xs[1], Xs[2]], [Y_smote, Y_val], vec, one_hot_encoding=True)  # type: ignore
        X_train = Xs_torch[0]
        Y_train = Ys_torch[0]
        X_val = Xs_torch[1]
        Y_val = Ys_torch[1]
        X_test = Xs_torch[2]
        del Xs_torch
        del Ys_torch
        # ConvNeuralNetwork requires a fourth argument, `batch` (used as the
        # number of convolution channels); omitting it raised a TypeError.
        modelo = ConvNeuralNetwork(X_train.shape[1], arquitectura, NUM_CLASES, batch=batch)
        funcion_perdida = nn.CrossEntropyLoss()
        optimizador = optim.Adam(modelo.parameters(), lr=lr)
        caracteristicas = {
                        'Version':version,
                        'Vectorizacion':vec,
                        'Entradas':X_train.shape[1],
                        'Arquitectura':arquitectura,
                        'Salidas':NUM_CLASES,
                        'LearningRate':lr,
                        'BatchSize':batch,
                        'CapasOcultas':len(arquitectura),
                        'K-Folds':k_folds,
                    }
        historial_perdida, historial_f1 = entrenar(
            X_train,
            Y_train,
            modelo,
            optimizador=optimizador,
            funcion_perdida=funcion_perdida,
            epocas=epocas,
            batch_size=batch,
            error_minimo=0.0001,
            rango_epocas=15,
            umbral_mejora=0.000001,
            X_val=X_val,
            Y_val=Y_val
            )
        print(f'F1: {historial_f1[-1]}')
        # Store the fold number and the training histories next to the settings.
        caracteristicas.update({
                        'K':k,
                        'HistorialPerdida':historial_perdida,
                        'HistorialF1':historial_f1,
                    })
        diccionarios_k_folds.append(caracteristicas)
    # Close the group of this batch size and start a new one.
    listas_diccionarios.append(diccionarios_k_folds)
    diccionarios_k_folds = []
    n_exp += 1

<h1>Arquitectura de fusión simple</h1>

In [5]:
class FusedNeuralNetwork(nn.Module):
    """Two-branch fully connected classifier that fuses TF-IDF and embedding features.

    Each input representation goes through its own linear layer + ReLU. The two
    branch outputs are concatenated into a vector of width ``hidden_sizes[0]``
    and fed to a stack of ``Dropout -> Linear -> ReLU`` blocks, followed by a
    final linear layer that returns raw (un-activated) scores.

    Args:
        input_size_tfidf: Number of features of the TF-IDF input.
        input_size_embeddings: Number of features of the embeddings input.
        hidden_sizes: Width of every hidden layer. The first entry is the width
            of the fused (concatenated) representation and is split between the
            two branches; when it is odd the embeddings branch gets the extra unit.
        output_size: Number of output scores (classes).

    Raises:
        ValueError: If ``hidden_sizes`` is empty or its first entry is smaller
            than 2 (each branch needs at least one unit).
    """

    def __init__(self, input_size_tfidf: int, input_size_embeddings: int, hidden_sizes: list[int], output_size: int):
        super().__init__()
        if not hidden_sizes:
            raise ValueError("hidden_sizes must contain at least one layer size")
        if hidden_sizes[0] < 2:
            raise ValueError("hidden_sizes[0] must be >= 2 so that each input branch gets at least one unit")

        self.fcl = nn.ModuleList()
        self.act = nn.ModuleList()
        self.drop = nn.ModuleList()
        self.drop.append(nn.Dropout(0.2))  # applied to the fused branch outputs

        # Two input layers, one per representation. Their widths must add up to
        # hidden_sizes[0] exactly, otherwise fcl[0] receives the wrong number of
        # features when hidden_sizes[0] is odd.
        tfidf_width = hidden_sizes[0] // 2
        embeddings_width = hidden_sizes[0] - tfidf_width
        # These two layers keep PyTorch's default initialization (no explicit init below).
        self.fcl_tfidf = nn.Linear(input_size_tfidf, tfidf_width)
        self.fcl_embeddings = nn.Linear(input_size_embeddings, embeddings_width)

        # Layers are built from consecutive pairs of sizes, so there is one
        # layer fewer than entries in `neuronas`.
        neuronas = list(hidden_sizes) + [output_size]
        for i in range(len(neuronas) - 1):
            self.fcl.append(nn.Linear(neuronas[i], neuronas[i + 1]))
            nn.init.xavier_uniform_(self.fcl[i].weight)                                                                                                 # type: ignore
            nn.init.zeros_(self.fcl[i].bias)                                                                                                            # type: ignore
        # The last layer is not activated (the notebook trains with CrossEntropyLoss),
        # so one activation fewer than layers is needed.
        # Note: this leaves one more Dropout than activations; the last Dropout
        # is never applied in forward().
        for i in range(len(neuronas) - 2):
            self.act.append(nn.ReLU())
            self.drop.append(nn.Dropout())

    def forward(self, X_tfidf, X_embeddings):
        """Return the un-activated output scores for a batch of paired inputs.

        Args:
            X_tfidf: 2-D batch of TF-IDF features (rows are concatenated along dim=1).
            X_embeddings: 2-D batch of embedding features with the same number of rows.
        """
        # torch.relu instead of self.act[0]: self.act is empty when there is a
        # single hidden layer, and ReLU is stateless so the result is identical.
        x_tfidf = torch.relu(self.fcl_tfidf(X_tfidf))
        x_embeddings = torch.relu(self.fcl_embeddings(X_embeddings))
        x = torch.cat((x_tfidf, x_embeddings), dim=1)
        # zip stops at the shortest list (self.act), which deliberately leaves
        # the last linear layer out of this loop.
        for drop, fcl, act in zip(self.drop, self.fcl, self.act):
            x = act(fcl(drop(x)))

        x = self.fcl[-1](x)     # no activation and no dropout on the last layer
        return x

<h2>Experimentación</h2>

In [11]:
# Cross-validated architecture search for FusedNeuralNetwork.
# Every candidate in `arquitecturas` is trained once per stratified fold; the
# per-fold result dictionaries are collected in `listas_diccionarios`
# (one inner list per architecture), which the plotting cell reads afterwards.

# --- Training hyperparameters ---
epocas = 500    # forwarded as `epocas` to entrenar_fused (the logged runs stop earlier)
lr = 0.001      # learning rate for Adam
batch = 128     # forwarded as `batch_size` to entrenar_fused
# Candidate hidden-layer layouts; they differ only in the fourth entry (256, 128, 64, 32, 16).
arquitecturas = [[256, 64, 256, 256, 32], [256, 64, 256, 128, 32], [256, 64, 256, 64, 32], [256, 64, 256, 32, 32], [256, 64, 256, 16, 32]]
# Vectorization tags: passed to leer_datos_embeddings / torchificar and stored with the results.
vec_tfidf = 'TF-IDF'
vec_we = 'Embeddings'
variantes = 'GEN'   # not read anywhere in this cell
features = 300      # passed as max_features to vectorizar_TF_IDF

# --- Experiment bookkeeping ---
k_folds = 10
version = '3.6'
n_exp = 0           # incremented once per architecture; not read in this cell

# --- Input / output paths (only filename_train is read in this cell) ---
filename_train = './dataset_humor_train_embeddings.json'
filename_grafica = f'./Resultados/Graficas/Shaw_v{version}'
filename_modelo = f'./Resultados/Modelos/Shaw_v{version}'
filename_prediccion =  f'./Resultados/Predicciones/Shaw_v{version}'
filename_temp = f'./Resultados/Temp/Shaw_v{version}'

diccionarios_k_folds = []   # results of the folds of the architecture currently being evaluated
listas_diccionarios = []    # one list of fold results per architecture
Y_pred = []                 # not used in this cell

# No shuffle/random_state is passed, so the fold assignment is deterministic.
skf = StratifiedKFold(n_splits=k_folds)
# Both representations are read from the same file; the labels of the second read are discarded.
# NOTE(review): the code below indexes both with the same fold indices, so it assumes
# the two helpers return rows in the same order - confirm against their definitions.
X_we, Y = leer_datos_embeddings(filename_train, vec_we)
X_tfidf, _ = leer_datos(filename_train)

for arquitectura in arquitecturas:
    # k is 1-based; the held-out part of each fold is used as the validation set.
    for k, (index_train, index_test) in enumerate(skf.split(X_tfidf, Y), start=1):                      # type: ignore
        X_train_tfidf, X_train_we, Y_train = X_tfidf[index_train], X_we[index_train], Y[index_train]
        X_val_tfidf, X_val_we,  Y_val = X_tfidf[index_test], X_we[index_test],  Y[index_test]
        # TF-IDF vectorization of the train/validation texts of this fold.
        Xs_tfidf = vectorizar_TF_IDF([X_train_tfidf, X_val_tfidf], max_features=features)
        # Class balancing of the training part only (helper defined elsewhere; presumably
        # SMOTE oversampling, given the *_smote names and the SMOTE import - confirm).
        # NOTE(review): the two representations are balanced by independent calls, so any
        # synthetic rows in X_smote_tfidf and X_smote_we are not guaranteed to describe the
        # same synthetic example. Confirm that balancear yields row-aligned results, or
        # balance the concatenated features once and split them afterwards.
        X_smote_tfidf, Y_smote = balancear(Xs_tfidf[0], Y_train)                                              # type: ignore
        X_smote_we, _  = balancear(X_train_we, Y_train)                                                         # type: ignore
        # Conversion to torch tensors; labels are only kept from the first call
        # (the second call receives Y_smote again but its label output is discarded).
        Xs_torch_tfidf, Ys_torch = torchificar([X_smote_tfidf, Xs_tfidf[1]], [Y_smote, Y_val], vec_tfidf, one_hot_encoding=True)
        Xs_torch_we, _ = torchificar([X_smote_we, X_val_we], [Y_smote], vec_we, one_hot_encoding=True)
        # From here on the X_*/Y_* names are rebound to the converted (balanced) data.
        X_train_tfidf, X_val_tfidf = tuple(Xs_torch_tfidf)
        X_train_we, X_val_we = tuple(Xs_torch_we)
        Y_train, Y_val = tuple(Ys_torch)
        # Drop the temporary containers; the tensors stay referenced by the names above.
        del Xs_torch_tfidf
        del Xs_torch_we
        del Ys_torch
        # A fresh model and optimizer for every fold, sized from the actual feature counts.
        modelo = FusedNeuralNetwork(X_train_tfidf.shape[1], X_train_we.shape[1], arquitectura, NUM_CLASES)
        funcion_perdida = nn.CrossEntropyLoss()
        optimizador = optim.Adam(modelo.parameters(), lr=lr)
        # Configuration record stored alongside the training histories of this fold.
        caracteristicas = {
                        'Version':version,
                        'Vectorizacion':(vec_tfidf, vec_we),
                        'Entradas':(X_train_tfidf.shape[1], X_train_we.shape[1]),
                        'Arquitectura':arquitectura,
                        'Salidas':NUM_CLASES,
                        'LearningRate':lr,
                        'BatchSize':batch,
                        'CapasOcultas':len(arquitectura) + 1,
                        'K-Folds':k_folds,
                    }
        # error_minimo / rango_epocas / umbral_mejora are presumably early-stopping controls
        # (the runs log "Paro por falta de mejora") - see entrenar_fused for their exact meaning.
        historial_perdida, historial_f1 = entrenar_fused(
            X_train_tfidf,
            X_train_we,
            Y_train,
            modelo,
            optimizador=optimizador,
            funcion_perdida=funcion_perdida,
            epocas=epocas,
            batch_size=batch,
            error_minimo=0.0001,
            rango_epocas=15,
            umbral_mejora=0.000001,
            X_val_tfidf=X_val_tfidf,
            X_val_we=X_val_we,
            Y_val=Y_val
            )
        # Report the last recorded F1 value of this fold.
        print(f'F1: {historial_f1[-1]}')
        caracteristicas.update({
                        'K':k,
                        'HistorialPerdida':historial_perdida,
                        'HistorialF1':historial_f1,
                    })
        diccionarios_k_folds.append(caracteristicas)
    # All folds of this architecture are done: archive them and start a new list.
    listas_diccionarios.append(diccionarios_k_folds)
    diccionarios_k_folds = []
    n_exp += 1

Entrenando:  32%|███▏      | 158/500 [00:40<01:27,  3.89it/s]

Paro por falta de mejora
F1: 0.790438984270587



Entrenando:  21%|██▏       | 107/500 [00:26<01:37,  4.01it/s]

Paro por falta de mejora
F1: 0.789612097304405



Entrenando:  32%|███▏      | 158/500 [00:38<01:23,  4.11it/s]

Paro por falta de mejora
F1: 0.7852290159982467



Entrenando:  28%|██▊       | 140/500 [00:33<01:25,  4.21it/s]

Paro por falta de mejora
F1: 0.788279531692207



Entrenando:  28%|██▊       | 138/500 [00:32<01:26,  4.21it/s]

Paro por falta de mejora
F1: 0.7860982265922145



Entrenando:  29%|██▉       | 146/500 [00:34<01:24,  4.18it/s]

Paro por falta de mejora
F1: 0.7708550999665948



Entrenando:  31%|███       | 155/500 [00:37<01:22,  4.19it/s]

Paro por falta de mejora
F1: 0.7957318712035693



Entrenando:  26%|██▌       | 131/500 [00:31<01:28,  4.18it/s]

Paro por falta de mejora
F1: 0.7625695871309905



Entrenando:  31%|███       | 155/500 [00:36<01:22,  4.20it/s]

Paro por falta de mejora
F1: 0.8045751633986928



Entrenando:  30%|███       | 150/500 [00:35<01:23,  4.21it/s]

Paro por falta de mejora
F1: 0.7686357535251221



Entrenando:  28%|██▊       | 139/500 [00:33<01:26,  4.19it/s]

Paro por falta de mejora
F1: 0.7844633314017126



Entrenando:  25%|██▍       | 123/500 [00:29<01:29,  4.20it/s]

Paro por falta de mejora
F1: 0.7652824707378563



Entrenando:  25%|██▍       | 124/500 [00:29<01:29,  4.22it/s]

Paro por falta de mejora
F1: 0.785736723795381



Entrenando:  26%|██▌       | 130/500 [00:30<01:27,  4.23it/s]

Paro por falta de mejora
F1: 0.7693221559506194



Entrenando:  25%|██▌       | 125/500 [00:29<01:29,  4.18it/s]

Paro por falta de mejora
F1: 0.7750576775185849



Entrenando:  29%|██▉       | 144/500 [00:34<01:24,  4.22it/s]

Paro por falta de mejora
F1: 0.7991114545103342



Entrenando:  26%|██▌       | 129/500 [00:30<01:27,  4.23it/s]

Paro por falta de mejora
F1: 0.791814922376018



Entrenando:  33%|███▎      | 163/500 [00:38<01:19,  4.22it/s]

Paro por falta de mejora
F1: 0.7665858694676362



Entrenando:  35%|███▌      | 175/500 [00:41<01:16,  4.23it/s]

Paro por falta de mejora
F1: 0.78250993344054



Entrenando:  23%|██▎       | 114/500 [00:26<01:31,  4.24it/s]

Paro por falta de mejora
F1: 0.7629489555149402



Entrenando:  25%|██▍       | 124/500 [00:29<01:29,  4.22it/s]

Paro por falta de mejora
F1: 0.7930400010835602



Entrenando:  28%|██▊       | 142/500 [00:33<01:24,  4.23it/s]

Paro por falta de mejora
F1: 0.7793277310924369



Entrenando:  33%|███▎      | 164/500 [00:38<01:19,  4.23it/s]

Paro por falta de mejora
F1: 0.7812950951965102



Entrenando:  21%|██▏       | 107/500 [00:25<01:33,  4.22it/s]

Paro por falta de mejora
F1: 0.7554498027738898



Entrenando:  29%|██▉       | 144/500 [00:34<01:24,  4.23it/s]

Paro por falta de mejora
F1: 0.7874580761885199



Entrenando:  28%|██▊       | 138/500 [00:32<01:24,  4.26it/s]

Paro por falta de mejora
F1: 0.7828801219373773



Entrenando:  26%|██▌       | 130/500 [00:30<01:27,  4.25it/s]

Paro por falta de mejora
F1: 0.7963781920795094



Entrenando:  23%|██▎       | 114/500 [00:26<01:31,  4.22it/s]

Paro por falta de mejora
F1: 0.7596925621442818



Entrenando:  24%|██▎       | 118/500 [00:27<01:30,  4.22it/s]

Paro por falta de mejora
F1: 0.7887057602685562



Entrenando:  24%|██▍       | 121/500 [00:28<01:29,  4.22it/s]

Paro por falta de mejora
F1: 0.7627615031972744



Entrenando:  24%|██▍       | 122/500 [00:28<01:29,  4.22it/s]

Paro por falta de mejora
F1: 0.7935847298977997



Entrenando:  18%|█▊        | 90/500 [00:21<01:37,  4.22it/s]

Paro por falta de mejora
F1: 0.7969486514378156



Entrenando:  30%|██▉       | 148/500 [00:34<01:22,  4.24it/s]

Paro por falta de mejora
F1: 0.7754703402899146



Entrenando:  28%|██▊       | 139/500 [00:32<01:25,  4.24it/s]

Paro por falta de mejora
F1: 0.7680724198510795



Entrenando:  27%|██▋       | 135/500 [00:32<01:26,  4.22it/s]

Paro por falta de mejora
F1: 0.7803163077077522



Entrenando:  30%|██▉       | 149/500 [00:35<01:22,  4.23it/s]

Paro por falta de mejora
F1: 0.8022835799197057



Entrenando:  31%|███       | 154/500 [00:36<01:21,  4.24it/s]

Paro por falta de mejora
F1: 0.7935816803741331



Entrenando:  28%|██▊       | 138/500 [00:32<01:25,  4.23it/s]

Paro por falta de mejora
F1: 0.7718529807844705



Entrenando:  25%|██▌       | 126/500 [00:29<01:28,  4.24it/s]

Paro por falta de mejora
F1: 0.7832005990061943



Entrenando:  25%|██▌       | 125/500 [00:29<01:28,  4.24it/s]

Paro por falta de mejora
F1: 0.7730230106570473



Entrenando:  26%|██▌       | 128/500 [00:30<01:27,  4.26it/s]

Paro por falta de mejora
F1: 0.800038454143434



Entrenando:  26%|██▌       | 131/500 [00:30<01:26,  4.25it/s]

Paro por falta de mejora
F1: 0.7637224298745475



Entrenando:  28%|██▊       | 140/500 [00:33<01:25,  4.22it/s]

Paro por falta de mejora
F1: 0.7846526927169879



Entrenando:  30%|███       | 151/500 [00:35<01:21,  4.26it/s]

Paro por falta de mejora
F1: 0.7721782540365206



Entrenando:  19%|█▉        | 94/500 [00:22<01:36,  4.20it/s]

Paro por falta de mejora
F1: 0.780398084039964



Entrenando:  25%|██▍       | 123/500 [00:29<01:28,  4.24it/s]

Paro por falta de mejora
F1: 0.7873606730609617



Entrenando:  29%|██▉       | 145/500 [00:34<01:23,  4.25it/s]

Paro por falta de mejora
F1: 0.7844788650070406



Entrenando:  27%|██▋       | 136/500 [00:32<01:25,  4.25it/s]

Paro por falta de mejora
F1: 0.7590875755506663



Entrenando:  33%|███▎      | 164/500 [00:37<01:17,  4.34it/s]

Paro por falta de mejora
F1: 0.7906481551728498



Entrenando:  29%|██▉       | 146/500 [00:33<01:20,  4.38it/s]

Paro por falta de mejora
F1: 0.7695359194998704





In [10]:
# Visual and numeric summary of the architecture search stored in
# listas_diccionarios (helpers are defined elsewhere in the notebook).
graficar_conjunto(
    listas_modelos=listas_diccionarios,
    nombres_labels=arquitecturas,
    titulo='Shaw v3 tercera capa',
    filename=f'{filename_grafica}_tercera_capa.png',
    epocas=epocas,
)
# Kept as the last expression of the cell, as in the original.
promediar_resultados(listas_diccionarios, arquitecturas)

[0.79026219 0.78320559 0.78204189 0.78426785]
Mejor valor 0.7902621946125264 -> [256, 64, 256, 64, 32]


<h2>Predicción</h2>

In [None]:
# Final training + prediction run for FusedNeuralNetwork.
# Trains one model on the whole (balanced) training set without a validation
# split, predicts the test set, and stores the model, its configuration and
# the predictions through the guardar_* helpers.

# --- Training hyperparameters ---
epocas = 500    # forwarded as `epocas` to entrenar_fused
lr = 0.001      # learning rate for Adam
batch = 128     # forwarded as `batch_size` to entrenar_fused
arquitectura = [256, 64, 256, 64, 32]   # the layout reported as "Mejor valor" in the output above
# Vectorization tags: passed to leer_datos_embeddings / torchificar and stored with the model record.
vec_tfidf = 'TF-IDF'
vec_we = 'Embeddings'
variantes = 'GEN'   # not read anywhere in this cell
features = 300      # passed as max_features to vectorizar_TF_IDF

# NOTE(review): the experimentation cell uses version '3.6' while this one uses '3.5';
# confirm which version tag the saved artifacts should carry.
version = '3.5'
intento = 4     # attempt number, embedded in the output file names

# --- Input / output paths ---
filename_test = './dataset_humor_test_embeddings.json'
filename_train = './dataset_humor_train_embeddings.json'
filename_grafica = f'./Resultados/Graficas/Shaw_v{version}_t{intento}'
filename_modelo = f'./Resultados/Modelos/Shaw_v{version}_t{intento}.pth'
filename_prediccion =  f'./Resultados/Predicciones/Shaw_v{version}_t{intento}.csv'
filename_temp = f'./Resultados/Temp/Shaw_v{version}'

# Both representations are read for train and test; only the training labels are kept.
X_train_we, Y_train = leer_datos_embeddings(filename_train, vec_we)
X_train_tfidf, _ = leer_datos(filename_train)
X_test_we_original, _ = leer_datos_embeddings(filename_test, vec_we)
X_test_tfidf_original, _ = leer_datos(filename_test)

# TF-IDF vectorization of the train and test texts.
Xs_tfidf = vectorizar_TF_IDF([X_train_tfidf, X_test_tfidf_original], max_features=features)
# Class balancing of the training data (helper defined elsewhere; presumably SMOTE - confirm).
# NOTE(review): the two representations are balanced by independent calls, so any synthetic
# rows of X_smote_tfidf and X_smote_we are not guaranteed to describe the same synthetic
# example. Confirm that balancear yields row-aligned results.
X_smote_tfidf, Y_smote = balancear(Xs_tfidf[0], Y_train)                                              # type: ignore
X_smote_we, _  = balancear(X_train_we, Y_train)                                                       # type: ignore

# Conversion to torch tensors; labels are only kept from the first call.
Xs_torch_tfidf, Ys_torch = torchificar([X_smote_tfidf, Xs_tfidf[1]], [Y_smote], vec_tfidf, one_hot_encoding=True)
Xs_torch_we, _ = torchificar([X_smote_we, X_test_we_original], [Y_smote], vec_we, one_hot_encoding=True)
# From here on the X_train_* / Y_train names are rebound to the converted (balanced) data.
X_train_tfidf, X_test_tfidf = tuple(Xs_torch_tfidf)
X_train_we, X_test_we = tuple(Xs_torch_we)
Y_train = Ys_torch[0]
# Drop the temporary containers; the tensors stay referenced by the names above.
del Xs_torch_tfidf
del Xs_torch_we
del Ys_torch
modelo = FusedNeuralNetwork(X_train_tfidf.shape[1], X_train_we.shape[1], arquitectura, NUM_CLASES)
funcion_perdida = nn.CrossEntropyLoss()
optimizador = optim.Adam(modelo.parameters(), lr=lr)
# Configuration record that is saved next to the model weights.
caracteristicas = {
                'Version':version,
                'Vectorizacion':(vec_tfidf, vec_we),
                'Entradas':(X_train_tfidf.shape[1], X_train_we.shape[1]),
                'Arquitectura':arquitectura,
                'Salidas':NUM_CLASES,
                'LearningRate':lr,
                'BatchSize':batch,
                'CapasOcultas':len(arquitectura) + 1,
                'Random_seed':RANDOM_STATE
            }
# evaluar=False and no validation tensors are passed; the second return value is discarded.
historial_perdida, _ = entrenar_fused(
    X_train_tfidf,
    X_train_we,
    Y_train,
    modelo,
    optimizador=optimizador,
    funcion_perdida=funcion_perdida,
    epocas=epocas,
    batch_size=batch,
    error_minimo=0.0001,
    rango_epocas=15,
    umbral_mejora=0.000001,
    evaluar=False
    )
# An empty list is passed in the third position (presumably the labels slot - confirm in test_fused);
# pred=True requests the predictions, which are used below as a tensor.
Y_pred = test_fused(X_test_tfidf, X_test_we, [], modelo, pred=True)
caracteristicas.update({
                'HistorialPerdida':historial_perdida,
                'Y_pred':Y_pred.cpu().numpy().tolist() # type: ignore
            })
# Persist the weights, the configuration record and the predictions.
guardar_modelo(
    modelo.state_dict(), 
    caracteristicas, 
    filename_modelo=filename_modelo, 
    filename_caracteristicas='./Resultados/DiccionarioModelos.txt'
    )
guardar_predicciones(Y_pred, filename_prediccion)   # type: ignore

Entrenando:  28%|██▊       | 140/500 [00:35<01:30,  3.98it/s]

Paro por falta de mejora



