<a href="https://colab.research.google.com/github/mxag11z/EMO/blob/main/RegresionUsingDictionary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Z2: Mejoraremos la estrategia 2 usando lexicones o diccionarios de palabras que contengan un score de emocion por palabra, usaremos dos distintos y entrenaremos nuestros modelos de regresion en cada uno para ver cual da mejores resultados. Sin embargo para la estrategia 2 debemos tener el conjunto de entrenamiento en español para poder probarlo correctamente en el corpus de test que esta en español. Para el Diccionario NRC no hay problema, pues está en español ya directamente, por otro lado para el de Buecher solo hay en ingles, por lo que tocará traducirlo al español.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import nltk
import numpy as np
from sklearn import linear_model
import sklearn.metrics
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
def read_data2(emotion):
    """
    Lee los datos de train y test para una emoción
    """
    # Datos de entrenamiento
    with open(f"/content/drive/MyDrive/PLN project/data/en_train&dev_translatedEs/train/translated_trainToEs_{emotion}.txt", 'r', encoding='utf-8') as f:
        train_X = f.readlines()
    with open(f"/content/drive/MyDrive/PLN project/data/en/train/{emotion}_labels.txt", 'r', encoding='utf-8') as f:
        train_y = [float(line.strip()) for line in f.readlines()]

    # Datos de test
    with open(f"/content/drive/MyDrive/PLN project/data/es/test/{emotion}.txt", 'r', encoding='utf-8') as f:
        test_X = f.readlines()
    with open(f"/content/drive/MyDrive/PLN project/data/es/test/{emotion}_labels.txt", 'r', encoding='utf-8') as f:
        test_y = [float(line.strip()) for line in f.readlines()]

    return (train_X, train_y), (test_X, test_y)

In [5]:
def load_nrc_lexicon(file_path):
    """
    Carga el NRC lexicon español solo para joy, anger, sadness y fear
    """
    lexicon = {}
    valid_emotions = {'joy', 'anger', 'sadness', 'fear'}

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) >= 4:
                _, emotion, score, spanish_word = parts[:4]

                # Solo procesar las emociones que nos interesan
                if emotion.lower() in valid_emotions:
                    if emotion not in lexicon:
                        lexicon[emotion] = {}
                    lexicon[emotion][spanish_word] = float(score)
    return lexicon

In [8]:
import pandas as pd

def load_buechel_lexicon(file_path):
    """
    Carga el Buechel lexicon (formato TSV) solo para Joy, Anger, Sadness y Fear
    """
    lexicon = {}
    valid_emotions = {'Joy', 'Anger', 'Sadness', 'Fear'}

    # Leer el archivo TSV con pandas
    df = pd.read_csv(file_path, sep='\t')

    # Procesar cada emoción válida
    for emotion in valid_emotions:
        emotion_lower = emotion.lower()  # Convertir a minúsculas
        lexicon[emotion_lower] = {}

        # Para cada palabra, guardar su puntuación
        for _, row in df.iterrows():
            word = row['Word']
            try:
                score = float(row[emotion])
                lexicon[emotion_lower][word] = score
            except (ValueError, KeyError):
                continue

    return lexicon

In [7]:
def get_lexicon_features(text, lexicon, emotion):
    """
    Extrae características de un lexicon para un texto
    """
    words = text.lower().split()
    scores = [lexicon[emotion].get(word, 0) for word in words]

    return {
        'max_score': max(scores) if scores else 0,
        'mean_score': np.mean(scores) if scores else 0,
        'sum_score': sum(scores),
        'count_words': sum(1 for score in scores if score > 0)
    }

In [21]:
def train_emotion_regressor_comparison(emotion, alpha=100):
    """
    Compara diferentes combinaciones de lexicones
    """
    # Cargar lexicones
    nrc_lexicon = load_nrc_lexicon("/content/drive/MyDrive/PLN project/data/lexicons/NRC-spanish.txt")
    buechel_lexicon = load_buechel_lexicon("/content/drive/MyDrive/PLN project/data/lexicons/Buechel_spa_lex.tsv")
    ##print(nrc_lexicon);
    ##print(buechel_lexicon);
    # Cargar datos
    (train_X, train_y), (test_X, test_y) = read_data2(emotion)

    # BOW features (base)
    vectorizer = CountVectorizer(
        max_features=10000,
        ngram_range=(1,2),
        lowercase=True,
        strip_accents=None,
        binary=True
    )
    X_train_bow = vectorizer.fit_transform(train_X)
    X_test_bow = vectorizer.transform(test_X)

    # NRC lexicon features
    X_train_nrc = np.array([list(get_lexicon_features(text, nrc_lexicon, emotion).values())
                           for text in train_X])
    X_test_nrc = np.array([list(get_lexicon_features(text, nrc_lexicon, emotion).values())
                          for text in test_X])

    # Buechel lexicon features
    X_train_buechel = np.array([list(get_lexicon_features(text, buechel_lexicon, emotion).values())
                               for text in train_X])
    X_test_buechel = np.array([list(get_lexicon_features(text, buechel_lexicon, emotion).values())
                              for text in test_X])

    # Crear diferentes combinaciones de características
    combinations = {
        'BOW_only': (X_train_bow.toarray(), X_test_bow.toarray()),
        'BOW_NRC': (np.hstack((X_train_bow.toarray(), X_train_nrc)),
                   np.hstack((X_test_bow.toarray(), X_test_nrc))),
        'BOW_Buechel': (np.hstack((X_train_bow.toarray(), X_train_buechel)),
                       np.hstack((X_test_bow.toarray(), X_test_buechel))),
        'BOW_NRC_Buechel': (np.hstack((X_train_bow.toarray(), X_train_nrc, X_train_buechel)),
                           np.hstack((X_test_bow.toarray(), X_test_nrc, X_test_buechel)))
    }

    # Probar cada combinación
    results = {}
    for name, (X_train, X_test) in combinations.items():
        # Entrenar modelo
        model = linear_model.Ridge(alpha=alpha)
        model.fit(X_train, train_y)

        # Evaluar
        preds = model.predict(X_test)
        mae = sklearn.metrics.mean_absolute_error(test_y, preds)
        results[name] = mae

    return results

In [19]:
def compare_all_emotions():
    """
    Compara todas las combinaciones para cada emoción
    """
    emotions = ['joy', 'anger', 'sadness', 'fear']
    all_results = {}

    for emotion in emotions:
        print(f"\n-----Processing {emotion}")
        results = train_emotion_regressor_comparison(emotion)
        all_results[emotion] = results

        # Mostrar resultados para esta emoción
        print(f"\nResultados para {emotion}:")
        for method, mae in results.items():
            print(f"{method}: MAE = {mae:.4f}")

    return all_results

In [22]:
resultados_mejorados = compare_all_emotions()


-----Processing joy

Resultados para joy:
BOW_only: MAE = 0.2212
BOW_NRC: MAE = 0.2203
BOW_Buechel: MAE = 0.2185
BOW_NRC_Buechel: MAE = 0.2178

-----Processing anger

Resultados para anger:
BOW_only: MAE = 0.2254
BOW_NRC: MAE = 0.2231
BOW_Buechel: MAE = 0.2249
BOW_NRC_Buechel: MAE = 0.2232

-----Processing sadness

Resultados para sadness:
BOW_only: MAE = 0.2249
BOW_NRC: MAE = 0.2215
BOW_Buechel: MAE = 0.2254
BOW_NRC_Buechel: MAE = 0.2224

-----Processing fear

Resultados para fear:
BOW_only: MAE = 0.2147
BOW_NRC: MAE = 0.2130
BOW_Buechel: MAE = 0.2149
BOW_NRC_Buechel: MAE = 0.2137
