## IMPORTS

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import json
from collections import Counter
import ast
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import snowballstemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

from datetime import datetime
from openpyxl import load_workbook
from openpyxl.utils.exceptions import InvalidFileException

import random

from transformers import (
    BertTokenizer, 
    Trainer, 
    TrainingArguments, 
    BertForSequenceClassification, 
    TrainerCallback
)
import datasets
from datasets import Dataset
import pyarrow as pa
import pyarrow.dataset as ds

import torch
import torch.nn as nn
from torch.utils.data import DataLoader


In [None]:
# Download the NLTK 'stopwords' and 'punkt' resources used by the preprocessing functions.
print('Downlowing stopwords...')
nltk.download('stopwords')
nltk.download('punkt')

## FUNCIONES

Funciones que utilizamos para el preprocesamiento de los datos: remove_stopwords, stemming, stemming_turkish y clean

In [None]:
def remove_stopwords(text, language):
    """Remove the NLTK stopwords of ``language`` from ``text``.

    Args:
        text: String whose whitespace-separated words are filtered. The
            comparison against the stopword list is case-sensitive.
        language: One of 'english', 'spanish' or 'turkish'.

    Returns:
        The words that are not stopwords, joined by single spaces.

    Raises:
        ValueError: If ``language`` is not one of the supported languages.
    """
    supported_languages = ('english', 'spanish', 'turkish')
    if language not in supported_languages:
        # Fail explicitly: merely printing a message left the stopword set
        # unassigned and the function crashed later with an UnboundLocalError.
        raise ValueError(f'Language not supported: {language!r}')

    stop_words = set(stopwords.words(language))
    return ' '.join(word for word in text.split() if word not in stop_words)

def stemming(text, language):
    """Stem every token of ``text`` with NLTK's Snowball stemmer.

    Args:
        text: String to process; it is split with ``word_tokenize``.
        language: Language name passed to ``SnowballStemmer``.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    snowball = SnowballStemmer(language)
    return ' '.join(map(snowball.stem, word_tokenize(text)))

# Stemming function for Turkish
def stemming_turkish(text):
    """Stem every whitespace-separated word of ``text`` for Turkish.

    Uses the ``snowballstemmer`` package rather than NLTK's
    ``SnowballStemmer``.

    Args:
        text: String to process.

    Returns:
        The stemmed words joined by single spaces.
    """
    turkish_stemmer = snowballstemmer.stemmer('turkish')
    return ' '.join(turkish_stemmer.stemWord(token) for token in text.split())

# Text cleaning function
def clean(text, language):
    """Normalize a text: strip noise, lowercase, drop stopwords and stem.

    Args:
        text: Value to clean. Anything that is not a ``str`` is converted
            with ``str()`` first.
        language: NLTK stopword language name; 'turkish' selects the
            Turkish stemmer, any other value is passed on to ``stemming``.

    Returns:
        The cleaned text as a single space-separated string.
    """
    raw = text if isinstance(text, str) else str(text)

    without_urls = re.sub(r'https?://\S+|www\.\S+', '', raw)   # drop URLs
    without_hashtags = re.sub(r'#\w+', '', without_urls)       # drop hashtags
    # Replace runs of non-word characters and runs of digits with a space
    normalized = re.sub(r'\W+|\d+', ' ', without_hashtags).lower()

    # Stopword removal
    stop_words = set(stopwords.words(language))
    kept_tokens = [token for token in normalized.split() if token not in stop_words]
    filtered = ' '.join(kept_tokens)

    # Stemming
    if language == 'turkish':
        return stemming_turkish(filtered)
    return stemming(filtered, language)


In [None]:
def load_data(path, task, language):
    """Load one language of a tab-separated dataset as a text/label frame.

    Args:
        path: Path to a tab-separated file with the columns 'language',
            'text' and the column named by ``task``.
        task: Name of the label column. For 'task1' and 'task2' the string
            labels are mapped to integers; any other column is returned
            unmapped.
        language: 'english' or 'spanish'; selects the rows whose 'language'
            column is 'en' or 'es' and the language used by ``clean``.

    Returns:
        A DataFrame with the columns 'text' (cleaned) and 'label'. Labels
        missing from the mapping become NaN.

    Raises:
        ValueError: If ``language`` is not supported.
    """
    language_codes = {'english': 'en', 'spanish': 'es'}
    if language not in language_codes:
        # Fail before doing any work: continuing would process the rows of
        # every language without filtering.
        raise ValueError(f'Language not supported: {language!r}')

    data = pd.read_csv(path, sep='\t')
    data = data[data['language'] == language_codes[language]]

    # Build the result from new Series instead of assigning into the
    # filtered slice, which triggers pandas' SettingWithCopyWarning.
    cleaned_text = data['text'].apply(clean, language=language)
    data = pd.DataFrame({'text': cleaned_text, 'label': data[task]})

    if task == 'task1':
        # Binary target
        data['label'] = data['label'].map({'sexist': 1, 'non-sexist': 0})
    elif task == 'task2':
        # Six numeric classes
        data['label'] = data['label'].map({
            'ideological-inequality': 1,
            'stereotyping-dominance': 2,
            'objectification': 3,
            'sexual-violence': 4,
            'misogyny-non-sexual-violence': 5,
            'non-sexist': 0,
        })

    return data


# Load the Turkish dataset
def load_turkish_data(path, task, language):
    """Load the Turkish dataset as a text/label frame.

    Args:
        path: Path to a ';'-separated file with the columns 'ID', 'Text',
            'Annotation_L1_Eng', 'Annotation_L2_Eng', 'Annotation_L1_TR'
            and 'Annotation_L2_TR'.
        task: Name of the label column after renaming ('task1' or 'task2').
        language: Language name passed to ``clean``.

    Returns:
        A DataFrame with the columns 'text' (cleaned) and 'label'. Labels
        missing from the mapping become NaN.
    """
    column_names = {
        'Annotation_L1_Eng': 'task1',
        'Annotation_L2_Eng': 'task2',
        'Annotation_L1_TR': 'task1_tr',
        'Annotation_L2_TR': 'task2_tr',
        'Text': 'text',
    }
    frame = pd.read_csv(path, delimiter=';').rename(columns=column_names)

    # Lowercase every string cell; other values are left untouched
    def lowercase_if_text(value):
        return value.lower() if isinstance(value, str) else value

    frame = frame.apply(lambda column: column.map(lowercase_if_text))

    # Add the language column
    frame['language'] = 'tr'

    # Rename the label values so they match the other datasets
    unified_labels = {
        'not-sexist': 'non-sexist',
        'anti-feminism': 'ideological-inequality',
        'misogyny': 'misogyny-non-sexual-violence',
        'sexual_violence': 'sexual-violence',
        'stereotyping': 'stereotyping-dominance',
    }
    frame = frame.replace(unified_labels)

    # Drop the columns that are not needed
    frame = frame.drop(columns=['task1_tr', 'task2_tr', 'ID'])

    frame['text'] = frame['text'].apply(clean, language=language)

    data = pd.DataFrame({'text': frame['text'], 'label': frame[task]})

    if task == 'task1':
        # Binary target
        data['label'] = data['label'].map({'sexist': 1, 'non-sexist': 0})
    elif task == 'task2':
        # Six numeric classes
        data['label'] = data['label'].map({
            'ideological-inequality': 1,
            'stereotyping-dominance': 2,
            'objectification': 3,
            'sexual-violence': 4,
            'misogyny-non-sexual-violence': 5,
            'non-sexist': 0,
        })

    return data


In [None]:
# Load one tokenizer per language; the commented-out lines are the
# multilingual alternative for each of them.
tokenizer_en = BertTokenizer.from_pretrained("bert-base-uncased")
#tokenizer_en = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

tokenizer_es = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")
#tokenizer_es = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

tokenizer_tr = BertTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
#tokenizer_tr = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

def tokenization(batched_text, language):
    """Tokenize the 'text' entry of a batch with the tokenizer of ``language``.

    Args:
        batched_text: Mapping with a 'text' entry holding the texts.
        language: 'english', 'spanish' or 'turkish'.

    Returns:
        The tokenizer output, padded and truncated.

    Raises:
        ValueError: If ``language`` is not supported.
    """
    if language == 'english':
        tokenizer = tokenizer_en
    elif language == 'spanish':
        tokenizer = tokenizer_es
    elif language == 'turkish':
        tokenizer = tokenizer_tr
    else:
        raise ValueError("Language not supported")

    return tokenizer(batched_text['text'], padding=True, truncation=True)

def formatting_data(df, language):
    """Convert a DataFrame into a tokenized Hugging Face dataset.

    Args:
        df: DataFrame with at least the columns 'text' and 'label'.
        language: Language name passed to ``tokenization``.

    Returns:
        A ``Dataset`` in 'torch' format exposing the columns 'input_ids',
        'attention_mask' and 'label'.
    """
    def tokenize_batch(batch):
        return tokenization(batch, language)

    # Wrap the DataFrame in a Hugging Face dataset
    dataset = Dataset(pa.Table.from_pandas(df))
    # The batch size equals the dataset length, so all rows are tokenized
    # (and padded) together as a single batch.
    dataset = dataset.map(tokenize_batch, batched=True, batch_size=len(dataset))
    # Expose only the model inputs and the label, as torch tensors
    dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    return dataset


Creamos un callback para guardar las métricas de evaluación en un archivo Excel

In [None]:
class SaveMetricsCallback(TrainerCallback):
    """Trainer callback that collects evaluation metrics and exports them to Excel.

    Every metrics dict received in ``on_evaluate`` is kept in
    ``metrics_data``; ``save_to_excel`` writes them to ``<file_name>.xlsx``.
    """

    def __init__(self, file_name):
        # file_name: workbook path without the '.xlsx' extension
        self.metrics_data = []
        self.file_name = file_name

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Store the metrics reported by an evaluation run, if any."""
        if metrics is None:
            return
        self.metrics_data.append(metrics)

    @staticmethod
    def _write_sheet(frame, path, sheet, **writer_options):
        """Write ``frame`` to the sheet ``sheet`` of the workbook at ``path``."""
        with pd.ExcelWriter(path, engine='openpyxl', **writer_options) as writer:
            frame.to_excel(writer, sheet_name=sheet, index=False)

    def save_to_excel(self):
        """Write the collected metrics to a new, randomly named sheet.

        The sheet is appended when the workbook already exists; otherwise a
        new workbook is created. A ``PermissionError`` is reported with a
        message instead of being raised.
        """
        df = pd.DataFrame(self.metrics_data)

        file_path = f'{self.file_name}.xlsx'
        sheet_name = f"hoja_{random.random()}"

        try:
            try:
                # Open the existing workbook and remove a same-named sheet, if present
                book = load_workbook(file_path)
                if sheet_name in book.sheetnames:
                    del book[sheet_name]
                    book.save(file_path)
                self._write_sheet(df, file_path, sheet_name, mode='a', if_sheet_exists='replace')
            except (FileNotFoundError, InvalidFileException, KeyError):
                # The workbook could not be loaded: create a new one
                self._write_sheet(df, file_path, sheet_name)
        except PermissionError:
            print(f"Error: No se pudo acceder o guardar en '{self.file_name}.xlsx'.")
        else:
            print(f"Datos guardados en el archivo '{self.file_name}.xlsx'.")



Función auxiliar que calcula las métricas que analizaremos en nuestro modelo y nos ayudará a sacar conclusiones

In [None]:
def metrics(predicted):
    """Compute the evaluation metrics for a set of predictions.

    Args:
        predicted: Object exposing ``label_ids`` (true labels) and
            ``predictions`` (per-class scores; the argmax over the last
            axis is taken as the predicted class).

    Returns:
        A dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
        The accuracy is also printed.
    """
    labels = predicted.label_ids
    predictions = predicted.predictions.argmax(-1)

    accuracy = accuracy_score(labels, predictions)
    print('Accuracy:', accuracy)

    report = classification_report(labels, predictions, output_dict=True, zero_division=0)
    weighted_avg = report['weighted avg']

    # NOTE(review): recall is macro-averaged while precision and F1 are
    # weighted averages; kept as is so reported numbers do not change, but
    # confirm that the mix is intended.
    recall = recall_score(labels, predictions, average='macro', zero_division=0)

    return {
        'accuracy': accuracy,
        'precision': weighted_avg['precision'],
        'recall': recall,
        'f1': weighted_avg['f1-score'],
    }


## BERT



### task 1

## Cargamos los datos

In [None]:
# Task 1: English and Spanish are read from the same files and selected by language
# English
train_load_en = load_data("EXIST_training.tsv", 'task1', 'english')
test_load_en = load_data("EXIST_test_labeled.tsv", 'task1', 'english')
# Spanish
train_load_es = load_data("EXIST_training.tsv", 'task1', 'spanish')
test_load_es = load_data("EXIST_test_labeled.tsv",  'task1', 'spanish')
# Turkish
df_exist_turkish = load_turkish_data("Dataset_Sexism_Turkish.csv",  'task1', 'turkish')

In [None]:
# Split the Turkish dataset into training (60%) and test (40%) sets
train_load_tr, test_load_tr = train_test_split(df_exist_turkish, test_size=0.4, random_state=42)

In [None]:
# Hold out 20% of each training set as a validation set
train_load_en , validation_load_en  = train_test_split(train_load_en , test_size=0.2, random_state=42) # English
train_load_es , validation_load_es  = train_test_split(train_load_es , test_size=0.2, random_state=42) # Spanish
train_load_tr , validation_load_tr  = train_test_split(train_load_tr , test_size=0.2, random_state=42) # Turkish

In [None]:
# Tokenize every split and convert it to a torch-formatted dataset

# English
train_en = formatting_data(train_load_en, 'english')
# The test set must come from test_load_en: building it from train_load_en
# made the final evaluation run on the training data.
test_en = formatting_data(test_load_en, 'english')
validation_en = formatting_data(validation_load_en, 'english')

# Spanish
train_es = formatting_data(train_load_es, 'spanish')
test_es = formatting_data(test_load_es, 'spanish')
validation_es = formatting_data(validation_load_es, 'spanish')

# Turkish
train_tr = formatting_data(train_load_tr, 'turkish')
test_tr = formatting_data(test_load_tr, 'turkish')
validation_tr = formatting_data(validation_load_tr, 'turkish')

In [None]:
# Load one pretrained model per language; the commented-out lines are the
# multilingual alternative for each of them.
#model_en = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased")
model_en = BertForSequenceClassification.from_pretrained("bert-base-uncased")

#model_es = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased")
model_es = BertForSequenceClassification.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")

#model_tr = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased")
model_tr = BertForSequenceClassification.from_pretrained("dbmdz/bert-base-turkish-cased")

In [None]:
# Training arguments shared by the three task-1 trainers.
# NOTE(review): all of them write their checkpoints to the same output_dir.
training_args = TrainingArguments(
    output_dir='RESULTADOS',
    evaluation_strategy='epoch',
    logging_strategy='steps',  # Log every `logging_steps` steps
    save_strategy='epoch',
    learning_rate=1e-5,  # Learning rate
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,  
    num_train_epochs=5, 
    weight_decay=0.1, 
    logging_dir='LOGS',
    logging_steps=100,  
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,
    save_total_limit=3,  # Limit the number of saved checkpoints
    gradient_accumulation_steps=2,  
)

In [None]:
# Callback that collects the evaluation metrics of the English task-1 model
metrics_callback = SaveMetricsCallback(file_name='training_metrics_bert_t1_en')

# Trainer for the English model
trainer_en = Trainer(
    model=model_en,
    args=training_args,
    compute_metrics=metrics,
    train_dataset=train_en,
    eval_dataset=validation_en,
    callbacks=[metrics_callback]
)

# Train the model
print('\n* TRAIN *\n')
trainer_en.train()

# Evaluate the model on the test set
print('\n* EVALUATE *\n')
trainer_en.evaluate(test_en)

# Save the collected metrics to Excel
metrics_callback.save_to_excel()

In [None]:
# Save the model and the tokenizer to the same directory
model_name = 'saved_bert_model_task1_en'
model_en.save_pretrained(model_name)
tokenizer_en.save_pretrained(model_name)

In [None]:

# Callback that collects the evaluation metrics of the Spanish task-1 model
metrics_callback = SaveMetricsCallback(file_name='training_metrics_bert_t1_es')

# Trainer for the Spanish model
trainer_es = Trainer(
    model=model_es,
    args=training_args,
    compute_metrics=metrics,
    train_dataset=train_es,
    eval_dataset=validation_es,
    callbacks=[metrics_callback]
)

# Train the model
print('\n* TRAIN *\n')
trainer_es.train()

# Evaluate the model on the test set
print('\n* EVALUATE *\n')
trainer_es.evaluate(test_es)

# Save the collected metrics to Excel
metrics_callback.save_to_excel()

In [None]:
# Save the model and the tokenizer to the same directory
model_name = 'saved_bert_model_task1_es'
model_es.save_pretrained(model_name)
tokenizer_es.save_pretrained(model_name)

In [None]:

# Callback that collects the evaluation metrics of the Turkish task-1 model
metrics_callback = SaveMetricsCallback(file_name='training_metrics_bert_t1_tr')

# Trainer for the Turkish model
trainer_tr = Trainer(
    model=model_tr,
    args=training_args,
    compute_metrics=metrics,
    train_dataset=train_tr,
    eval_dataset=validation_tr,
    callbacks=[metrics_callback]
)
# Train the model
print('\n* TRAIN *\n')
trainer_tr.train()

# Evaluate the model on the test set
print('\n* EVALUATE *\n')
trainer_tr.evaluate(test_tr)

# Save metrics to Excel
metrics_callback.save_to_excel()

In [None]:
# Save the model and the tokenizer to the same directory
model_name = 'saved_bert_model_task1_tr'
model_tr.save_pretrained(model_name)
tokenizer_tr.save_pretrained(model_name)

### task 2

## Cargamos los datos

In [None]:
# Task 2: English and Spanish are read from the same files and selected by language
# English
train_load_en = load_data("EXIST_training.tsv", 'task2', 'english')
test_load_en = load_data("EXIST_test_labeled.tsv", 'task2', 'english')
# Spanish
train_load_es = load_data("EXIST_training.tsv", 'task2', 'spanish')
test_load_es = load_data("EXIST_test_labeled.tsv",  'task2', 'spanish')
# Turkish
df_exist_turkish = load_turkish_data("Dataset_Sexism_Turkish.csv",  'task2', 'turkish')

In [None]:
# Split the Turkish dataset into training (60%) and test (40%) sets
train_load_tr, test_load_tr = train_test_split(df_exist_turkish, test_size=0.4, random_state=42)

In [None]:
# Hold out 20% of each training set as a validation set
train_load_en , validation_load_en  = train_test_split(train_load_en , test_size=0.2, random_state=42) # English
train_load_es , validation_load_es  = train_test_split(train_load_es , test_size=0.2, random_state=42) # Spanish
train_load_tr , validation_load_tr  = train_test_split(train_load_tr , test_size=0.2, random_state=42) # Turkish

In [None]:
# Tokenize every split and convert it to a torch-formatted dataset

# English
train_en = formatting_data(train_load_en, 'english')
# The test set must come from test_load_en: building it from train_load_en
# made the final evaluation run on the training data.
test_en = formatting_data(test_load_en, 'english')
validation_en = formatting_data(validation_load_en, 'english')

# Spanish
train_es = formatting_data(train_load_es, 'spanish')
test_es = formatting_data(test_load_es, 'spanish')
validation_es = formatting_data(validation_load_es, 'spanish')

# Turkish
train_tr = formatting_data(train_load_tr, 'turkish')
test_tr = formatting_data(test_load_tr, 'turkish')
validation_tr = formatting_data(validation_load_tr, 'turkish')

In [None]:
# Training arguments shared by the three task-2 trainers.
# NOTE(review): all of them write their checkpoints to the same output_dir.
training_args_3 = TrainingArguments(
    output_dir='RESULTADOS',
    evaluation_strategy='epoch',
    logging_strategy='steps', 
    save_strategy='epoch',
    learning_rate=2e-5, 
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32, 
    num_train_epochs=5, 
    weight_decay=0.1, 
    logging_dir='LOGS',
    logging_steps=100, 
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,
    save_total_limit=3,
    gradient_accumulation_steps=2,
)

In [None]:

# Task 2 has six classes (labels 0-5), so it needs a fresh pretrained model
# with a 6-way classification head. Reusing the task-1 model would keep its
# default 2-label head (and its task-1 fine-tuning), which cannot score
# labels above 1.
model_en = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6)

# Callback that collects the evaluation metrics of the English task-2 model
metrics_callback = SaveMetricsCallback(file_name='training_metrics_bert_t2_en')

# Trainer for the English model
trainer_en = Trainer(
    model=model_en,
    args=training_args_3,
    compute_metrics=metrics,
    train_dataset=train_en,
    eval_dataset=validation_en,
    callbacks=[metrics_callback]
)

# Train the model
print('\n* TRAIN *\n')
trainer_en.train()

# Evaluate the model on the test set
print('\n* EVALUATE *\n')
trainer_en.evaluate(test_en)

# Save the collected metrics to Excel
metrics_callback.save_to_excel()


In [None]:
# Save the model and the tokenizer to the same directory
model_name = 'saved_bert_model_task2_en'
model_en.save_pretrained(model_name)
tokenizer_en.save_pretrained(model_name)


In [None]:
# Task 2 has six classes (labels 0-5), so it needs a fresh pretrained model
# with a 6-way classification head. Reusing the task-1 model would keep its
# default 2-label head (and its task-1 fine-tuning), which cannot score
# labels above 1.
model_es = BertForSequenceClassification.from_pretrained(
    "dccuchile/bert-base-spanish-wwm-uncased", num_labels=6
)

# Instantiate the metrics callback
metrics_callback = SaveMetricsCallback(file_name='training_metrics_bert_t2_es')

# Instantiate the trainer class
trainer_es = Trainer(
    model=model_es,
    args=training_args_3,
    compute_metrics=metrics,
    train_dataset=train_es,
    eval_dataset=validation_es,
    callbacks=[metrics_callback]
)

# Train the model
print('\n* TRAIN *\n')
trainer_es.train()

# Evaluate the model on the test set
print('\n* EVALUATE *\n')
trainer_es.evaluate(test_es)

# Save the collected metrics to Excel
metrics_callback.save_to_excel()

In [None]:
# Save the model and the tokenizer to the same directory
model_name = 'saved_bert_model_task2_es'
model_es.save_pretrained(model_name)
tokenizer_es.save_pretrained(model_name)

In [None]:
# Task 2 has six classes (labels 0-5), so it needs a fresh pretrained model
# with a 6-way classification head. Reusing the task-1 model would keep its
# default 2-label head (and its task-1 fine-tuning), which cannot score
# labels above 1.
model_tr = BertForSequenceClassification.from_pretrained(
    "dbmdz/bert-base-turkish-cased", num_labels=6
)

# Callback that collects the evaluation metrics of the Turkish task-2 model
metrics_callback = SaveMetricsCallback(file_name='training_metrics_bert_t2_tr')

# Trainer for the Turkish model
trainer_tr = Trainer(
    model=model_tr,
    args=training_args_3,
    compute_metrics=metrics,
    train_dataset=train_tr,
    eval_dataset=validation_tr,
    callbacks=[metrics_callback]
)

# Train the model
print('\n* TRAIN *\n')
trainer_tr.train()

# Evaluate the model on the test set
print('\n* EVALUATE *\n')
trainer_tr.evaluate(test_tr)

# Save the collected metrics to Excel
metrics_callback.save_to_excel()

In [None]:
# Save the model and the tokenizer to the same directory
model_name = 'saved_bert_model_task2_tr'
model_tr.save_pretrained(model_name)
tokenizer_tr.save_pretrained(model_name)