In [None]:
import os
from time import time
from pathlib import Path
import pandas as pd 
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score




Obtenemos la carpeta actual y, a partir de ella, las rutas de las carpetas de datasets y de modelos:

In [2]:
# Resolve the data and model folders relative to the notebook's working directory.
current_dir = Path.cwd()
models_parent_dir = current_dir.parent.parent
# Datasets live three levels above the working directory, models two levels above.
DATASETS_LOCATION = os.path.join(models_parent_dir.parent, 'datasets')
MODELS_LOCATION = os.path.join(models_parent_dir, 'models')

Cargamos el dataset, y filtramos por los idiomas español e inglés, que son los que nos interesan:

In [3]:
# Read the full multilingual corpus from the datasets folder.
language_csv_path = os.path.join(DATASETS_LOCATION, 'language_detection.csv')
df_all_languages = pd.read_csv(language_csv_path)

# Keep only the rows labelled as Spanish or English.
is_target_language = df_all_languages['Language'].isin(['Spanish', 'English'])
df_spanish_english = df_all_languages[is_target_language]
df_spanish_english.head(3)

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English


Como queremos que de ambas clases haya el mismo número de ejemplos, primero determinamos la cantidad de ejemplos que hay de cada clase, para posteriormente equilibrarlas:

In [4]:
# Show how many examples each language has before balancing.
print("Dataset Language distribution antes de undersampling:")
class_counts = df_spanish_english['Language'].value_counts()
print(class_counts)

# Size of the smallest class: every class is reduced to this many rows.
min_count = class_counts.min()

# Undersample every class that is larger than the smallest one. The majority
# class is detected from the counts instead of being assumed to be English, so
# the result stays balanced whichever language has more rows. Classes already
# at min_count are kept untouched (same rows, same order).
balanced_parts = []
for language in sorted(class_counts.index):
    language_rows = df_spanish_english[df_spanish_english['Language'] == language]
    if len(language_rows) > min_count:
        language_rows = language_rows.sample(n=min_count, random_state=42)
    balanced_parts.append(language_rows)
df_balanced = pd.concat(balanced_parts)

# Check that the classes are now balanced.
print("\n Dataset Language distribution después de undersampling:")
print(df_balanced['Language'].value_counts())

Dataset Language distribution antes de undersampling:
Language
English    1385
Spanish     819
Name: count, dtype: int64

 Dataset Language distribution después de undersampling:
Language
English    819
Spanish    819
Name: count, dtype: int64


Dividimos el dataset balanceado (que será el dataset que utilizaremos de ahora en adelante), en conjunto de entrenamiento y prueba, y el dataset de entrenamiento en entrenamiento y validación:

In [5]:
# Hold out 20% of the balanced data as the test split, then 20% of what remains
# as the validation split. Both splits are stratified on the language column.
train_df, test_df = train_test_split(
    df_balanced,
    test_size=0.2,
    random_state=42,
    stratify=df_balanced['Language'],
)
train_df, val_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['Language'],
)

# Report the class distribution of every split.
for split_header, split_df in (
    ("\nConjunto de entrenamiento:", train_df),
    ("\nConjunto de validación:", val_df),
    ("\nConjunto de prueba:", test_df),
):
    print(split_header)
    print(split_df['Language'].value_counts())


Conjunto de entrenamiento:
Language
Spanish    524
English    524
Name: count, dtype: int64

Conjunto de validación:
Language
Spanish    131
English    131
Name: count, dtype: int64

Conjunto de prueba:
Language
English    164
Spanish    164
Name: count, dtype: int64


Codificamos mediante LabelEncoder las clases de 'Language':

In [6]:
# Encode the language names as integer class ids. The encoder is fitted on the
# training split only and then reused for validation and test.
label_encoder = LabelEncoder()

# Build new frames with `assign` instead of writing a column in place: the
# splits come out of train_test_split, and in-place column writes on such
# frames may raise SettingWithCopyWarning (depends on pandas/sklearn versions)
# and silently change frames that earlier cells already displayed.
train_df = train_df.assign(label=label_encoder.fit_transform(train_df['Language']))
val_df = val_df.assign(label=label_encoder.transform(val_df['Language']))
test_df = test_df.assign(label=label_encoder.transform(test_df['Language']))

train_df.head()

Unnamed: 0,Text,Language,label
4853,El proyecto tuvo el apoyo económico de la empr...,Spanish,1
177,"In Linnaeus' system, these became the kingdoms...",English,0
1008,It is a powerful tool we are only just beginni...,English,0
5163,Por lo tanto es un proceso de inducción del co...,Spanish,1
413,"Notably, the results of a Wikimedia Foundation...",English,0


## Entrenamiento

Definimos la función de métricas:

In [7]:
def compute_metrics(p):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        p: Object exposing ``predictions`` (per-class scores, reduced with
            argmax over axis 1) and ``label_ids`` (the reference labels).

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    scores = {"accuracy": accuracy_score(p.label_ids, predicted_ids)}
    # The remaining metrics share the same call shape and macro averaging.
    for metric_name, scorer in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        scores[metric_name] = scorer(p.label_ids, predicted_ids, average='macro')
    return scores

Y definimos el modelo que vamos a utilizar y el tokenizer:

In [8]:
# Pretrained multilingual DistilBERT checkpoint used as the starting point.
model_checkpoint = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Record the class names in the model config so that the saved checkpoint
# carries its own id <-> language mapping. The ids follow the order of the
# classes learned by the fitted LabelEncoder.
id2label = {int(class_id): str(name) for class_id, name in enumerate(label_encoder.classes_)}
label2id = {name: class_id for class_id, name in id2label.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preprocesamos los datos:

In [9]:
def preprocess_function(examples):
    """Tokenize the ``Text`` field of a batch of examples.

    Only truncation is applied here. Padding is deliberately not done at this
    stage: inside a batched ``Dataset.map`` it would pad every example to the
    longest text of the whole map batch. Batches are expected to be padded
    later by the Trainer's data collator (the Trainer in this notebook is
    given the tokenizer). Plain Python lists are returned because the mapped
    dataset stores them as such anyway.

    Args:
        examples: Mapping with a ``Text`` entry holding the texts to tokenize.

    Returns:
        The tokenizer output for ``examples['Text']``.
    """
    return tokenizer(examples['Text'], truncation=True)

Y convertimos el dataframe en un dataset de Hugging Face:

In [10]:
# Wrap each pandas split in a Hugging Face Dataset.
train_dataset, test_dataset, validation_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (train_df, test_df, val_df)
)

# Tokenize every split in batches (same order as the conversion above).
train_preprocessed_dataset, test_preprocessed_dataset, val_preprocessed_dataset = (
    raw_split.map(preprocess_function, batched=True)
    for raw_split in (train_dataset, test_dataset, validation_dataset)
)

Map:   0%|          | 0/1048 [00:00<?, ? examples/s]

Map:   0%|          | 0/328 [00:00<?, ? examples/s]

Map:   0%|          | 0/262 [00:00<?, ? examples/s]

In [11]:
# Training configuration. `eval_strategy` replaces the deprecated
# `evaluation_strategy` argument name.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",            # Evaluate on the validation split after every epoch
    learning_rate=1e-5,               # Learning rate; 5e-6, 1e-5 and 5e-5 were tried
    per_device_train_batch_size=32,   # Training batch size; 8, 16, 32 and 64 were tried
    per_device_eval_batch_size=32,    # Evaluation batch size
    num_train_epochs=3,
    weight_decay=0.3,                 # Weight decay; 0.01, 0.1, 0.2, 0.3 and 0.4 were tried and this was the best
    logging_dir="./logs",
)

# `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
# Passing the tokenizer lets the Trainer pad each batch when collating.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_preprocessed_dataset,
    eval_dataset=val_preprocessed_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,  # Metric function run at every evaluation
)

  trainer = Trainer(


## Ejecución del entrenamiento

Los conjuntos de entrenamiento y validación ya se han tokenizado en las celdas anteriores.

Entrenamos nuestro modelo:

In [12]:
# Measure the wall-clock duration of the fine-tuning run.
start = time()
trainer.train()
end = time()

elapsed_minutes = (end - start) / 60
print(f">>>>>>>>>>>>> elapsed time: {elapsed_minutes:.0f}m")

  0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

{'eval_loss': 0.3422802686691284, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 9.0225, 'eval_samples_per_second': 29.038, 'eval_steps_per_second': 0.998, 'epoch': 1.0}


  0%|          | 0/9 [00:00<?, ?it/s]

{'eval_loss': 0.05727243050932884, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 9.3172, 'eval_samples_per_second': 28.12, 'eval_steps_per_second': 0.966, 'epoch': 2.0}


  0%|          | 0/9 [00:00<?, ?it/s]

{'eval_loss': 0.03310896456241608, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 10.2641, 'eval_samples_per_second': 25.526, 'eval_steps_per_second': 0.877, 'epoch': 3.0}
{'train_runtime': 554.052, 'train_samples_per_second': 5.675, 'train_steps_per_second': 0.179, 'train_loss': 0.2643118193655303, 'epoch': 3.0}
>>>>>>>>>>>>> elapsed time: 9m


## Evaluación

Evaluamos el modelo en el conjunto de prueba:

In [13]:
# Tokenize the held-out test split.
test_preprocessed_dataset = test_dataset.map(preprocess_function, batched=True)

# Score the fine-tuned model on the test split.
results = trainer.evaluate(test_preprocessed_dataset)

# Print the evaluation metrics with four decimals.
print("Resultados de la evaluación en el conjunto de prueba:")
for metric_label, metric_key in (
    ("Exactitud", 'eval_accuracy'),
    ("Precisión", 'eval_precision'),
    ("Recall", 'eval_recall'),
    ("F1-Score", 'eval_f1'),
):
    print(f"{metric_label}: {results[metric_key]:.4f}")

Map:   0%|          | 0/328 [00:00<?, ? examples/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Resultados de la evaluación en el conjunto de prueba:
Exactitud: 1.0000
Precisión: 1.0000
Recall: 1.0000
F1-Score: 1.0000


# Guardar modelo

Guardamos el modelo y el tokenizer en un directorio:

In [15]:
# Write the fine-tuned model and its tokenizer into the same folder.
save_directory = os.path.join(MODELS_LOCATION, 'language_detection')
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(save_directory)
print(f"Modelo y tokenizer guardados en el directorio {save_directory}")

Modelo y tokenizer guardados en el directorio c:\Users\maria\Desktop\universidad\master\TFM\tfm\src\models\language_detection


Ahora definimos algunas frases de ejemplo que mezclan palabras de ambos idiomas y que podrían confundir al modelo, para hacer una prueba:

In [16]:
# Hand-written sentences mixing both languages, each with its expected label.
sample_sentences = [
    dict(Text=sample_text, Language=expected_language)
    for sample_text, expected_language in (
        ("Do you want tortilla for dinner?", "English"),
        ("Este es un ejemplo de software developement.", "Spanish"),
        ("Soy un cloud engineer, trabajo en google", "Spanish"),
    )
]

# Turn the sample sentences into a dataframe and tokenize them.
sample_df = pd.DataFrame(sample_sentences)
sample_dataset = Dataset.from_pandas(sample_df)
sample_preprocessed_dataset = sample_dataset.map(preprocess_function, batched=True)

# Predict with the fine-tuned model and map the class ids back to language names.
sample_predictions = trainer.predict(sample_preprocessed_dataset)
sample_preds = np.argmax(sample_predictions.predictions, axis=1)
sample_df["Predicted Language"] = label_encoder.inverse_transform(sample_preds)

print("Resultados de las predicciones en las frases de ejemplo:")
sample_df

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Resultados de las predicciones en las frases de ejemplo:


Unnamed: 0,Text,Language,Predicted Language
0,Do you want tortilla for dinner?,English,English
1,Este es un ejemplo de software developement.,Spanish,Spanish
2,"Soy un cloud engineer, trabajo en google",Spanish,Spanish


Hemos ajustado (fine-tuning) un modelo preentrenado para clasificar el idioma con nuestros datos, que contienen solo español e inglés. Como era de esperar, los resultados son muy buenos: exactitud, precisión, recall y F1 de 1.0 tanto en validación como en prueba. Dado que esta no es la parte central del proyecto, utilizaremos este modelo.

# Test de carga

In [20]:
# Reload the tokenizer and the model from the local folder only.
model_directory = os.path.join(MODELS_LOCATION, 'language_detection')
local_only = {"local_files_only": True}
tokenizer = AutoTokenizer.from_pretrained(model_directory, **local_only)
model_loaded = AutoModelForSequenceClassification.from_pretrained(model_directory, **local_only)
def classify_disease_symptoms(input_text: str) -> str:
    """Return the language predicted for ``input_text`` by the reloaded model.

    NOTE(review): despite its name, this function detects the language of the
    text ('English' or 'Spanish'); the name is kept so existing calls work.

    Args:
        input_text: Sentence to classify.

    Returns:
        'English' for class id 0, 'Spanish' for class id 1.
    """
    prediction_mapping = {0: 'English', 1: 'Spanish'}
    # Tokenize the sentence into PyTorch tensors.
    encoded = tokenizer(input_text, padding=True, truncation=True, return_tensors="pt")
    # Run the forward pass without tracking gradients.
    with torch.no_grad():
        logits = model_loaded(**encoded).logits
    predicted_id = torch.argmax(logits, axis=1).item()
    return prediction_mapping[predicted_id]


def add_predictions(df):
    """Return a copy of ``df`` with a 'Predicted Language' column added.

    The prediction for each row is obtained by applying
    ``classify_disease_symptoms`` to its 'Text' value. The input frame is left
    unchanged, so frames defined in earlier cells are not mutated.

    Args:
        df: DataFrame with a 'Text' column.

    Returns:
        A new DataFrame holding the original columns plus 'Predicted Language'.
    """
    predicted_languages = df['Text'].apply(classify_disease_symptoms)
    return df.assign(**{'Predicted Language': predicted_languages})

# Predict the language of every test sentence with the reloaded model.
predizioni = add_predictions(test_df)

# Pull out the reference and the predicted labels.
true_values, predicted_values = predizioni['Language'], predizioni['Predicted Language']

# Compute and show the accuracy of the reloaded model.
accuracy = accuracy_score(true_values, predicted_values)
print(f"L'accuracy del modello è: {accuracy:.2f}")


L'accuracy del modello è: 1.00
