In [22]:
import os
from time import time
from pathlib import Path
import pandas as pd 
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from datasets import Dataset
import accelerate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [23]:
# Obtenemos la carpeta actual
current_dir = Path.cwd()
DATASETS_LOCATION = os.path.join(current_dir.parent.parent.parent, 'datasets')
MODELS_LOCATION = os.path.join(current_dir.parent.parent, 'models')

Cargamos el dataset:

In [24]:
df_all_languages = pd.read_csv(os.path.join(DATASETS_LOCATION, 'language_detection.csv'))

# Filtramos por idioma español e inglés, que son los que nos interesan
df_spanish_english = df_all_languages[df_all_languages['Language'].isin(['Spanish', 'English'])]

# Mostrar el DataFrame filtrado
df_spanish_english.head(3)

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English


In [25]:
print("Dataset Language distribution antes de undersampling:")
print(df_spanish_english['Language'].value_counts())

# Determinamos la cantidad mínima de ejemplos por clase
min_count = min(df_spanish_english['Language'].value_counts())

# Submuestreamos para equilibrar las clases
df_balanced = pd.concat([
    df_spanish_english[df_spanish_english['Language'] == 'English'].sample(n=min_count, random_state=42),
    df_spanish_english[df_spanish_english['Language'] == 'Spanish']
])

# Verificamos el equilibrio
print("\n Dataset Language distribution despues de undersampling:")
print(df_balanced['Language'].value_counts())

Dataset Language distribution antes de undersampling:
Language
English    1385
Spanish     819
Name: count, dtype: int64

 Dataset Language distribution despues de undersampling:
Language
English    819
Spanish    819
Name: count, dtype: int64


Dividimos el dataset en conjunto de entrenamiento y prueba:

In [26]:
# Dividiamo il dataset bilanciato in train (80%) e test (20%)
train_df, test_df = train_test_split(df_balanced, test_size=0.2, random_state=42, stratify=df_balanced['Language'])

# Dividiamo ulteriormente il train in train (80%) e validation (20%)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df['Language'])

# Verifichiamo che i set siano equilibrati
print("\nConjunto de entrenamiento:")
print(train_df['Language'].value_counts())

print("\nConjunto de validación:")
print(val_df['Language'].value_counts())

print("\nConjunto de prueba:")
print(test_df['Language'].value_counts())



Conjunto de entrenamiento:
Language
Spanish    524
English    524
Name: count, dtype: int64

Conjunto de validación:
Language
Spanish    131
English    131
Name: count, dtype: int64

Conjunto de prueba:
Language
English    164
Spanish    164
Name: count, dtype: int64


In [27]:
# Crear un codificador de etiquetas para convertir las clases de texto en números
label_encoder = LabelEncoder()

# Convertir las clases de 'Language' a números
train_df['label'] = label_encoder.fit_transform(train_df['Language'])
val_df['label'] = label_encoder.transform(val_df['Language'])
test_df['label'] = label_encoder.transform(test_df['Language'])

train_df.head()

Unnamed: 0,Text,Language,label
4853,El proyecto tuvo el apoyo económico de la empr...,Spanish,1
177,"In Linnaeus' system, these became the kingdoms...",English,0
1008,It is a powerful tool we are only just beginni...,English,0
5163,Por lo tanto es un proceso de inducción del co...,Spanish,1
413,"Notably, the results of a Wikimedia Foundation...",English,0


## Entrenamiento

In [28]:
# Definimos la función de métricas
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {
        "accuracy": accuracy_score(p.label_ids, preds),
        "precision": precision_score(p.label_ids, preds, average='macro'),
        "recall": recall_score(p.label_ids, preds, average='macro'),
        "f1": f1_score(p.label_ids, preds, average='macro'),
    }

In [29]:
# Definimos el modelo y el tokenizer
model_checkpoint = "distilbert-base-multilingual-cased"  # Usaremos el modelo preentrenado BERT
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# Preprocesamos los datos
def preprocess_function(examples):
    return tokenizer(examples['Text'], examples['Language'], padding=True, truncation=True, return_tensors="pt")

In [31]:
# Converti i DataFrame in dataset di Hugging Face
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
validation_dataset = Dataset.from_pandas(val_df)
# Tokenizamos el dataset
train_preprocessed_dataset = train_dataset.map(preprocess_function, batched=True)
test_preprocessed_dataset = test_dataset.map(preprocess_function, batched=True)
val_preprocessed_dataset = validation_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1048 [00:00<?, ? examples/s]

Map:   0%|          | 0/328 [00:00<?, ? examples/s]

Map:   0%|          | 0/262 [00:00<?, ? examples/s]

In [32]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-5,              # Tasa de aprendizaje: se ha probado 5e-6, 1e-5, 5e-5
    per_device_train_batch_size=32,   # Tamaño del batch para entrenamiento: se ha probado 8, 16, 32, 64
    per_device_eval_batch_size=32,    # Tamaño del batch para evaluación
    num_train_epochs=3,
    weight_decay=0.3,               # Decaimiento de peso: hemos probado 0.01, 0.1, 0.2 e 0.3 e 0.4 y este era el mejor
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_preprocessed_dataset,
    eval_dataset=val_preprocessed_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics   # Función para calcular las métricas
)

  trainer = Trainer(


## Training

In [33]:
train_preprocessed_dataset

Dataset({
    features: ['Text', 'Language', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 1048
})

In [34]:
# No modificar esta celda
# Esta celda, celda tiene que estar ejecutada en la entrega

start = time()

trainer.train()

end = time()
print(f">>>>>>>>>>>>> elapsed time: {(end-start)/60:.0f}m")

  0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

{'eval_loss': 0.19888177514076233, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 22.6454, 'eval_samples_per_second': 11.57, 'eval_steps_per_second': 0.397, 'epoch': 1.0}


  0%|          | 0/9 [00:00<?, ?it/s]

{'eval_loss': 0.024797309190034866, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 11.9686, 'eval_samples_per_second': 21.891, 'eval_steps_per_second': 0.752, 'epoch': 2.0}


  0%|          | 0/9 [00:00<?, ?it/s]

{'eval_loss': 0.016829293221235275, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 10.9713, 'eval_samples_per_second': 23.88, 'eval_steps_per_second': 0.82, 'epoch': 3.0}
{'train_runtime': 631.7735, 'train_samples_per_second': 4.976, 'train_steps_per_second': 0.157, 'train_loss': 0.2066112287116773, 'epoch': 3.0}
>>>>>>>>>>>>> elapsed time: 11m


## Evaluación

In [38]:
# Evaluación del modelo en el conjunto de prueba

# Preprocesamos el conjunto de prueba
test_preprocessed_dataset = test_dataset.map(preprocess_function, batched=True)

# Evaluamos el modelo en el conjunto de prueba
results = trainer.evaluate(test_preprocessed_dataset)

# Imprimimos las métricas de evaluación
print("Resultados de la evaluación en el conjunto de prueba:")
print(f"Exactitud: {results['eval_accuracy']:.4f}")
print(f"Precisión: {results['eval_precision']:.4f}")
print(f"Recall: {results['eval_recall']:.4f}")
print(f"F1-Score: {results['eval_f1']:.4f}")

# Para obtener más detalles sobre las predicciones y las métricas, también puedes obtener las predicciones:
# Realizamos predicciones en el conjunto de prueba
predictions = trainer.predict(test_preprocessed_dataset)

# Calculamos las métricas manualmente si lo deseas
preds = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

# Exactitud
accuracy = accuracy_score(y_true, preds)
print(f"Exactitud calculada manualmente: {accuracy:.4f}")

# Precisión, recall y F1-score con el promedio macro
precision = precision_score(y_true, preds, average='macro')
recall = recall_score(y_true, preds, average='macro')
f1 = f1_score(y_true, preds, average='macro')

print(f"Precisión calculada manualmente: {precision:.4f}")
print(f"Recall calculado manualmente: {recall:.4f}")
print(f"F1-Score calculado manualmente: {f1:.4f}")


Map:   0%|          | 0/328 [00:00<?, ? examples/s]

  0%|          | 0/11 [00:00<?, ?it/s]

Resultados de la evaluación en el conjunto de prueba:
Exactitud: 1.0000
Precisión: 1.0000
Recall: 1.0000
F1-Score: 1.0000


  0%|          | 0/11 [00:00<?, ?it/s]

Exactitud calculada manualmente: 1.0000
Precisión calculada manualmente: 1.0000
Recall calculado manualmente: 1.0000
F1-Score calculado manualmente: 1.0000


In [39]:
# Salviamo il modello e il tokenizer
save_directory = os.path.join(MODELS_LOCATION, 'languiage_detection')
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)
print(f"Modello e tokenizer salvati nella directory {save_directory}")

Modello e tokenizer salvati nella directory c:\Users\maria\Desktop\universidad\master\TFM\tfm\src\models\languiage_detection


In [40]:
# Definiamo alcune frasi di esempio per il test che possano confondere il modello
sample_sentences = [
    {"Text": "Do you want tortilla for dinner?", "Language": "English"},
    {"Text": "Este es un ejemplo de software developement.", "Language": "Spanish"},
    {"Text": "Soy un cloud engineer, trabajo en google", "Language": "Spanish"}
]

# Convertiamo le frasi di esempio in un DataFrame
sample_df = pd.DataFrame(sample_sentences)

# Tokenizziamo le frasi di esempio
sample_dataset = Dataset.from_pandas(sample_df)
sample_preprocessed_dataset = sample_dataset.map(preprocess_function, batched=True)

# Effettuiamo le predizioni
sample_predictions = trainer.predict(sample_preprocessed_dataset)
sample_preds = np.argmax(sample_predictions.predictions, axis=1)
sample_df["Predicted Language"] = label_encoder.inverse_transform(sample_preds)

# Mostriamo i risultati
print("Risultati delle predizioni sulle frasi di esempio:")
sample_df

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Risultati delle predizioni sulle frasi di esempio:


Unnamed: 0,Text,Language,Predicted Language
0,Do you want tortilla for dinner?,English,English
1,Este es un ejemplo de software developement.,Spanish,Spanish
2,"Soy un cloud engineer, trabajo en google",Spanish,Spanish


Abbiamo ri-addestrato un modello gia' capace di fare language classification sui nostri dati che contenevano solo spagnolo e inglese. ci aspettavamo dei risultati molto buoni. dato che questa non e' la parte focale del progetto utilizzeremo questo modello 