In [1]:
import os
from time import time
from pathlib import Path
import pandas as pd 
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from datasets import Dataset
import accelerate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score




Obtenemos la carpeta actual:

In [2]:
# Resolve the dataset and model folders relative to the notebook's working directory.
# Both locations are kept as plain strings.
current_dir = Path.cwd()
DATASETS_LOCATION = str(current_dir.parent.parent.parent / 'datasets')
MODELS_LOCATION = str(current_dir.parent.parent / 'models')

Cargamos el dataset en un dataframe:

In [3]:
# Read the `disease_nlp_eng.xlsx` spreadsheet from the datasets folder and preview three rows.
dataset_path = os.path.join(DATASETS_LOCATION, 'disease_nlp_eng.xlsx')
df_disease_sp = pd.read_excel(dataset_path)
df_disease_sp.head(3)

Unnamed: 0,Symptoms,Disease
0,I've been feeling tired all the time and notic...,Anemia
1,"Lately, I’ve been out of breath even after lig...",Anemia
2,"For the past few weeks, I’ve been extremely we...",Anemia


In [4]:
# Number of examples available for each disease class.
disease_counts = df_disease_sp['Disease'].value_counts()
print(disease_counts)

Disease
Anemia         100
Thalassemia    100
Thrombosis     100
Diabetes       100
Name: count, dtype: int64


Observamos que las clases ya están equilibradas (100 ejemplos por enfermedad). Dividimos el dataset en conjunto de entrenamiento y conjunto de prueba y, a su vez, dividimos el conjunto de entrenamiento en entrenamiento y validación, estratificando siempre por enfermedad:

In [5]:
# Two-stage stratified split: hold out 20% of the data for testing, then hold out
# 20% of what remains for validation. Both stages stratify on the 'Disease' column.
train_df, test_df = train_test_split(
    df_disease_sp,
    test_size=0.2,
    random_state=42,
    stratify=df_disease_sp['Disease'],
)
train_df, val_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df['Disease'],
)

# Report the class balance of every split.
for split_title, split_frame in (
    ("\nConjunto de entrenamiento:", train_df),
    ("\nConjunto de validación:", val_df),
    ("\nConjunto de prueba:", test_df),
):
    print(split_title)
    print(split_frame['Disease'].value_counts())


Conjunto de entrenamiento:
Disease
Thrombosis     64
Anemia         64
Thalassemia    64
Diabetes       64
Name: count, dtype: int64

Conjunto de validación:
Disease
Thrombosis     16
Thalassemia    16
Anemia         16
Diabetes       16
Name: count, dtype: int64

Conjunto de prueba:
Disease
Thalassemia    20
Diabetes       20
Anemia         20
Thrombosis     20
Name: count, dtype: int64


Codificamos las clases de 'Disease', utilizando LabelEncoder:

In [6]:
# The encoder is fitted on the training labels only and then reused for the other splits.
label_encoder = LabelEncoder()

# `assign` returns new frames instead of writing a column into the frames produced by
# `train_test_split`. This avoids pandas' SettingWithCopyWarning on those derived frames
# and keeps the cell safe to re-run (an existing 'label' column is simply replaced).
train_df = train_df.assign(label=label_encoder.fit_transform(train_df['Disease']))
val_df = val_df.assign(label=label_encoder.transform(val_df['Disease']))
test_df = test_df.assign(label=label_encoder.transform(test_df['Disease']))

train_df.head()

Unnamed: 0,Symptoms,Disease,label
281,"I’ve noticed my leg is swollen, and the veins ...",Thrombosis,3
22,"I’ve been getting infections a lot, and my ski...",Anemia,0
146,"I’ve been so weak lately, and my skin looks ye...",Thalassemia,2
70,I’ve noticed that the whites of my eyes have a...,Anemia,0
376,"My vision’s been blurry, I’m constantly thirst...",Diabetes,1


## Entrenamiento

Definamos la función de métricas:

In [7]:
def compute_metrics(p):
    """Score one evaluation pass.

    Args:
        p: Object exposing `predictions` (scores indexed as [example, class]) and
            `label_ids` (the reference class indices).

    Returns:
        dict with "accuracy" plus macro-averaged "precision", "recall" and "f1".
    """
    y_true = p.label_ids
    # The predicted class is the highest-scoring column of each row.
    y_pred = np.argmax(p.predictions, axis=1)

    metrics = {"accuracy": accuracy_score(y_true, y_pred)}
    for metric_name, scorer in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        metrics[metric_name] = scorer(y_true, y_pred, average='macro')
    return metrics

Definimos el modelo y el tokenizador. En este caso, utilizaremos el modelo preentrenado de Hugging Face `distilbert-base-uncased` (DistilBERT):

In [8]:
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Derive the class count and the id <-> name mappings from the fitted label encoder
# instead of hard-coding 4. This keeps the classification head in sync with the data
# and stores the disease names in the model config.
id2label = {index: str(name) for index, name in enumerate(label_encoder.classes_)}
label2id = {name: index for index, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Preprocess the data
def preprocess_function(examples):
    """Tokenize the 'Symptoms' texts of a batch of examples.

    Intended for `Dataset.map(..., batched=True)`.

    Only truncation is applied here. `return_tensors="pt"` was dropped because the
    tokenized output is stored back into the dataset, so building tensors first is
    wasted work. `padding=True` was dropped because it padded every text to the
    longest one in the whole map batch; padding is left to batch-collation time.
    NOTE(review): this relies on the Trainer being given the tokenizer, as it is in
    this notebook, so that its default collator pads each batch — confirm if the
    Trainer setup changes.

    Args:
        examples: Batch mapping that contains a 'Symptoms' list of strings.

    Returns:
        The tokenizer output for the batch (unpadded, truncated).
    """
    return tokenizer(examples['Symptoms'], truncation=True)

Convertimos los dataframes en datasets de Hugging Face y los tokenizamos:

In [10]:
# Wrap each pandas split in a Hugging Face Dataset (train, test, validation order).
train_dataset, test_dataset, validation_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (train_df, test_df, val_df)
)

# Tokenize every split in batches, in the same order.
train_preprocessed_dataset, test_preprocessed_dataset, val_preprocessed_dataset = (
    split_dataset.map(preprocess_function, batched=True)
    for split_dataset in (train_dataset, test_dataset, validation_dataset)
)

Map:   0%|          | 0/256 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

In [11]:
# NOTE(review): `eval_strategy` and `processing_class` replace the deprecated
# `evaluation_strategy` and `tokenizer` keyword arguments. They need a recent
# transformers release — confirm the pinned version supports both names.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",            # evaluate on the validation split after every epoch
    learning_rate=2e-5,               # learning rate: 5e-6, 1e-5, 5e-5 and 2e-5 were tried
    per_device_train_batch_size=16,   # training batch size: 8, 16, 32 and 64 were tried
    per_device_eval_batch_size=16,    # evaluation batch size
    num_train_epochs=7,
    weight_decay=0.2,                 # weight decay: 0.01, 0.03, 0.1, 0.2, 0.3 and 0.4 were tried
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_preprocessed_dataset,
    eval_dataset=val_preprocessed_dataset,
    processing_class=tokenizer,       # replaces the deprecated `tokenizer=` argument
    compute_metrics=compute_metrics,  # function used to compute the evaluation metrics
)

  trainer = Trainer(


## Training

Entrenamos el modelo:

In [12]:
# Time the whole fine-tuning run and report it in whole minutes.
start = time()
trainer.train()
end = time()

elapsed_minutes = (end - start) / 60
print(f">>>>>>>>>>>>> elapsed time: {elapsed_minutes:.0f}m")

  0%|          | 0/112 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 1.2458349466323853, 'eval_accuracy': 0.75, 'eval_precision': 0.8128582202111614, 'eval_recall': 0.75, 'eval_f1': 0.734778662198017, 'eval_runtime': 0.9042, 'eval_samples_per_second': 70.783, 'eval_steps_per_second': 4.424, 'epoch': 1.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.9477478265762329, 'eval_accuracy': 0.765625, 'eval_precision': 0.7991071428571428, 'eval_recall': 0.765625, 'eval_f1': 0.7502705627705628, 'eval_runtime': 0.9466, 'eval_samples_per_second': 67.607, 'eval_steps_per_second': 4.225, 'epoch': 2.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.682657778263092, 'eval_accuracy': 0.953125, 'eval_precision': 0.9605263157894737, 'eval_recall': 0.953125, 'eval_f1': 0.9527093596059113, 'eval_runtime': 0.9709, 'eval_samples_per_second': 65.92, 'eval_steps_per_second': 4.12, 'epoch': 3.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.5084469318389893, 'eval_accuracy': 0.921875, 'eval_precision': 0.9404761904761905, 'eval_recall': 0.921875, 'eval_f1': 0.91991991991992, 'eval_runtime': 0.8657, 'eval_samples_per_second': 73.927, 'eval_steps_per_second': 4.62, 'epoch': 4.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.40697821974754333, 'eval_accuracy': 0.953125, 'eval_precision': 0.9605263157894737, 'eval_recall': 0.953125, 'eval_f1': 0.9527093596059113, 'eval_runtime': 0.8461, 'eval_samples_per_second': 75.64, 'eval_steps_per_second': 4.727, 'epoch': 5.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.3495428264141083, 'eval_accuracy': 0.953125, 'eval_precision': 0.9605263157894737, 'eval_recall': 0.953125, 'eval_f1': 0.9527093596059113, 'eval_runtime': 0.9089, 'eval_samples_per_second': 70.417, 'eval_steps_per_second': 4.401, 'epoch': 6.0}


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.3341251015663147, 'eval_accuracy': 0.953125, 'eval_precision': 0.9605263157894737, 'eval_recall': 0.953125, 'eval_f1': 0.9527093596059113, 'eval_runtime': 1.0671, 'eval_samples_per_second': 59.977, 'eval_steps_per_second': 3.749, 'epoch': 7.0}
{'train_runtime': 122.919, 'train_samples_per_second': 14.579, 'train_steps_per_second': 0.911, 'train_loss': 0.7061257362365723, 'epoch': 7.0}
>>>>>>>>>>>>> elapsed time: 2m


## Evaluación

Evaluamos el modelo en el conjunto de prueba:

In [13]:
results = trainer.evaluate(test_preprocessed_dataset)

# Print the evaluation metrics, one per line, with four decimals.
print("Resultados de la evaluación en el conjunto de prueba:")
for metric_label, metric_key in (
    ("Accuracy", "eval_accuracy"),
    ("Precision", "eval_precision"),
    ("Recall", "eval_recall"),
    ("F1-Score", "eval_f1"),
):
    print(f"{metric_label}: {results[metric_key]:.4f}")

  0%|          | 0/5 [00:00<?, ?it/s]

Resultados de la evaluación en el conjunto de prueba:
Accuracy: 0.9750
Precision: 0.9773
Recall: 0.9750
F1-Score: 0.9749


Tal y como observamos, se obtiene una accuracy en validación de 0.9531 y en test de 0.975. Al ser ambas similares, no hay indicios de overfitting. Tampoco se alcanza una accuracy de 1: esto se debe a que las enfermedades que el modelo clasifica comparten síntomas entre ellas (o entre algunas de ellas), lo que dificulta distinguirlas basándose simplemente en los síntomas.

# Guardado del modelo

Guardamos el modelo y el tokenizer:

In [15]:
# The save step used to be disabled by wrapping it in a string literal, so a fresh
# "Restart & Run All" never wrote the model that a later cell loads from this same folder.
# Use an explicit switch instead: set SAVE_MODEL to False to keep a previously saved model.
SAVE_MODEL = True

save_directory = os.path.join(MODELS_LOCATION, 'disease_classification_english_nlp')
if SAVE_MODEL:
    model.save_pretrained(save_directory)
    tokenizer.save_pretrained(save_directory)
    print(f"Modelo y tokenizer guardados en el directorio {save_directory}")

Modelo y tokenizer guardados en el directorio c:\Users\maria\Desktop\universidad\master\TFM\tfm\src\models\disease_classification_english_nlp


Definamos algunas frases de ejemplo para el test, que puedan confundir a nuestro modelo:

In [15]:
# Hand-written (symptom description, expected disease) pairs.
_sample_pairs = (
    ("My vision has been blurry lately, and I feel so drained.", "Diabetes"),
    ("I’ve been feeling unusually weak and cold all the time.", "Anemia"),
    ("There’s a strange heaviness in my leg, and it’s very swollen.", "Thrombosis"),
    ("My skin looks yellowish, and I often feel short of breath.", "Thalassemia"),
    ("I get out of breath even when doing small tasks, and my heart feels like it’s racing.", "Anemia"),
    ("I’ve noticed a lot of pain in my calf, and it seems warmer than usual.", "Thrombosis"),
)
sample_sentences = [
    {"Symptoms": symptoms, "Disease": disease} for symptoms, disease in _sample_pairs
]

# Turn the example sentences into a dataframe
sample_df = pd.DataFrame(sample_sentences)

# Tokenize the example sentences
sample_dataset = Dataset.from_pandas(sample_df)
sample_preprocessed_dataset = sample_dataset.map(preprocess_function, batched=True)

# Predict, then map the highest-scoring class index back to its disease name
sample_predictions = trainer.predict(sample_preprocessed_dataset)
sample_preds = np.argmax(sample_predictions.predictions, axis=1)
sample_df["Predicted disease"] = label_encoder.inverse_transform(sample_preds)

print("Resultados de las predicciones en las frases de ejemplo:")
sample_df

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Resultados de las predicciones en las frases de ejemplo:


Unnamed: 0,Symptoms,Disease,Predicted disease
0,"My vision has been blurry lately, and I feel s...",Diabetes,Diabetes
1,I’ve been feeling unusually weak and cold all ...,Anemia,Anemia
2,"There’s a strange heaviness in my leg, and it’...",Thrombosis,Thrombosis
3,"My skin looks yellowish, and I often feel shor...",Thalassemia,Diabetes
4,I get out of breath even when doing small task...,Anemia,Anemia
5,"I’ve noticed a lot of pain in my calf, and it ...",Thrombosis,Thrombosis


# Test para comprobar que el modelo carga correctamente

In [17]:
model_directory = os.path.join(MODELS_LOCATION, 'disease_classification_english_nlp')
# Load the saved tokenizer and model from disk only (no download attempt).
# They get their own names so the global `tokenizer` that `preprocess_function`
# reads is not silently replaced by this cell.
tokenizer_loaded = AutoTokenizer.from_pretrained(model_directory, local_files_only=True)
model_loaded = AutoModelForSequenceClassification.from_pretrained(model_directory, local_files_only=True)
def classify_disease_symptoms(input_text: str) -> str:
    """Classify one symptom description with the model reloaded from disk.

    Args:
        input_text: Free-text description of the symptoms.

    Returns:
        One of 'Anemia', 'Diabetes', 'Thalassemia' or 'Thrombosis'.
    """
    # Class index -> disease name. The indices must match the label encoding used
    # when the model was trained.
    prediction_mapping = {0: 'Anemia',
                          1: 'Diabetes',
                          2: 'Thalassemia',
                          3: 'Thrombosis'}
    # Tokenize the text
    inputs = tokenizer_loaded(input_text, padding=True, truncation=True, return_tensors="pt")
    # Run the forward pass without tracking gradients
    with torch.no_grad():
        outputs = model_loaded(**inputs)
        # `.item()` requires exactly one prediction, i.e. a single input text
        prediction = torch.argmax(outputs.logits, dim=1).item()
    return prediction_mapping[prediction]

# Add a column with the predictions
def add_predictions(df, classifier=None):
    """Return a copy of `df` extended with a 'Predicted Disease' column.

    The input frame is no longer modified in place, so a frame passed in from
    elsewhere in the notebook does not gain a column as a hidden side effect.

    Args:
        df: DataFrame with a 'Symptoms' column holding one text per row.
        classifier: Optional callable mapping one symptom text to a label.
            Defaults to `classify_disease_symptoms`.

    Returns:
        A new DataFrame with every column of `df` plus 'Predicted Disease'.
    """
    if classifier is None:
        classifier = classify_disease_symptoms
    return df.assign(**{'Predicted Disease': df['Symptoms'].apply(classifier)})

# Run the reloaded model over the test split
predicciones = add_predictions(test_df)

# Reference labels and predicted labels
true_values, predicted_values = (
    predicciones[column] for column in ('Disease', 'Predicted Disease')
)

# Fraction of test examples whose predicted disease matches the reference
accuracy = accuracy_score(true_values, predicted_values)

# Show the result
print(f"La accuracy del modelo es: {accuracy:.2f}")


La accuracy del modelo es: 0.97


Se observa que el modelo se ha cargado correctamente y que su rendimiento no se ha alterado.