In [1]:
!pip install transformers datasets evaluate sacrebleu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import datasets
from datasets import load_dataset
### Load CSV here

#dataset = load_dataset("csv", data_files={"train": '/content/full_wiki_train.csv', 
                                          #"test": '/content/full_wiki_test.csv'})
                                          
dataset = load_dataset("csv", data_files="full_wiki.csv")
dataset = dataset["train"].map(lambda ex, i: {"id": i, "translation": dict(ex)}, remove_columns=["en", "simple"], features=datasets.Features({"id": datasets.Value("string"), "translation": datasets.Translation(languages=["en", "simple"])}), with_indices=True,)
dataset = dataset.train_test_split(test_size=0.2)
#### Train_test_split CSV here

#sample: wiki_terms = wiki_terms["name_of_dataframe"].train_test_split(test_size=0.2) 





  0%|          | 0/1 [00:00<?, ?it/s]



In [3]:
from transformers import AutoTokenizer

checkpoint = "t5-small"
#model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [4]:
source_lang = "en"
target_lang = "simple"
prefix = "translate English to Simple English: "


def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=32, truncation=True)
    return model_inputs

In [5]:
## Apply preprocessing to entire function

tokenized_terms = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

In [6]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [7]:
!pip install jiwer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
import evaluate

metric = evaluate.load("sacrebleu")
#metric = evaluate.load("wer")
#metric = evaluate.load("cer")



In [9]:
## Functions to predict BLEU score

import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [10]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [11]:
## Trains model

training_args = Seq2SeqTrainingArguments(
    output_dir="zach_and_nicks_medical_text_simplifier",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=30,
    predict_with_generate=True,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_terms["train"],
    eval_dataset=tokenized_terms["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,3.316295,0.7367,19.0
2,No log,3.256357,2.1681,18.9524
3,No log,3.20267,2.53,18.9524
4,No log,3.158808,3.5731,18.9524
5,No log,3.1241,4.5659,18.9524
6,No log,3.09744,6.4076,18.9524
7,No log,3.075872,6.3626,18.8095
8,No log,3.057129,6.3354,18.8095
9,No log,3.042118,6.3354,18.8095
10,No log,3.029247,6.3354,18.8095


TrainOutput(global_step=150, training_loss=2.9572127278645834, metrics={'train_runtime': 42.5847, 'train_samples_per_second': 56.358, 'train_steps_per_second': 3.522, 'total_flos': 20301270220800.0, 'train_loss': 2.9572127278645834, 'epoch': 30.0})

In [12]:
trainer.save_model("Trained_model_1")


In [13]:
#trainer.push_to_hub()

In [19]:
## Inference stage

# from transformers import AutoConfig
# config = AutoConfig.from_pretrained('t5-small')


text = "translate English to Simple English: A 39-year-old man was hospitalized due to an increasingly reduced general health condition, after persistent fever and dry cough for 2 weeks. The patient initially needed 4 L/min of oxygen, had a rapid and shallow breathing pattern at rest and became severely breathless during minor physical activities. In the beginning, physical therapy focused on patient education about dyspnea-relieving positions, the importance of regular mobilization, and deep-breathing exercises. However, it quickly became evident that his anxiety from fear of dying and worries about his future aggravated his dyspnea and vice versa. The patient was so dyspneic, anxious, and weak that he was barely able to walk to the toilet. To counter this vicious circle, the physical therapist actively listened to the patient, explained why he was experiencing breathlessness, and tested suitable positions to relieve his dyspnea. He seemed to benefit from the education and the relaxing breathing exercises, as seen on day 2, when his respiratory rate could be reduced from 30 breaths/min to 22 breaths/min and his oxygen saturation increased from 92% to 96% on 4 L/min oxygen after guiding him through some deep-breathing exercises. Over the next days, his dyspnea and anxiety started to alleviate and he regained his self-confidence. Therapy was progressively shifted to walking and strength training and the patient rapidly advanced to walk 350 m without a walking aid or supplemental oxygen before his discharge home."

## Can then use a pipline() like below:

from transformers import pipeline

translator = pipeline("translation", model="Trained_model_1", max_length=200)
translator(text)



Your input_length: 338 is bigger than 0.9 * max_length: 200. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


[{'translation_text': 'A 39-year-old man was hospitalized due to an increasingly reduced general health condition, after persistent fever and dry cough for 2 weeks. The patient had a rapid and shallow breathing pattern at rest and became severely breathless during minor physical activities. In the beginning, physical therapy focused on patient education about dyspnea-relieving positions, the importance of regular mobilization, and deep-breathing exercises.'}]

In [15]:
text = "translate English to Simple English: This 52-year-old male tested COVID-19 positive 4 days after the beginning of a dry cough, fever, and head and limb pain. One day later, he was hospitalized with exertional dyspnea. He was diagnosed with pneumonia that developed into moderate ARDS needing mechanical ventilation and intermittent dialysis. After extubation, oxygenation was stable with 2 to 3 L/min of oxygen. However, the patient was disoriented and could not communicate verbally. His global weakness (CPAx 11/50) was accompanied by oral and pharyngeal weakness and paresthesia. Spontaneous swallowing frequency and tongue control were severely reduced, and the patient showed insufficient protection from aspiration. This was confirmed by a specialized physical therapist with the Gugging Swallowing Screen, which confirmed severe dysphagia with 2/20 points. He was treated nil by mouth and received dysphagia therapy such as intensive oral stimulation, facilitation of swallowing, and training of protection mechanisms. After initial agitation and disorientation, the patient started to communicate in single-word phrases, but dysphagia continued to be severe with massive oral and pharyngeal dry saliva residuals that compromised his paresthesia and required regular mouth care. Over the next days, the patient managed to swallow pure\u00e9d food and mildly thick fluids under supervision, although cough strength was still weak (Gugging Swallowing Screen 13/20, CPAx 30/50). Nevertheless, he continued to progress and became capable of independent food ingestion (Gugging Swallowing Screen 20/20, CPAx 39/50) before his discharge to a rehabilitation clinic 25 days after admission."

## Can then use a pipline() like below:

from transformers import pipeline

translator = pipeline("translation", model="Trained_model_1", max_length=400)
translator(text)


Your input_length: 385 is bigger than 0.9 * max_length: 400. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


[{'translation_text': 'This 52-year-old male tested COVID-19 positive 4 days after the beginning of a dry cough, fever, and head and limb pain. One day later, he was hospitalized with exertional dyspnea. He was diagnosed with pneumonia that developed into moderate ARDS needing mechanical ventilation and intermittent dialysis. After extubation, oxygenation was stable with 2 to 3 L/min of oxygen. However, the patient was disoriented and could not communicate verbally. His global weakness (CPAx'}]