### Weakly supervised finetuning

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('pseudo_labeled_summaries.csv')

# Drop rows with missing combined_text or generated_summary
df = df.dropna(subset=['combined_text', 'generated_summary', 'subject_id'])

df_sample = df.sample(n=10000, random_state=5230)

# Train-test split
train_df, test_df = train_test_split(df_sample, test_size=0.2, random_state=42)

train_df.to_csv('t5_train.csv', index=False)
test_df.to_csv('t5_test.csv', index=False)

In [None]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import torch

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

# Load train dataset
train_dataset = Dataset.from_pandas(train_df)

# Preprocessing function
def preprocess(example):
    input_text = "summarize: " + example['combined_text']
    target_text = example['generated_summary']
    inputs = tokenizer(input_text, max_length=512, padding="max_length", truncation=True)
    targets = tokenizer(target_text, max_length=128, padding="max_length", truncation=True)
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_train = train_dataset.map(preprocess, remove_columns=train_dataset.column_names)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Training arguments
training_args = TrainingArguments(
    output_dir="./t5_small_finetuned",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="no",
    logging_dir="./logs",
    logging_steps=100,
    fp16=torch.cuda.is_available(),
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train and save
trainer.train()
model.save_pretrained("./t5_small_finetuned")
tokenizer.save_pretrained("./t5_small_finetuned")




Map:   0%|          | 0/800 [00:00<?, ? examples/s]


KeyError: 'generated_summary'

In [None]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,  Trainer, TrainingArguments, DataCollatorForSeq2Seq
import torch

# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("./t5_small_finetuned")
model = AutoModelForSeq2SeqLM.from_pretrained("./t5_small_finetuned")
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load and convert the test data
test_df = pd.read_csv("t5_test.csv")
test_dataset = Dataset.from_pandas(test_df)

# Generate summaries
def generate_summary(example):
    input_text = "summarize: " + example['combined_text']
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        summary_ids = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return {"generated_t5_summary": summary}

# Apply the function and save the output
generated = test_dataset.map(generate_summary)
generated_df = pd.DataFrame(generated)
generated_df.columns
# generated_df.to_csv("t5_finetuned_summary.csv", index=False)

Map: 100%|██████████| 2000/2000 [21:19<00:00,  1.56 examples/s]


Index(['note_id', 'subject_id', 'hadm_id', 'note_type', 'note_seq',
       'charttime', 'storetime', 'Examination', 'Indication', 'Technique',
       'Comparison', 'Findings', 'Impression', 'tokenized_text', 'Gender',
       'combined_text_clean', 'combined_text', 'generated_summary',
       'generated_t5_summary'],
      dtype='object')

In [17]:
from textstat import flesch_kincaid_grade, dale_chall_readability_score

# Ensure the correct DataFrame and column are used
test_df = pd.read_csv("t5_finetuned_summary.csv")
test_df = test_df.dropna(subset=["generated_t5_summary"])

# Calculate readability scores
fk_scores = test_df["generated_summary"].apply(flesch_kincaid_grade)

# Print average scores
print("Average Flesch-Kincaid Grade Level:", fk_scores.mean())

Average Flesch-Kincaid Grade Level: 10.745


In [5]:
import evaluate
import pandas as pd

# Load the TF-IDF summary dataframe
df = pd.read_csv("t5_finetuned_summary.csv")

# Load the true labels dataframe
true_labels = pd.read_csv("1000 labels.csv")

# Merge on subject_id to align the tfidf_summary with true labels
merged = df.merge(true_labels[["subject_id", "labels"]], on="subject_id", how="inner")

# Initialize ROUGE scorer
rouge = evaluate.load("rouge")

# Compute ROUGE scores
results = rouge.compute(
    predictions=merged["generated_t5_summary"].astype(str).tolist(),
    references=merged["labels"].astype(str).tolist()
)

# Display ROUGE scores
for k, v in results.items():
    print(f"{k}: {v:.4f}")


rouge1: 0.4080
rouge2: 0.2151
rougeL: 0.3206
rougeLsum: 0.3220


### Finetuning with generated labels

In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
! pip install datasets
from datasets import Dataset
from sklearn.metrics import f1_score

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
import pandas as pd

# Load data
df = pd.read_csv("1000 labels.csv")

# Train-test split
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=5230)

# Convert to HuggingFace datasets
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

print(train_df.shape, test_df.shape)


(800, 19) (200, 19)


In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import f1_score

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the manually labeled data
labels_df = pd.read_csv("1000 labels.csv")[['combined_text', 'labels']]

# Split into training and testing sets
train_df = labels_df[:800]
test_df = labels_df[800:]

# Convert to HuggingFace datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Batched preprocessing function
def preprocess_function(batch):
    input_texts = ["summarize: " + str(text) for text in batch["combined_text"]]
    target_texts = [str(label) for label in batch["labels"]]

    inputs = tokenizer(input_texts, max_length=512, truncation=True, padding='max_length')
    targets = tokenizer(target_texts, max_length=128, truncation=True, padding='max_length')

    inputs["labels"] = targets["input_ids"]
    return inputs

# Tokenize datasets
train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=["combined_text", "labels"])
test_dataset = test_dataset.map(preprocess_function, batched=True, remove_columns=["combined_text", "labels"])

# Define training arguments
training_args = TrainingArguments(
    output_dir="./t5_manual_finetuned",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    eval_strategy='epoch',  # Use 'eval_strategy' not 'evaluation_strategy'
    save_strategy='epoch',
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="loss"
)

# Dummy metrics (T5 outputs sequences, not class labels, so f1 isn't directly meaningful unless post-processed)
def compute_metrics(p):
    return {}

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33me1349639[0m ([33me1349639-national-university-of-singapore-students-union[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,2.7116,0.565392
2,0.5664,0.486743
3,0.4898,0.435095
4,0.4492,0.410589
5,0.434,0.408219


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=500, training_loss=0.9301867065429688, metrics={'train_runtime': 265.5149, 'train_samples_per_second': 15.065, 'train_steps_per_second': 1.883, 'total_flos': 541367205888000.0, 'train_loss': 0.9301867065429688, 'epoch': 5.0})

In [3]:
model.save_pretrained("./SMALL FINETUNED")
tokenizer.save_pretrained("./SMALL FINETUNED")

('./SMALL FINETUNED/tokenizer_config.json',
 './SMALL FINETUNED/special_tokens_map.json',
 './SMALL FINETUNED/spiece.model',
 './SMALL FINETUNED/added_tokens.json',
 './SMALL FINETUNED/tokenizer.json')

In [4]:
# First, reload the original test_df for reference (not tokenized)
test_df = labels_df[800:].reset_index(drop=True)

# Move model to evaluation mode and device
model.eval()
model.to(device)

# Prepare input texts
input_texts = ["summarize: " + str(text) for text in test_df["combined_text"]]

# Tokenize input texts
inputs = tokenizer(
    input_texts,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512
)

# Move tensors to GPU if available
inputs = {k: v.to(device) for k, v in inputs.items()}

# Generate summaries
with torch.no_grad():
    summaries = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=128,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True
    )

# Decode summaries
decoded_summaries = tokenizer.batch_decode(summaries, skip_special_tokens=True)

# Add summaries to dataframe
test_df["generated_t5_summary"] = decoded_summaries

# Save to CSV
test_df.to_csv("t5_finetuned_test_summaries.csv", index=False)

# Preview result
print(test_df[["combined_text", "labels", "generated_t5_summary"]].head())

                                       combined_text  \
0  Helical MDCT images were acquired of the abdom...   
1  Sagittal pre- and post-gadolinium T1, T2, STIR...   
2  male with markedly elevated leukocyte count (4...   
3  ART  year old woman with sp SMA stent via righ...   
4  woman status post fall with left arm pain.  No...   

                                              labels  \
0  CT abdomen and pelvis revealed bibasilar atele...   
1  MRI shows grade II anterolisthesis of L5 on S1...   
2  A male with leukocytosis (41K) underwent contr...   
3  An ART patient with prior SMA stenting via rig...   
4  Female patient post-fall presented with left a...   

                                generated_t5_summary  
0  LUNG BASES: Probable gallstone lodged in the g...  
1  Sagittal views demonstrate grade II anterolist...  
2  A male with markedly elevated leukocyte count ...  
3  Occlusion of the common femoral artery with re...  
4                                                  

In [16]:
from textstat import flesch_kincaid_grade, dale_chall_readability_score

# Ensure the correct DataFrame and column are used
test_df = pd.read_csv("finetuned_labelled_t5.csv")
test_df = test_df.dropna(subset=["generated_t5_summary"])

# Calculate readability scores
fk_scores = test_df["generated_t5_summary"].apply(flesch_kincaid_grade)

# Print average scores
print("Average Flesch-Kincaid Grade Level:", fk_scores.mean())

Average Flesch-Kincaid Grade Level: 11.126785714285713


In [18]:
rouge = evaluate.load("rouge")

# Compute ROUGE scores
results = rouge.compute(
    predictions=df["generated_t5_summary"].astype(str).tolist(),
    references=df["labels"].astype(str).tolist()
)

# Display ROUGE scores
for k, v in results.items():
    print(f"{k}: {v:.4f}")

rouge1: 0.4792
rouge2: 0.2632
rougeL: 0.4177
rougeLsum: 0.4175
