In [80]:
# !pip install torch torchtext torchvision transformers sentencepiece pandas tqdm datasets

In [81]:
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd
import ast
import datasets
from tqdm import tqdm
import time

In [82]:
# Load data
data_sample = load_dataset("QuyenAnhDE/Diseases_Symptoms")

Repo card metadata block was not found. Setting CardData to empty.


In [83]:
data_sample

DatasetDict({
    train: Dataset({
        features: ['Code', 'Name', 'Symptoms', 'Treatments'],
        num_rows: 400
    })
})

In [84]:
# Convert to a pandas dataframe
updated_data = [{'Name': item['Name'], 'Symptoms': item['Symptoms']} for item in data_sample['train']]
df = pd.DataFrame(updated_data)
df.head()

Unnamed: 0,Name,Symptoms
0,Panic disorder,"Palpitations, Sweating, Trembling, Shortness o..."
1,Vocal cord polyp,"Hoarseness, Vocal Changes, Vocal Fatigue"
2,Turner syndrome,"Short stature, Gonadal dysgenesis, Webbed neck..."
3,Cryptorchidism,"Absence or undescended testicle(s), empty scro..."
4,Ethylene glycol poisoning-1,"Nausea, vomiting, abdominal pain, General mala..."


In [85]:
# Extract the symptoms
df['Symptoms'] = df['Symptoms'].apply(lambda x: ', '.join(x.split(', ')))

In [86]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split

In [87]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [88]:
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
model = GPT2LMHeadModel.from_pretrained('distilgpt2').to(device)

In [89]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [90]:
BATCH_SIZE = 8

In [91]:
df.describe()

Unnamed: 0,Name,Symptoms
count,400,400
unique,392,395
top,Sciatica,"Swelling, pain, dry mouth, bad taste"
freq,3,3


In [92]:
# Dataset Prep
class LanguageDataset(Dataset):
    """
    An extension of the Dataset object to:
      - Make training loop cleaner
      - Make ingestion easier from pandas df's
    """
    def __init__(self, df, tokenizer):
        self.labels = df.columns
        self.data = df.to_dict(orient='records')
        self.tokenizer = tokenizer
        x = self.fittest_max_length(df)  # Fix here
        self.max_length = x

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data[idx][self.labels[0]]
        y = self.data[idx][self.labels[1]]
        text = f"{x} | {y}"
        tokens = self.tokenizer.encode_plus(text, return_tensors='pt', max_length=128, padding='max_length', truncation=True)
        return tokens

    def fittest_max_length(self, df):  # Fix here
        """
        Smallest power of two larger than the longest term in the data set.
        Important to set up max length to speed training time.
        """
        max_length = max(len(max(df[self.labels[0]], key=len)), len(max(df[self.labels[1]], key=len)))
        x = 2
        while x < max_length: x = x * 2
        return x

# Cast the Huggingface data set as a LanguageDataset we defined above
data_sample = LanguageDataset(df, tokenizer)

In [93]:
data_sample = LanguageDataset(df, tokenizer)
data_sample

<__main__.LanguageDataset at 0x7e120c9b1b50>

In [94]:
train_size = int(0.8 * len(data_sample))
val_size = len(data_sample) - train_size

train_data, val_data = random_split(data_sample, [train_size, val_size])

In [95]:
train_loader = DataLoader(train_data, batch_size = BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_data, batch_size = BATCH_SIZE)

In [96]:
num_epochs = 8

In [97]:
batch_size = BATCH_SIZE
model_name = 'distilgpt2'
gpu = 0

In [98]:
criterion = nn.CrossEntropyLoss(ignore_index = tokenizer.pad_token_id)
optimizer = optim.Adam(params = model.parameters(), lr=5e-4)
tokenizer.pad_token = tokenizer.eos_token

In [99]:
results = pd.DataFrame(columns=['epochs', 'transformer', 'batch_size', 'gpu', 'training_loss', 'validation_loss', 'epoch_duration_sec'])

In [100]:
# The training loop
for epoch in range(num_epochs):
    start_time = time.time()  # Start the timer for the epoch

    # Training
    ## This line tells the model we're in 'learning mode'
    model.train()
    epoch_training_loss = 0
    train_iterator = tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs} Batch Size: {batch_size}, Transformer: {model_name}")
    for batch in train_iterator:
        optimizer.zero_grad()
        inputs = batch['input_ids'].squeeze(1).to(device)
        targets = inputs.clone()
        outputs = model(input_ids=inputs, labels=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_iterator.set_postfix({'Training Loss': loss.item()})
        epoch_training_loss += loss.item()
    avg_epoch_training_loss = epoch_training_loss / len(train_iterator)

    # Validation
    ## This line below tells the model to 'stop learning'
    model.eval()
    epoch_validation_loss = 0
    total_loss = 0
    valid_iterator = tqdm(val_loader, desc=f"Validation Epoch {epoch+1}/{num_epochs}")
    with torch.no_grad():
        for batch in valid_iterator:
            inputs = batch['input_ids'].squeeze(1).to(device)
            targets = inputs.clone()
            outputs = model(input_ids=inputs, labels=targets)
            loss = outputs.loss
            total_loss += loss
            valid_iterator.set_postfix({'Validation Loss': loss.item()})
            epoch_validation_loss += loss.item()

    avg_epoch_validation_loss = epoch_validation_loss / len(val_loader)

    end_time = time.time()  # End the timer for the epoch
    epoch_duration_sec = end_time - start_time  # Calculate the duration in seconds

    new_row = {'transformer': model_name,
               'batch_size': batch_size,
               'gpu': gpu,
               'epoch': epoch+1,
               'training_loss': avg_epoch_training_loss,
               'validation_loss': avg_epoch_validation_loss,
               'epoch_duration_sec': epoch_duration_sec}  # Add epoch_duration to the dataframe

    results.loc[len(results)] = new_row
    print(f"Epoch: {epoch+1}, Validation Loss: {total_loss/len(val_loader)}")

Training Epoch 1/8 Batch Size: 8, Transformer: distilgpt2: 100%|██████████| 40/40 [00:08<00:00,  4.65it/s, Training Loss=0.63]
Validation Epoch 1/8: 100%|██████████| 10/10 [00:00<00:00, 16.54it/s, Validation Loss=0.546]


Epoch: 1, Validation Loss: 0.6696531772613525


Training Epoch 2/8 Batch Size: 8, Transformer: distilgpt2: 100%|██████████| 40/40 [00:08<00:00,  4.79it/s, Training Loss=0.666]
Validation Epoch 2/8: 100%|██████████| 10/10 [00:00<00:00, 16.38it/s, Validation Loss=0.522]


Epoch: 2, Validation Loss: 0.6301171183586121


Training Epoch 3/8 Batch Size: 8, Transformer: distilgpt2: 100%|██████████| 40/40 [00:08<00:00,  4.85it/s, Training Loss=0.363]
Validation Epoch 3/8: 100%|██████████| 10/10 [00:00<00:00, 16.96it/s, Validation Loss=0.536]


Epoch: 3, Validation Loss: 0.6616062521934509


Training Epoch 4/8 Batch Size: 8, Transformer: distilgpt2: 100%|██████████| 40/40 [00:08<00:00,  4.90it/s, Training Loss=0.36]
Validation Epoch 4/8: 100%|██████████| 10/10 [00:00<00:00, 17.07it/s, Validation Loss=0.596]


Epoch: 4, Validation Loss: 0.7047935128211975


Training Epoch 5/8 Batch Size: 8, Transformer: distilgpt2: 100%|██████████| 40/40 [00:08<00:00,  4.94it/s, Training Loss=0.296]
Validation Epoch 5/8: 100%|██████████| 10/10 [00:00<00:00, 16.80it/s, Validation Loss=0.569]


Epoch: 5, Validation Loss: 0.7216604948043823


Training Epoch 6/8 Batch Size: 8, Transformer: distilgpt2: 100%|██████████| 40/40 [00:08<00:00,  4.95it/s, Training Loss=0.302]
Validation Epoch 6/8: 100%|██████████| 10/10 [00:00<00:00, 17.15it/s, Validation Loss=0.684]


Epoch: 6, Validation Loss: 0.7995710968971252


Training Epoch 7/8 Batch Size: 8, Transformer: distilgpt2: 100%|██████████| 40/40 [00:08<00:00,  4.93it/s, Training Loss=0.262]
Validation Epoch 7/8: 100%|██████████| 10/10 [00:00<00:00, 16.93it/s, Validation Loss=0.653]


Epoch: 7, Validation Loss: 0.8136942982673645


Training Epoch 8/8 Batch Size: 8, Transformer: distilgpt2: 100%|██████████| 40/40 [00:08<00:00,  4.93it/s, Training Loss=0.18]
Validation Epoch 8/8: 100%|██████████| 10/10 [00:00<00:00, 16.62it/s, Validation Loss=0.72]

Epoch: 8, Validation Loss: 0.8820013403892517





In [112]:
input_string = "Lung Cancer"
input_string

'Lung Cancer'

In [113]:
input_ids = tokenizer.encode(input_string, return_tensors='pt').to(device)
input_ids

tensor([[   43,  2150, 15523]], device='cuda:0')

In [114]:
output = model.generate(
    input_ids,             # Tokenized input sequence to start generation
    max_length=20,         # Maximum length of generated text (including input)
    num_return_sequences=1,# Number of sequences to generate
    do_sample=True,        # Enables sampling instead of greedy decoding
    top_k=8,               # Considers only the top 8 most probable tokens
    top_p=0.95,            # Nucleus sampling: selects tokens covering 95% probability mass
    temperature=0.5,       # Controls randomness (lower = more deterministic, higher = more diverse)
    repetition_penalty=1.2 # Penalizes repetition to avoid duplicate words/phrases
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [115]:
output

tensor([[   43,  2150, 15523,   930, 12301, 15212,    11, 10453,   393, 14005,
          4168,    11, 10792, 16079, 50256]], device='cuda:0')

In [116]:
output[0]

tensor([   43,  2150, 15523,   930, 12301, 15212,    11, 10453,   393, 14005,
         4168,    11, 10792, 16079, 50256], device='cuda:0')

In [117]:
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)

Lung Cancer | Fatigue, weakness or pale skin, frequent infections


In [121]:
torch.save(model, '/content/drive/MyDrive/Colab Notebooks/LLms/small-disease-lm.pt')