In [68]:
!pip install torch torchtext transformers sentencepiece pandas tqdm datasets

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [69]:
from datasets import load_dataset,DatasetDict,Dataset
import pandas as pd
import ast
import datasets
from tqdm import tqdm
import time

In [70]:
data_sample=load_dataset("QuyenAnhDE/Diseases_Symptoms")

Repo card metadata block was not found. Setting CardData to empty.


In [71]:
data_sample

DatasetDict({
    train: Dataset({
        features: ['Code', 'Name', 'Symptoms', 'Treatments'],
        num_rows: 400
    })
})

In [72]:
updated_data=[{'Names':item['Name'],'Symptoms':item['Symptoms']} for item in data_sample['train']]

In [73]:
df=pd.DataFrame(updated_data)

In [74]:
df['Symptoms']=df['Symptoms'].apply(lambda x:','.join(x.split(',')))

In [75]:
df.head(5)

Unnamed: 0,Names,Symptoms
0,Panic disorder,"Palpitations, Sweating, Trembling, Shortness o..."
1,Vocal cord polyp,"Hoarseness, Vocal Changes, Vocal Fatigue"
2,Turner syndrome,"Short stature, Gonadal dysgenesis, Webbed neck..."
3,Cryptorchidism,"Absence or undescended testicle(s), empty scro..."
4,Ethylene glycol poisoning-1,"Nausea, vomiting, abdominal pain, General mala..."


In [76]:
from transformers import GPT2Tokenizer,GPT2LMHeadModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader,Dataset,random_split

In [78]:
if torch.cuda.is_available():
  device=torch.device('cuda')

else:
  try:
    device=torch.device('mps')
  except Exception:
    device=torch.device('cpu')

In [79]:
device


device(type='cuda')

In [80]:
torch.cuda.is_available()

True

In [81]:
tokenizer=GPT2Tokenizer.from_pretrained('distilgpt2')
model=GPT2LMHeadModel.from_pretrained('distilgpt2').to(device)

In [82]:
BATCH_SIZE=8

In [83]:
df.describe()

Unnamed: 0,Names,Symptoms
count,400,400
unique,392,395
top,Sciatica,"Swelling, pain, dry mouth, bad taste"
freq,3,3


In [84]:
class LanguageDataset(Dataset):
  def __init__(self,df,tokenizer):
    self.labels=df.columns
    self.data=df.to_dict(orient='records')
    self.tokenizer=tokenizer
    x=self.fittest_max_length(df)
    self.max_length=x

  def __len__(self):
    return len(self.data)

  def __getitem__(self,idx):
    x=self.data[idx][self.labels[0]]
    y=self.data[idx][self.labels[1]]
    text=f"{x} | {y}"
    tokens=self.tokenizer.encode_plus(text,return_tensors='pt',max_length=128,padding='max_length',truncation=True)
    return tokens

  def fittest_max_length(self,df):

    max_length=max(len(max(df[self.labels[0]],key=len)),len(max(df[self.labels[1]],key=len)))
    x=2
    while x < max_length: x=x*2
    return x

In [85]:
data_sample=LanguageDataset(df,tokenizer)

In [86]:
data_sample

<__main__.LanguageDataset at 0x7f10e8d55e10>

In [87]:
train_size=int(0.8*len(data_sample))
val_size=len(data_sample)-train_size

train_data,val_data=random_split(data_sample,[train_size,val_size])

In [88]:
train_loader=DataLoader(train_data,batch_size=BATCH_SIZE,shuffle=True)
val_loader=DataLoader(val_data,batch_size=BATCH_SIZE)

In [89]:
num_epochs=3

In [90]:
batch_size=BATCH_SIZE
model_name='distilgpt2'
gpu=0

In [91]:
criterion=nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer=optim.Adam(model.parameters(),lr=5e-4)
tokenizer.pad_token=tokenizer.eos_token

In [92]:
result=pd.DataFrame(columns=['epoch','transformers','batch_size','gpu','training_loss','validation_loss','epoch_duration_sec'])

In [93]:
for epoch in range(num_epochs):
  start_time=time.time()
  model.train()
  epoch_training_loss=0
  train_iterator=tqdm(train_loader,desc=f"Training epoch {epoch+1}/{num_epochs} Batch Size: {batch_size},Transformer:{model_name}")
  for batch in train_iterator:
    optimizer.zero_grad()
    inputs=batch['input_ids'].squeeze(1).to(device)
    targets=inputs.clone()
    outputs=model(input_ids=inputs,labels=targets)
    loss=outputs.loss
    loss.backward()
    optimizer.step()
    train_iterator.set_postfix({'Training Loss':loss.item()})
    epoch_training_loss += loss.item()
  avg_epoch_training_loss=epoch_training_loss/len(train_iterator)

  #validation
  model.eval()
  epoch_validation_loss=0
  total_loss=0
  valid_iterator=tqdm(val_loader,desc=f"Validating epoch {epoch+1}/{num_epochs}")
  with torch.no_grad():
    for batch in valid_iterator:
      inputs=batch['input_ids'].squeeze(1).to(device)
      targets=inputs.clone()
      outputs=model(input_ids=inputs,labels=targets)
      loss=outputs.loss
      total_loss += loss
      valid_iterator.set_postfix({'Validation Loss':loss.item()})
      epoch_validation_loss += loss.item()
  avg_epoch_validation_loss=epoch_validation_loss/len(valid_iterator)

  end_time=time.time()
  epoch_duration_sec=end_time - start_time

  new_row={
      'transformer':model_name,
      'batch_size':batch_size,
      'gpu':gpu,
      'training_loss':avg_epoch_training_loss,
      'validation_loss':avg_epoch_validation_loss,
      'epoch_duration_sec':epoch_duration_sec
  }

  result.loc[len(result)]=new_row
  print(f"Epoch: {epoch+1},Validation Loss: {total_loss/len(val_loader)}")

Training epoch 1/3 Batch Size: 8,Transformer:distilgpt2: 100%|██████████| 40/40 [00:02<00:00, 13.58it/s, Training Loss=0.582]
Validating epoch 1/3: 100%|██████████| 10/10 [00:00<00:00, 42.62it/s, Validation Loss=0.624]


Epoch: 1,Validation Loss: 0.6657261252403259


Training epoch 2/3 Batch Size: 8,Transformer:distilgpt2: 100%|██████████| 40/40 [00:02<00:00, 14.04it/s, Training Loss=0.479]
Validating epoch 2/3: 100%|██████████| 10/10 [00:00<00:00, 44.21it/s, Validation Loss=0.55]


Epoch: 2,Validation Loss: 0.6262738704681396


Training epoch 3/3 Batch Size: 8,Transformer:distilgpt2: 100%|██████████| 40/40 [00:02<00:00, 14.03it/s, Training Loss=0.38] 
Validating epoch 3/3: 100%|██████████| 10/10 [00:00<00:00, 43.71it/s, Validation Loss=0.54]

Epoch: 3,Validation Loss: 0.6368380784988403





In [115]:
input_str='Jaundice'

In [116]:
input_ids=tokenizer.encode(input_str,return_tensors='pt').to(device)

In [118]:
input_ids

tensor([[   41, 14677,   501]], device='cuda:0')

In [119]:
output=model.generate(
    input_ids,
    max_length=20,
    num_return_sequences=1,
    do_sample=True,
    top_k=8,
    top_p=0.95,
    temperature=0.5,
    repetition_penalty=1.2
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [120]:
output

tensor([[   41, 14677,   501,   930,  2297,  1108,    11, 48140,    11,  2266,
          1108,    11, 48140, 50256]], device='cuda:0')

In [121]:
decode_output=tokenizer.decode(output[0],skip_special_tokens=True)

In [122]:
decode_output

'Jaundice | Redness, itching, redness, itching'

In [123]:
torch.save(model,'SmallDiseaseLM.pt')