<a href="https://colab.research.google.com/github/mady1258/LLMTutorials/blob/main/ImplementingBioGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers



In [2]:
# Everything this notebook needs from Hugging Face transformers:
#   pipeline          - easy-to-use API for running pre-trained models on
#                       common NLP tasks
#   set_seed          - fixes the random seed so sampled text is reproducible
#   BioGptTokenizer   - tokenizes text into the input format BioGPT expects
#   BioGptForCausalLM - BioGPT model for language-modelling tasks such as
#                       generating text or completing prompts
from transformers import (
    BioGptForCausalLM,
    BioGptTokenizer,
    pipeline,
    set_seed,
)

In [3]:
!pip install torch




In [4]:
!pip install sacremoses

"""
Sacremos is used internally for BioGPTTokenizer (It is used in Tokenizing and Normalizing text strings)
It includes Tokenization, Lowercasing, Deaccenting, Unicode Normalization(Unique code pointing to each character)
"""

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/897.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m890.9/897.5 kB[0m [31m14.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1


'\nSacremos is used internally for BioGPTTokenizer (It is used in Tokenizing and Normalizing text strings)\nIt includes Tokenization, Lowercasing, Deaccenting, Unicode Normalization(Unique code pointing to each character)\n'

In [5]:
# --- Text generation with the high-level pipeline API ---

# Fetch the pre-trained BioGPT weights and the matching tokenizer from the
# Hugging Face Hub (downloaded on first use, then served from the local cache).
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")

# Bundle model + tokenizer into a callable text-generation pipeline.
generator = pipeline(task='text-generation', model=model, tokenizer=tokenizer)

# Seed the random number generators so the sampled continuations below are
# reproducible from run to run.
set_seed(42)

# Sample five alternative continuations of the prompt. The result (a list of
# {'generated_text': ...} dicts) is the cell's displayed value.
generator(
    "COVID-19 is",
    max_length=20,            # cap on total length in tokens, prompt included
    num_return_sequences=5,   # number of independent continuations to return
    do_sample=True,           # sample from the distribution instead of greedy decoding
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/595 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/927k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/696k [00:00<?, ?B/s]

[{'generated_text': 'COVID-19 is still an ongoing pandemic.'},
 {'generated_text': 'COVID-19 is a worldwide pandemic that continues to spread around the globe.'},
 {'generated_text': 'COVID-19 is caused by a novel coronavirus.'},
 {'generated_text': 'COVID-19 is becoming more and more prevalent all over the world.'},
 {'generated_text': 'COVID-19 is associated with an increase in risk of cardiovascular disease events, suggesting an association between increased'}]

In [6]:
# --- Beam search decoding with the low-level generate() API ---

import torch
from transformers import BioGptForCausalLM, BioGptTokenizer, set_seed

tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")

# Encode the prompt as PyTorch tensors ("pt"); the returned mapping is
# unpacked straight into generate() below.
sentence = "COVID-19 is"
inputs = tokenizer(sentence, return_tensors="pt")

# Seed the RNGs for reproducibility. Beam search without sampling is
# deterministic, so this only matters if sampling is switched on.
set_seed(42)

# Inference only: torch.no_grad() turns off gradient tracking inside the
# block, which saves memory and speeds up generation.
with torch.no_grad():
    beam_output = model.generate(
        **inputs,             # tokenizer outputs passed as keyword arguments
        min_length=100,       # lower bound on sequence length, in tokens (not words)
        max_length=1024,      # upper bound on sequence length, in tokens (prompt included)
        num_beams=5,          # keep the 5 most likely partial sequences at every step
        early_stopping=True,  # stop once num_beams finished candidates have been found
    )

# Turn the best beam back into text, dropping special tokens such as
# end-of-sequence markers. This string is the cell's displayed value.
tokenizer.decode(beam_output[0], skip_special_tokens=True)

'COVID-19 is a global pandemic caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), the causative agent of coronavirus disease 2019 (COVID-19), which has spread to more than 200 countries and territories, including the United States (US), Canada, Australia, New Zealand, the United Kingdom (UK), and the United States of America (USA), as of March 11, 2020, with more than 800,000 confirmed cases and more than 800,000 deaths.'