In [1]:
## Install the necessary packages
!pip -q install torch
!pip -q install langchain
!pip -q install bitsandbytes accelerate transformers sentence-transformers
!pip -q install faiss-gpu
!pip -q install pypdf

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m867.6/867.6 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m62.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.9/302.9 kB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.7/116.7 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m142.5/142.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━

In [2]:
## Colab-specific setup: bring in the helpers this cell relies on
import os
from google.colab import drive, userdata

## Make the Google Drive folder that holds the papers reachable under /content/drive
drive.mount('/content/drive')

## Expose the Hugging Face token, kept in the Colab user data under 'ISM_RAG_HF',
## through the HF_TOKEN environment variable
os.environ["HF_TOKEN"] = userdata.get('ISM_RAG_HF')

Mounted at /content/drive


In [3]:
#### User specification ####
## Every tunable setting of the notebook is collected in this cell.

## Hugging Face id of the LLM that will be used to answer the queries
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

## Temperature (i.e. randomness) and maximum number of new tokens for the text generation pipeline
pipeline_temperature = 0.3
max_new_tokens = 500 # Best to keep it above 400

## Hugging Face id of the embedding model used to build the vector database
emb_model = 'Snowflake/snowflake-arctic-embed-m'

## Glob pattern matching the star formation review papers on the mounted Google Drive
## (it is expanded with glob() when the PDFs are parsed)
dir_path = '/content/drive/MyDrive/astro_papers/*'

## Size and overlap of the text chunks the Arxiv papers are split into.
## The overlap limits the loss of context at the chunk boundaries.
## NOTE(review): presumably measured in characters, since a CharacterTextSplitter is used -- TODO confirm
chunk_size = 400
chunk_overlap = 80

############################

In [4]:
#### import the necessary libraries ####
import time
from glob import glob
from tqdm.notebook import tqdm

## The transformers-related libraries
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

## To parse the PDF
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
import pypdf

## To create a vector store
from langchain.vectorstores import FAISS
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

## To create an LLM pipeline
from langchain.llms import HuggingFacePipeline

## To create the QA retrieval prompt
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

#########################################

In [5]:
## Report whether PyTorch can see a CUDA capable GPU, and which one
if not torch.cuda.is_available():
    print("No CUDA GPU is available. This will likely cause issues down the road")
else:
    print("CUDA GPU is available.")
    print('Using GPU: ', torch.cuda.get_device_name(0))

CUDA GPU is available.
Using GPU:  Tesla T4


##  Implementation of the Information Retrieval System

In [6]:
## The vector database class
class Vec_db():
    """FAISS vector store built from the text of a collection of PDF papers.

    The PDFs matched by a glob pattern are loaded with PyPDFLoader, split into
    overlapping text chunks, embedded with a Hugging Face embedding model and
    stored in a FAISS index.
    """

    ## Input:
    ## - dir_path: Str, glob pattern matching the PDF files/papers
    ## - emb_model: Str, the HuggingFace directory of the embedding model for the vector database
    ## - chunk_size: Int, the size of the parsed text blocks
    ## - chunk_overlap: Int, the overlap size of the parsed text blocks
    ## - separator: Str, the separator handed to the text splitter. The default is the value the
    ##   splitter uses when none is given, so existing calls behave as before.
    ##   NOTE(review): text extracted from PDFs may contain few blank lines, in which case "\n\n"
    ##   can produce chunks larger than chunk_size; "\n" may work better -- TODO confirm on the papers
    ## Raises:
    ## - ValueError when no text could be parsed from the matched files
    def __init__(self, dir_path, emb_model, chunk_size =  256, chunk_overlap = 25, separator = "\n\n"):
        ## Define the size of the chunks, their overlap and the separator for the PDF parsing
        self.chunk_size, self.chunk_overlap = chunk_size, chunk_overlap
        self.separator = separator

        ## Define the Retrieval vector database
        self.ret_db = self.__load_db(dir_path, emb_model)



    #### private functions to Initialize the vector database ####



    ## Function that parses a list of PDF papers
    ## Input:
    ## - dir_path: Str, glob pattern matching the PDF files/papers
    ## Return:
    ## - List[Document], a list with all the parsed text chunks from the provided PDF files.
    ##   Files that cannot be parsed are reported and skipped, so the list may be empty
    def __parse_papers(self, dir_path):
        ## Load the paths to all papers in a List
        paper_paths = glob(dir_path)

        ## List that will hold all the parsed data
        all_chunks = []

        ## The splitter does not depend on the individual paper, so it is created once.
        ## An invalid splitter configuration therefore fails right away instead of
        ## being reported as a "skipped" PDF for every single file
        text_splitter = CharacterTextSplitter(separator = self.separator,
                                              chunk_size = self.chunk_size,
                                              chunk_overlap = self.chunk_overlap)

        ## Loop over all papers while a progress bar tracks the PDF parsing
        for path in tqdm(paper_paths, desc = "Parsed PDFs"):
            try:
                ## load the PDF file into a text document
                doc = PyPDFLoader(path).load()

                ## Split the text into chunks and add all of them to the list
                all_chunks.extend(text_splitter.split_documents(doc))

            ## Deliberately best-effort: one unreadable PDF must not abort the whole run
            except Exception as error:
                print("Skipping the PDF {path} because {error}".format(path = path, error = error))

        return all_chunks



    ## Function to define the vector database
    ## Input:
    ## - dir_path: Str, glob pattern matching the PDF files/papers
    ## - emb_model: Str, the HuggingFace directory of the embedding model for the vector database
    ## Return:
    ## - A FAISS vector database with the parsed text from the PDFs
    ## Raises:
    ## - ValueError when no text chunk could be parsed (no file matched, or every file was skipped)
    def __load_db(self, dir_path, emb_model):
        ## Get the text chunks from the parsed PDFs
        all_chunks = self.__parse_papers(dir_path)

        ## Without any chunk there is nothing to index; fail with a clear message
        ## rather than with an obscure error from inside the vector store
        if not all_chunks:
            raise ValueError("No text could be parsed from the PDFs matching {path}; "
                             "cannot build the vector database".format(path = dir_path))

        ## Load the text chunks into a vector store with FAISS indexing
        print("Initializing the vector database")
        start_time = time.time()
        ret_db = FAISS.from_documents(documents = all_chunks, embedding = HuggingFaceEmbeddings(model_name = emb_model))
        end_time = time.time()

        ## Print the time it took to load the chunks into the database
        print("Creating the vector database took {duration} seconds".format(duration = end_time - start_time))

        ## Print the number of entries held by the FAISS index
        print(ret_db.index.ntotal)

        return ret_db



    #### Public functions ####



    ## Function that returns the vector database
    ## Return:
    ## - the FAISS vector store that backs this object
    def get_vec_db(self):
        return self.ret_db



    ## Function to add additional papers to the vector database
    ## Input:
    ## - dir_path: Str, glob pattern matching the additional PDF files/papers
    def add_papers(self, dir_path):
        ## Get the text chunks from the parsed PDFs
        new_chunks = self.__parse_papers(dir_path)

        ## Nothing parsed means nothing to add: report it and leave the database untouched
        if not new_chunks:
            print("No text could be parsed from the PDFs matching {path}; "
                  "the vector database is unchanged".format(path = dir_path))
            return

        ## Add the additional text into the vector database
        self.ret_db.add_documents(documents = new_chunks)


In [7]:
#### Initialize a text-generation pipeline to answer requests ####

## Quantization settings for loading the LLM: 4-bit NF4 weights, float16 compute, no double quantization
bnb_config = BitsAndBytesConfig(load_in_4bit = True,
                                bnb_4bit_quant_type = "nf4",
                                bnb_4bit_compute_dtype = "float16",
                                bnb_4bit_use_double_quant = False)

## Load the quantized causal LLM and let the library decide on the device placement
print(f"Loading the model: {model_id}:")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config = bnb_config,
    device_map = "auto",
)

## The matching tokenizer; the end-of-sequence token doubles as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

## The HuggingFace text generation pipeline, sampling with the user-specified temperature
text_generation_pipeline = pipeline(
    task = "text-generation",
    model = model,
    tokenizer = tokenizer,
    do_sample = True,
    temperature = pipeline_temperature,
    max_new_tokens = max_new_tokens,
)

## Wrap the pipeline so that it can be used as an LLM inside LangChain
llm_pipeline = HuggingFacePipeline(pipeline = text_generation_pipeline)

Loading the model: meta-llama/Meta-Llama-3-8B-Instruct:


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
## Define the instructions template for the prompt to handle the context provided through information retrieval
## This template is particularly suited for the Llama3 model: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/
## The template has a system turn with the instructions, a user turn with the {context} and {question}
## placeholders, and ends with the header of the assistant turn.
## NOTE(review): the template starts with a newline followed by an explicit <|begin_of_text|>; if the tokenizer
## also prepends its own begin-of-text token, the prompt will contain it twice -- TODO confirm
prompt_template_1 = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a friendly assistant with access to relevant context from research papers if there is any. Use this context to answer the question.
Start the answer with: Based on the available information,

<|eot_id|><|start_header_id|>user<|end_header_id|>

Relevant context:
{context}

The question to answer:
{question}

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

## Define variables of importance to process the prompt above:
## - the names of the placeholders that appear in the template
## - the marker that closes the template; it must match the template's last line of special tokens,
##   because it is used to locate where the LLM answer starts in the generated text
prompt_variables_1 = ["context", "question"]
start_of_the_answer_1 = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"

In [9]:
## Retrieval prompt class
class Retrieval_Prompt():
    """Retrieval question-answering chain that prints its answers.

    Wraps a LangChain RetrievalQA chain: the most similar chunks of a vector
    database are stuffed into a prompt template, the LLM pipeline generates
    the answer, and the answer (optionally with its sources) is printed.
    """

    ## Input:
    ## - llm_pipeline: A HuggingFacePipeline
    ## - db: A vector database
    ## - prompt_template: A prompt template
    ## - prompt_variables: List[Str], the input variables used by the prompt
    ## - start_answer: Str, a string that defines the location in the prompt text where the LLM answer starts
    ## - top_k: Int, the number of chunks retrieved from the vector database for each query
    def __init__(self, llm_pipeline, db, prompt_template, prompt_variables, start_answer, top_k = 5):
        ## Define the string that identifies the start of the LLM answer in the prompt
        self.start_answer = start_answer

        ## Define how many chunks the retriever returns per query
        self.top_k = top_k

        ## The QA retrieval (qar) prompt chain
        self.qar_prompt = self.__build_qar_prompt(llm_pipeline, db, prompt_template, prompt_variables)


    #### Private functions to initialize the Retrieval prompt chain ####

    ## Create an instructions prompt
    ## Input:
    ## - prompt_template: A text template of the prompt
    ## - prompt_variables: List[Str], the input variables used by the prompt
    ## Return:
    ## - a PromptTemplate
    def __make_instructions_prompt(self, prompt_template, prompt_variables):
        ## Create the prompt from the prompt template
        return PromptTemplate(input_variables = prompt_variables, template = prompt_template)



    ## Return a retriever for the vector database
    ## Input:
    ## - db: a vectorstore for retrieval
    ## Return:
    ## - A retriever object for the vector database that returns the top_k most similar chunks
    def __get_retriever(self, db):
        return db.as_retriever(search_type = "similarity", search_kwargs = {'k': self.top_k})



    ## Build the QA Retrieval prompt
    ## Input:
    ## - llm_pipeline: A HuggingFacePipeline
    ## - db: A vector database
    ## - prompt_template: A template for the prompt instructions
    ## - prompt_variables: List[Str], the input variables used by the prompt
    ## Return:
    ## - A QA Retrieval chain
    def __build_qar_prompt(self, llm_pipeline, db, prompt_template, prompt_variables):

        ## Get the prompt and object to retrieve from the vector database
        prompt = self.__make_instructions_prompt(prompt_template, prompt_variables)
        retriever = self.__get_retriever(db)

        ## Build the QA Retrieval prompt
        qar_prompt = RetrievalQA.from_chain_type(llm = llm_pipeline,
                                                 chain_type = "stuff", ## passes all the retrieved documents to the prompt
                                                 retriever = retriever,
                                                 chain_type_kwargs = {"prompt": prompt},
                                                 return_source_documents = True, ## retrieve the passed documents
                                                 verbose = False
                                                )

        return qar_prompt



    ### Other Private functions ####

    ## Print the response based on an input question
    ## Input:
    ## - result: Str, the result output of the QAR prompt
    ## - query: Str, the query submitted to the QAR prompt
    def __print_response(self, result, query):

        ## print the question and answer
        print("Question: {query}\n\n".format(query = query))
        print("Answer:")

        ## Find the start index of the answer to the query
        idx = result.find(self.start_answer)

        if idx != -1:
            ## The result still contains the prompt: keep only what follows the answer marker
            answer = result[idx + len(self.start_answer):]
        else:
            ## No marker in the result: treat the whole text as the answer instead of
            ## silently printing nothing
            answer = result

        print(answer)



    ## Print the paper sources used for the response
    ## Input:
    ## - source_docs: List[Documents], A list of documents retrieved from the vector database
    def __print_sources(self, source_docs):
        print("\n\nThe following Arxiv papers provided context for the answer:")

        ## Loop over each source and print the paper and page number
        for num, source in enumerate(source_docs, start = 1):
            ## get the metadata of the source
            metadata = source.metadata

            ## get the Arxiv paper ID number (the file name) from the metadata dictionary
            paper = str(metadata.get('source', 'unknown source')).split("/")[-1]

            ## The PDF loader stores a zero-based page index; convert it to a human page number.
            ## Documents without a usable page entry are printed with a "?" instead of failing
            page = metadata.get('page')
            page_number = page + 1 if isinstance(page, int) else "?"

            ## print the associated paper and page
            print("{num}) Arxiv paper: {pap}, p. {page}".format(num = num, pap = paper, page = page_number))



    #### Public functions of the QA retrieval prompt chain ####

    ## Query the prompt to obtain an output
    ## Input:
    ## - query: Str, the Question/Query for the QAR prompt
    ## - add_sources: Bool, whether or not to print the papers that supported the provided answer
    def query_prompt(self, query, add_sources = True):
        ## obtain a response from the qar_prompt
        response = self.qar_prompt.invoke(query)

        ## Extract & print the actual answer
        self.__print_response(response['result'], query)

        ## Add the source PDFs at the bottom
        if add_sources:
            self.__print_sources(response['source_documents'])

## Create the information retrieval prompt

In [10]:
## Create the vector database from the papers matched by dir_path, using the chunk settings
## and the embedding model chosen in the user specification
db1 = Vec_db(dir_path, emb_model, chunk_size = chunk_size, chunk_overlap = chunk_overlap)

Parsed PDFs:   0%|          | 0/100 [00:00<?, ?it/s]



Skipping the PDF /content/drive/MyDrive/llm_papers/2306.09339v1.pdf because Stream has ended unexpectedly
Initializing the vector database


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/252 [00:00<?, ?B/s]






README.md:   0%|          | 0.00/83.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/107 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/738 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Creating the vector database took 65.45226621627808 seconds


In [11]:
## Build the QA retrieval prompt: it combines the LLM pipeline, the vector store held by db1
## and the Llama3 prompt template (with its placeholder names and answer marker)
ret_prompt1 = Retrieval_Prompt(llm_pipeline, db1.get_vec_db(), prompt_template_1, prompt_variables_1, start_of_the_answer_1)

## Submit queries

In [12]:
## Ask about the evolution of the star formation rate; the answer and its source papers are printed
## NOTE(review): the saved outputs of the query cells echo different questions than the ones asked
## in the code, so they appear to stem from an earlier run -- TODO re-run before sharing
query = "How does the star formation rate evolve over cosmic time?"
ret_prompt1.query_prompt(query)

Question: What are Large Language Models?


Answer:


Based on the provided context, Large Language Models (LLMs) are a type of artificial neural network that are trained on vast amounts of text data to generate and understand human-like language. They are designed to model the probability of word sequences to predict the likelihood of future words, and are capable of understanding and generating text with structures similar to human language.

LLMs are typically trained on large datasets of text, such as books, articles, and websites, and are designed to learn patterns and relationships in the language. They are often pre-trained on a specific task, such as language translation or text classification, and then fine-tuned for a specific downstream task, such as chatbots, text summarization, or language translation.

Some notable examples of LLMs include BERT, RoBERTa, and XLNet, which have achieved state-of-the-art results in various natural language processing tasks. LLMs have also be

In [13]:
## Ask about the processes regulating star formation; the answer and its source papers are printed
query2 = "Which physical processes regulate star formation in the Milky Way?"
ret_prompt1.query_prompt(query2)

Question: Are there datasets for Jamaican language processing?


Answer:


Based on the available information, there is a limited amount of research and datasets available for Jamaican Patois, which is a low-resource language. The language is an English-based creole spoken by over 3 million inhabitants on the island and by Jamaicans across the diaspora globally. The language has been studied in the context of natural language processing, with a few datasets and research papers focused on its linguistic features and processing.

In the provided table, there is no dataset specifically listed for Jamaican Patois. However, there are some datasets mentioned that are related to creole languages, such as NaijaSenti, XCOPA, and Colloquial Singaporean English (Singlish). These datasets may not be directly applicable to Jamaican Patois, but they could potentially be used as a starting point for developing a dataset for Jamaican Patois.

Additionally, there is a mention of the JamPatoisNLI datase

In [14]:
## Ask about hot cores versus hot corinos; the answer and its source papers are printed
query3 = "What is the difference between hot cores and hot corinos?"
ret_prompt1.query_prompt(query3)

Question: What is Transformers-XL?


Answer:


Based on the available information, Transformers-XL is a type of transformer architecture that is designed to process longer sequences than traditional transformer models. It was introduced in a research paper titled "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" by Z. Dai, Z. Yang, Y. Yang, J. Carbonell, Q. V. Le, and R. Salakhutdinov in 2019.

The main idea behind Transformers-XL is to use a combination of techniques to process longer sequences, including:

1. Segmenting the input sequence into smaller segments and processing each segment separately.
2. Using a hierarchical attention mechanism to focus on different parts of the input sequence.
3. Using a sparse attention mechanism to reduce the computational cost of attention computation.

The authors of the paper claim that Transformers-XL can process sequences of up to 10,000 tokens, which is much longer than the typical sequence length of 512 tokens for trad

In [15]:
## Ask about the star formation efficiency; only the answer is printed, not the source papers
query4 = "Which processes determine star formation efficiency?"
ret_prompt1.query_prompt(query4, add_sources = False)

Question: How can Large Language Models help linguistic studies?


Answer:


Based on the available information, it can be concluded that Large Language Models (LLMs) can help linguistic studies in several ways:

1. **Cross-lingual analysis**: LLMs can facilitate cross-lingual analysis by leveraging the ability to learn representations that are transferable across languages, enabling researchers to analyze linguistic phenomena across languages.
2. **Language identification**: LLMs can be used to identify languages and dialects, which is essential for linguistic studies that involve analyzing linguistic data from diverse sources.
3. **Text classification**: LLMs can be fine-tuned for text classification tasks, such as sentiment analysis, topic modeling, and named entity recognition, which can help linguists analyze and understand the meaning and structure of texts.
4. **Language modeling**: LLMs can be used to model the probability distribution of words in a language, which can help lin

In [16]:
## Ask about the role of filaments in star formation; the answer and its source papers are printed
query5 = "Do filaments play a role in the formation of stars?"
ret_prompt1.query_prompt(query5)

Question: Can Large Language Models help detect crimes?


Answer:


Based on the available information, it seems that large language models have been used in various applications, including natural language processing, machine translation, and text classification. However, there is limited research on the specific topic of using large language models to detect crimes.

One study that is relevant to this topic is "Detecting Crime with Language Models" by researchers at the University of California, Berkeley. In this study, the authors used a large language model to analyze crime reports and identify patterns and trends in language usage that are associated with criminal activity. The study found that the model was able to detect crimes with a high degree of accuracy, and that the results were comparable to those obtained using traditional methods.

Another study that is relevant to this topic is "Crime Detection using Deep Learning" by researchers at the University of Texas at Austin. I

In [17]:
## Ask for a definition of a Giant Molecular Cloud; only the answer is printed, not the source papers
query6 = "What is a Giant Molecular Cloud?"
ret_prompt1.query_prompt(query6, add_sources = False)

Question: What is needed for llms to control robots?


Answer:


Based on the available information, it seems that the authors are discussing the limitations of Large Language Models (LLMs) in understanding natural language and their potential applications in various domains, including robotics. The question being asked is what is needed for LLMs to control robots.

To answer this question, we need to consider the current capabilities of LLMs and the requirements for controlling robots. LLMs are capable of processing and generating human-like language, but they lack the ability to understand the underlying semantics and context of the language. They can only recognize patterns and generate responses based on those patterns.

To control robots, LLMs need to be able to understand the language used to command them, which requires a deeper level of understanding of the language and its context. This is because robots require precise and specific instructions to perform tasks, and LLMs need

In [18]:
## Ask when cosmic star formation peaked; the answer and its source papers are printed
query7 = "When was the cosmic star formation peak?"
ret_prompt1.query_prompt(query7)

Question: Describe the impact of LLMs on endangered languages.


Answer:


Based on the available information, the impact of Large Language Models (LLMs) on endangered languages is a topic of ongoing research and debate. On one hand, LLMs have the potential to provide a powerful tool for language documentation, preservation, and revitalization. By leveraging the capabilities of LLMs, researchers and linguists can develop language models that can learn from and generate text in endangered languages, potentially helping to preserve the linguistic diversity of these languages.

For instance, the paper "IndicTrans2: Towards high-quality and accessible machine translation models for all 22 scheduled Indian languages" presents a machine translation model that can translate between 22 Indian languages, including several endangered languages. This model has the potential to facilitate language documentation, language learning, and cultural exchange among speakers of these languages.

On the ot