In [1]:
## Install the necessary packages
!pip -q install torch
!pip -q install langchain
!pip -q install langchain-community
!pip -q install bitsandbytes accelerate transformers sentence-transformers
!pip -q install faiss-gpu
!pip -q install pypdf

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.4/2.4 MB[0m [31m96.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.5/409.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.5/49.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
## Mount Google Drive for access to the papers
from google.colab import drive
drive.mount('/content/drive')

## import the Hugging Face token
import os
from google.colab import userdata
os.environ["HF_TOKEN"] = userdata.get('ISM_RAG_HF')

Mounted at /content/drive


In [3]:
#### User specification ####

## Hugging Face LLM model that will be used to answer the queries
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

## Maximum number of new tokens and temperature (i.e. randomness) for the text generation pipeline
pipeline_temperature = 0.3
max_new_tokens = 500 # Best to keep it above 400

## The embedding model from Hugging Face
emb_model = 'Snowflake/snowflake-arctic-embed-m'

## Path to the directory of the star formation review papers
dir_path = '/content/drive/MyDrive/astro_papers/*'

## Parameters to parse the Arxiv papers. Overlap to ensure a limited loss of context
chunk_size = 400
chunk_overlap = 80

############################

In [4]:
#### import the necessary libraries ####
import time
from glob import glob
from tqdm.notebook import tqdm

## The transformers-related libraries
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

## To parse the PDF
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
import pypdf

## To create a vector store
from langchain.vectorstores import FAISS
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

## To create an LLM pipeline
from langchain.llms import HuggingFacePipeline

## To create the QA retrieval prompt
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

#########################################

In [5]:
## Verify that a CUDA capable GPU is available
if torch.cuda.is_available():
    print("CUDA GPU is available.")
    print('Using GPU: ', torch.cuda.get_device_name(0))
else:
    print("No CUDA GPU is available. This will likely cause issues down the road")

CUDA GPU is available.
Using GPU:  Tesla T4


##  Implementation of the Information Retrieval System

In [6]:
## The vector database class
class Vec_db():
    ## Input:
    ## - dir_path: Str, the directory with the PDF files/papers
    ## - emb_model: Str, the HuggingFace directory of the embedding model for the vector database
    ## - chunk_size: Int, the size of the parsed text blocks
    ## - chunk_overlap: Int, the overlap size of the parsed text blocks
    def __init__(self, dir_path, emb_model, chunk_size =  256, chunk_overlap = 25):
        ## Define the size of the chunks and their overlap for the PDF parsing
        self.chunk_size, self.chunk_overlap = chunk_size, chunk_overlap

        ## Define the Retrieval vector database
        self.ret_db = self.__load_db(dir_path, emb_model)



    #### private functions to Initialize the vector database ####



    ## Function that parses a list of PDF papers
    ## Input:
    ## - dir_path: Str, the directory with the PDF files/papers
    ## Return:
    ## - List[Str], a list with all the parsed text chunks from the provided PDF files
    def __parse_papers(self, dir_path):
        ## Load the paths to all papers in a List
        paper_paths = glob(dir_path)

        ## List that will hold all the parsed data
        all_chunks = []

        ## Initialize a progress bar to track progress of the PDF parsing
        pdf_progress = tqdm(total = len(paper_paths), desc = "Parsed PDFs")

        ## Loop over all papers
        for idx, path in enumerate(paper_paths):
            ## Update the progress bar
            pdf_progress.update(1)

            try:
                ## load the PDF file into a text document
                loader = PyPDFLoader(path)
                doc = loader.load()

                ## Split the text into chunks
                text_splitter = CharacterTextSplitter(chunk_size = self.chunk_size, chunk_overlap = self.chunk_overlap)
                chunked_document = text_splitter.split_documents(doc)

                ## add all chunks in the list
                all_chunks.extend(chunked_document)

            ## Catch any Exception
            except Exception as error:
                print("Skipping the PDF {path} because {error}".format(path = path, error = error))

        ## Delete the progress bar
        pdf_progress.close()

        return all_chunks



    ## Function to define the vector database
    ## Input:
    ## - dir_path: Str, the directory with the PDF files/papers
    ## - emb_model: Str, the HuggingFace directory of the embedding model for the vector database
    ## Return:
    ## - A vector database with the parsed text from the PDFs
    def __load_db(self, dir_path, emb_model):
        ## Get the text chunks from the parsed PDFs
        all_chunks = self.__parse_papers(dir_path)

        ## Load the text chunks into a vector store with FAISS indexing
        print("Initializing the vector database")
        start_time = time.time()
        ret_db = FAISS.from_documents(documents = all_chunks, embedding = HuggingFaceEmbeddings(model_name = emb_model))
        end_time = time.time()

        ## Print the time it took to load the chunks into the database
        print("Creating the vector database took {duration} seconds".format(duration = end_time - start_time))

        print(ret_db.index.ntotal)

        return ret_db



    #### Public functions ####



    ## Function that returns the vector database
    ## Return:
    ## - a Vec_db: the retrieval vector database of the object
    def get_vec_db(self):
        return self.ret_db



    ## Function to add additional papers to the vector database
    ## Input:
    ## - dir_path: Str, the directory with the PDF files/papers
    def add_papers(self, dir_path):
        ## Get the text chunks from the parsed PDFs
        new_chunks = self.__parse_papers(dir_path)

        ## Add the additional text into the vectordatabase
        self.ret_db.add_documents(documents = new_chunks)


In [7]:
#### Initialize a text-generation pipeline to answer requests ####

## Configuration to quantize the model to improve performance on a GPU, and load the LLM Model for causal inference
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = "float16",
    bnb_4bit_use_double_quant = False,
)

print("Loading the model: {model_id}:".format(model_id = model_id))
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config = bnb_config, device_map = "auto")

## load the tokenizer for the LLM model and set the padding to the end-of-sequence
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

## Create the HuggingFace text generation pipeline
text_generation_pipeline = pipeline(
    model = model,
    tokenizer = tokenizer,
    task = "text-generation",
    temperature = pipeline_temperature,
    max_new_tokens = max_new_tokens,
    do_sample = True
)

llm_pipeline = HuggingFacePipeline(pipeline = text_generation_pipeline)

Loading the model: meta-llama/Meta-Llama-3-8B-Instruct:


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

  llm_pipeline = HuggingFacePipeline(pipeline = text_generation_pipeline)


In [8]:
## Define the instructions template for the prompt to handle the context provided through information retrieval
## This template is particularly suited for the Llama3 model: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/
prompt_template_1 = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a friendly assistant with access to relevant context from research papers if there is any. Use this context to answer the question with 100 to 200 words.
If a part of the context is not written text, you can ignore it.
Start the answer with: Based on the available information,

<|eot_id|><|start_header_id|>user<|end_header_id|>

Relevant context:
{context}

The question to answer:
{question}

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

## Define variables of importance to process the prompt above
prompt_variables_1 = ["context", "question"]
start_of_the_answer_1 = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"

In [9]:
## Retrieval prompt class
class Retrieval_Prompt():
    ## Input:
    ## - llm_pipeline: A HuggingFacePipeline
    ## - db: A vector database
    ## - prompt_template: A prompt template
    ## - prompt_variables: List[Str], the input variables used by the prompt
    ## - start_answer: Str, a string that defines the location in the prompt text where the LLM answer starts
    def __init__(self, llm_pipeline, db, prompt_template, prompt_variables, start_answer):
        ## Define the string that identifies the start of the LLM answer in the prompt
        self.start_answer = start_answer

        ## The QA retrieval (qar) prompt chain
        self.qar_prompt = self.__build_qar_prompt(llm_pipeline, db, prompt_template, prompt_variables)


    #### Private functions to initialize the Retrieval prompt chain ####

    ## Create an instructions prompt
    ## Input:
    ## - prompt_template: A text template of the prompt
    ## - prompt_variables: List[Str], the input variables used by the prompt
    ## Return:
    ## - a PromptTemplate
    def __make_instructions_prompt(self, prompt_template, prompt_variables):
        ## Create the prompt from the prompt template
        prompt = PromptTemplate(input_variables = prompt_variables, template = prompt_template)

        return prompt



    ## Return a retriever for the vector database
    ## Input:
    ## - db: a vectorstore for retrieval
    ## Return:
    ## - A retriever object for the vector database based on similarity
    def __get_retriever(self, db):
        retriever = db.as_retriever(search_type = "similarity", search_kwargs = {'k': 5})

        return retriever



    ## Build the QA Retrieval prompt
    ## Input:
    ## - llm_pipeline: A HuggingFacePipeline
    ## - db: A vector database
    ## - prompt_template: A template for the prompt instructions
    ## - prompt_variables: List[Str], the input variables used by the prompt
    ## Return:
    ## - A QA Retrieval chain
    def __build_qar_prompt(self, llm_pipeline, db, prompt_template, prompt_variables):

        ## Get the prompt and object to retrieve from the vector database
        prompt = self.__make_instructions_prompt(prompt_template, prompt_variables)
        retriever = self.__get_retriever(db)

        ## Build the QA Retrieval prompt
        qar_prompt = RetrievalQA.from_chain_type(llm = llm_pipeline,
                                                      chain_type = "stuff", ## passes all the retrieved documents to the prompt
                                                      retriever = retriever,
                                                      chain_type_kwargs = {"prompt": prompt},
                                                      return_source_documents = True, ## retrieve the passed documents
                                                      verbose = False
                                                     )

        return qar_prompt



    ### Other Private functions ####

    ## Print the response based on an input question
    ## Input:
    ## - result: Str, the result output of the QAR prompt
    ## - query: Str, the query submitted to the QAR prompt
    def __print_response(self, result, query):

        ## print the question and answer
        print("Question: {query}\n\n".format(query = query))
        print("Answer:")

        ## Find the start index of the answer to the query
        idx = result.find(self.start_answer)

        ## extract the answer from the full result
        answer = result[idx + len(self.start_answer):]

        ## Print the answer when an answer was produced
        if(idx != -1):
            print(answer)



    ## Print the paper sources used for the response
    ## Input:
    ## - source_docs: List[Documents], A list of documents retrieved from the vector database
    def __print_sources(self, source_docs):
        print("\n\nThe following Arxiv papers provided context for the anwer:")

        ## Loop over each source and print the paper and page number
        for idx, source in enumerate(source_docs):
            ## get the metadata of the source
            metadata = source.metadata

            ## get the Arxiv paper ID number from the metadata dictionary
            paper = metadata['source'].split("/")[-1]

            ## print the associated paper and page
            print("{num}) Arxiv paper: {pap}, p. {page}".format(num = idx + 1, pap = paper, page = metadata['page']))



    #### Public functions of the QA retrieval prompt chain ####

    ## Query the prompt to obtain an output
    ## Input:
    ## - query: Str, the Question/Query for the QAR prompt
    ## - add_sources: Bool, whether or not to print the papers that supported the provided answer
    def query_prompt(self, query, add_sources = True):
        ## obtain a response from the qar_prompt
        try:
          response = self.qar_prompt.invoke(query)

          ## Extract & print the actual answer
          self.__print_response(response['result'], query)

          ## Add the source PDFs at the bottom
          if(add_sources):
              self.__print_sources(response['source_documents'])

        except:
          print("The model was not able to generate a response.")

## Create information retrieval prompt

In [10]:
## create the vector database
db1 = Vec_db(dir_path, emb_model, chunk_size = chunk_size, chunk_overlap = chunk_overlap)

Parsed PDFs:   0%|          | 0/51 [00:00<?, ?it/s]

Initializing the vector database


  ret_db = FAISS.from_documents(documents = all_chunks, embedding = HuggingFaceEmbeddings(model_name = emb_model))


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/85.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/107 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/738 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Creating the vector database took 128.35365653038025 seconds
2997


In [11]:
## Build the QA retrieval prompt
ret_prompt1 = Retrieval_Prompt(llm_pipeline, db1.get_vec_db(), prompt_template_1, prompt_variables_1, start_of_the_answer_1)

## Submit queries

In [12]:
query = "How does the star formation rate evolve over cosmic time?"
ret_prompt1.query_prompt(query)

Question: How does the star formation rate evolve over cosmic time?


Answer:


Based on the available information, the star formation rate in galaxies has evolved over cosmic time. The star formation rate in the early universe was higher than it is today, with the dominant mode of evolution being a gradual decline of the average star formation rate. The star formation rate at z=2 was larger by a factor of 4 and 30 than that in star-forming galaxies at z=1 and 0, respectively. The less massive galaxies appear to have a longer e-folding time and a later onset of their star formation history.

The color distributions of galaxies also reflect this mass dependence of their star formation history. The bimodality of the color distribution is seen at all redshifts z<2.5, with the red sequence persisting to beyond z=2, indicating that many galaxies had completed their star-forming lives by this time. The blue galaxies, as defined by their color and star formation rates, dominate the stellar ma

In [13]:
query2 = "Which physical processes regulate star formation in the Milky Way?"
ret_prompt1.query_prompt(query2)

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (8192). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


Question: Which physical processes regulate star formation in the Milky Way?


Answer:


What is 1023D. 1023. The 1023D. 1023D. The 1023D. The 1024. The 1024. The 1024. The 1025. The 1027. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024.The 1024.. able 1024. versa The 1024. acclaimed 1024.. The 1024.. The 1024. The 1024. The 1024.. The 1024. Ã�1024. The 1024.://1024. versa The 1024. influential 1024. Ã�1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 1024. The 102

In [14]:
query3 = "What is the difference between hot cores and hot corinos?"
ret_prompt1.query_prompt(query3)

Question: What is the difference between hot cores and hot corinos?


Answer:


Based on the available information, hot cores and hot corinos are two types of high-mass star-forming regions that are characterized by their temperature and density conditions.

Hot cores are dense, hot regions of gas and dust that form around massive stars. They are typically found in the vicinity of massive stars, and are characterized by temperatures ranging from 100 to 300 K and densities of 10^4 to 10^6 cm^-3. Hot cores are thought to be the sites of massive star formation, and are often associated with the formation of massive stars and the destruction of nearby molecular clouds.

Hot corinos, on the other hand, are less dense and cooler regions of gas and dust that form around less massive stars. They are typically found in the vicinity of lower-mass stars, and are characterized by temperatures ranging from 20 to 100 K and densities of 10^3 to 10^5 cm^-3. Hot corinos are thought to be the sites of l

In [15]:
query4 = "Which processes determine star formation efficiency?"
ret_prompt1.query_prompt(query4, add_sources = False)

Question: Which processes determine star formation efficiency?


Answer:


Based on the provided context, the main points to address the question "Which processes determine star formation efficiency?" are:

1. The star formation rate (SFR) is a complex process that depends on various factors, including the availability of gas, the presence of dark matter, and the efficiency of star formation.
2. The SFR is influenced by the properties of the interstellar medium (ISM), such as the density, temperature, and turbulence of the gas.
3. The ISM is shaped by various physical processes, including supernova explosions, stellar winds, and radiation from massive stars.
4. The star formation efficiency (SFE) is the ratio of the SFR to the available gas supply. It is a key parameter that determines the rate at which stars are formed.
5. The SFE is influenced by the properties of the gas, such as its density, temperature, and metallicity.
6. The SFE is also affected by the presence of dark matter, w

In [16]:
query5 = "Do filaments play a role in the formation of stars?"
ret_prompt1.query_prompt(query5)

Question: Do filaments play a role in the formation of stars?


Answer:


S, 2001, 200 200:1..swing, 200 2000 200 2001..swing 2001 200 2000, 2001..swing, 2009: 2001, 2001 2001: 2001, 2001..swing, 2001. versa
T, 200 2001 2001, 2001 2001, 2001. influential 2001, 2001, 2001..swing, 2001: 2001, 2001 2001, 2001, 2001, 2001, 2001,2001
S 2001, 2001, 2001, 2001, 2001,2001, 2001,2001. versa,2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001,2001, 2001, 2001,2001,2001, 2001, 2001, 2001, 2001, 2001,2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001,2001,2001, 2001, 2001, 2001,2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001,2001, 2001,2001,2001, 2001, 2001, 2001,2001, 2001, 2001, 2001, 2001, 2001,2001, 2001, 2001,2001, 2001, 2001,2001, 2001, 2001, 2001, 200


The following Arxiv papers provided context for the anwer:
1) Arxiv paper: 1204.3552.pdf, p. 38
2) Arxiv paper: 1011.14

In [17]:
query6 = "What is a Giant Molecular Cloud?"
ret_prompt1.query_prompt(query6, add_sources = False)

Question: What is a Giant Molecular Cloud?


Answer:


Based on the available information, a Giant Molecular Cloud (GMC) is a massive molecular cloud complex that forms high-mass stars. These clouds are typically found at distances of less than 3 kpc and have masses ranging from 3 × 10^5 to 8 × 10^6 M⊙. They are characterized by high densities, with average densities ranging from 10^4 to 10^6 cm^-3, and are often found in regions of high star formation activity.

GMCs are thought to be the precursors of young massive clusters, and their study has allowed researchers to make significant progress in understanding the formation of high-mass stars. The most nearby, massive GMCs are rich star-forming sites, and their study has revealed the presence of massive filaments and hubs, which are thought to be the sites of high-mass star formation.

The data presented in Table 3 provides a list of young massive clusters in the Milky Way, along with their masses and radii. The table also includes re

In [18]:
query7 = "When was the cosmic star formation peak?"
ret_prompt1.query_prompt(query7)

Question: When was the cosmic star formation peak?


Answer:


Based on the provided text, the cosmic star formation peak is not explicitly stated. However, it is mentioned that the star formation rate in the universe has gradually declined since z = 2, with the dominant mode of evolution being a gradual decline of the average star formation rate. It is also mentioned that the star formation rate at z = 2 was larger by a factor of ∼4 and ∼30 than that in star-forming galaxies at z = 1 and 0.

From this information, we can infer that the cosmic star formation peak likely occurred at a redshift around z = 2, when the star formation rate was higher than at later times.


The following Arxiv papers provided context for the anwer:
1) Arxiv paper: 1403.0007.pdf, p. 73
2) Arxiv paper: 1204.3552.pdf, p. 36
3) Arxiv paper: 1403.0007.pdf, p. 72
4) Arxiv paper: 2303.12500.pdf, p. 68
5) Arxiv paper: 1101.1771.pdf, p. 55
