<a href="https://colab.research.google.com/github/nrimsky/qa/blob/main/paper_qa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install -U InstructorEmbedding sentence-transformers pylatexenc faiss-cpu langchain openai

In [2]:
import os
import requests
import shutil
import tarfile
import re
from InstructorEmbedding import INSTRUCTOR
from pylatexenc.latex2text import LatexNodes2Text
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
import torch
import os
from google.colab import drive
from langchain.prompts import PromptTemplate
# Mount Google Drive at /content/drive; get_source() stores downloaded paper archives under it.
drive.mount('/content/drive')


  from tqdm.autonotebook import trange


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the key with getpass so the secret is not echoed into the saved cell output.
from getpass import getpass

# strip() drops the whitespace/newline that often comes along with a pasted key.
os.environ['OPENAI_API_KEY'] = getpass("Paste OpenAI API Key: ").strip()

In [4]:
# Load the INSTRUCTOR embedding model used by encode_instructor(); the recorded
# output below shows the checkpoint files being downloaded when this cell ran.
model = INSTRUCTOR('hkunlp/instructor-xl')

Downloading (…)7f436/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

Downloading (…)0daf57f436/README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

Downloading (…)af57f436/config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)7f436/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading (…)f57f436/modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


In [5]:
# Report whether torch can see a CUDA GPU ("cuda") or not ("cpu").
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print("Using device", device)

# Instruction prefixes paired with each text before it is embedded:
# INDEX_TEXT for paper sections, RETRIEVAL_TEXT for user questions.
INDEX_TEXT = "Represent this section of a Machine Learning paper for retrieval given a question about the paper:"
RETRIEVAL_TEXT = "Represent this question about a Machine Learning paper for retrieving relevant sections of the paper:"

def encode_instructor(instruction, sentences):
    """Embed ``sentences`` with the module-level INSTRUCTOR ``model``.

    Every sentence is paired with ``instruction`` as ``[instruction, sentence]``
    and the list of pairs is handed to ``model.encode``; its result is returned
    unchanged.
    """
    instruction_sentence_pairs = []
    for sentence in sentences:
        instruction_sentence_pairs.append([instruction, sentence])
    return model.encode(instruction_sentence_pairs)

def latex_to_text(latex_str):
    """Convert LaTeX markup to plain text, falling back to the raw input.

    Args:
        latex_str: LaTeX source as a string.

    Returns:
        The text produced by ``LatexNodes2Text().latex_to_text``; if the
        conversion raises, ``latex_str`` is returned unchanged.
    """
    try:
        return LatexNodes2Text().latex_to_text(latex_str)
    except Exception:
        # Best-effort conversion: keep the raw LaTeX rather than lose the file.
        # `except Exception` (not a bare `except:`) lets KeyboardInterrupt and
        # SystemExit propagate so the cell can still be interrupted.
        return latex_str

def _safe_tar_members(archive, dest_dir):
    """Return the members of ``archive`` that are safe to extract into ``dest_dir``."""
    dest_root = os.path.realpath(dest_dir)
    safe_members = []
    for member in archive.getmembers():
        # Only regular files and directories are needed; links and device nodes
        # could redirect writes outside dest_dir, so they are skipped.
        if not (member.isfile() or member.isdir()):
            continue
        target = os.path.realpath(os.path.join(dest_root, member.name))
        if os.path.commonpath([dest_root, target]) != dest_root:
            raise RuntimeError(
                f'Refusing to extract {member.name!r}: path escapes {dest_dir}'
            )
        safe_members.append(member)
    return safe_members


def get_source(arxiv_id, drive_path='/content/drive/My Drive/'):
    """Download the source archive of an arXiv paper and return its .tex contents.

    The archive is fetched from ``https://arxiv.org/e-print/<arxiv_id>``, saved
    as ``<id>.tar.gz`` under ``drive_path``, extracted into
    ``<id>_source_files`` next to it, and every ``.tex`` file at the top level
    of that directory is read.

    Args:
        arxiv_id: arXiv identifier, e.g. ``"1906.01820"``.
        drive_path: Directory that receives the archive and the extracted files.

    Returns:
        A list with the text of each top-level ``.tex`` file, in sorted
        file-name order. A warning is printed when the total is 1000 characters
        or fewer.

    Raises:
        RuntimeError: if arXiv does not answer with status 200, or if an archive
            member would be written outside the extraction directory.
        tarfile.ReadError: if the download is not a gzip-compressed tar archive.
    """
    source_url = f'https://arxiv.org/e-print/{arxiv_id}'
    # Old-style ids such as "hep-th/9901001" contain a slash, which must not
    # end up inside a local file name.
    local_id = arxiv_id.replace('/', '_')
    archive_path = os.path.join(drive_path, f'{local_id}.tar.gz')
    source_dir = os.path.join(drive_path, f'{local_id}_source_files')

    # The context manager releases the streamed connection even on failure.
    with requests.get(source_url, stream=True) as response:
        if response.status_code != 200:
            # Fail here instead of going on to open a missing or stale archive.
            raise RuntimeError(
                f'Error: received status code {response.status_code} from arXiv.'
            )
        with open(archive_path, 'wb') as f:
            response.raw.decode_content = True
            shutil.copyfileobj(response.raw, f)

    os.makedirs(source_dir, exist_ok=True)
    with tarfile.open(archive_path, 'r:gz') as archive:
        # The archive is user-uploaded content, so validate member paths first.
        archive.extractall(path=source_dir, members=_safe_tar_members(archive, source_dir))

    # Sorted so the order of the returned contents is deterministic.
    tex_files = sorted(name for name in os.listdir(source_dir) if name.endswith('.tex'))
    file_contents = []
    for tex_file in tex_files:
        # errors='replace' keeps a file that is not valid UTF-8 from aborting the run.
        with open(os.path.join(source_dir, tex_file), 'r', encoding='utf-8', errors='replace') as f:
            file_contents.append(f.read())

    text_length = sum(len(content) for content in file_contents)
    if text_length <= 1000:
        print(f"Failed to extract enough source data - file content size = {text_length} chars")
    return file_contents

def clean_text(text):
    """Tidy converted paper text.

    Runs of three or more newlines are collapsed to exactly two, and afterwards
    every run of four or more ``=`` characters is replaced by a single newline.
    """
    blank_lines_collapsed = re.sub("\n{3,}", "\n\n", text)
    rules_removed = re.sub("={4,}", "\n", blank_lines_collapsed)
    return rules_removed

def extract_all_text_chunks(arxiv_id, n_character_chunks=1000):
    """Return the paper's text split into fixed-size character chunks.

    Each source file returned by ``get_source(arxiv_id)`` is converted with
    ``latex_to_text``, tidied with ``clean_text`` and sliced into consecutive
    pieces of at most ``n_character_chunks`` characters. Chunks never span two
    source files.
    """
    chunks = []
    for raw_latex in get_source(arxiv_id):
        plain_text = clean_text(latex_to_text(raw_latex))
        for start in range(0, len(plain_text), n_character_chunks):
            chunks.append(plain_text[start:start + n_character_chunks])
    return chunks

class InstructorEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter that delegates to ``encode_instructor``."""

    def embed_documents(self, texts):
        """Embed paper sections using the ``INDEX_TEXT`` instruction."""
        document_vectors = encode_instructor(INDEX_TEXT, texts)
        return document_vectors

    def embed_query(self, text):
        """Embed one question using ``RETRIEVAL_TEXT`` and return its single vector."""
        query_vectors = encode_instructor(RETRIEVAL_TEXT, [text])
        return query_vectors[0]

def cli_ask_questions(arxiv_id):
    """Run an interactive question-answering loop over one arXiv paper.

    The paper's text chunks are embedded into a FAISS vector store, a
    ``RetrievalQA`` "stuff" chain is built on top of it with ``gpt-3.5-turbo``,
    and questions are read from stdin until the user types ``quit``.

    Args:
        arxiv_id: arXiv identifier of the paper to index.

    Raises:
        ValueError: if no text chunks could be extracted for the paper.
    """
    chunks = extract_all_text_chunks(arxiv_id)
    if not chunks:
        # NOTE(review): FAISS.from_texts presumably cannot build an index from
        # zero texts; fail early with a message that names the real cause.
        raise ValueError(
            f"No text could be extracted from the source of arXiv paper {arxiv_id!r}."
        )
    embeddings = InstructorEmbeddings()
    # Each chunk carries its position in the chunk list as metadata.
    metadatas = [{"index": i} for i in range(len(chunks))]
    vectorstore = FAISS.from_texts(chunks, embeddings, metadatas)

    chain_type_kwargs = {
        "prompt": ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(
                "You are a helpful assistant that answers questions about research papers given some snippets from the paper. Whenever possible, you quote directly from the snippets, putting the quote in quotation marks."
            ),
            HumanMessagePromptTemplate.from_template("""
                Some relevant snippets:

                {context}

                Question: {question}
                Answer:
            """)
        ])
    }

    qa = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model_name='gpt-3.5-turbo'),
        chain_type="stuff",
        retriever=vectorstore.as_retriever(),
        chain_type_kwargs=chain_type_kwargs
    )

    while True:
        # strip() so that " quit " still exits and blank input is detectable.
        question = input("Enter your question about the paper (or 'quit' to stop): ").strip()
        if question.lower() == 'quit':
            break
        if not question:
            # Do not spend an API call on an empty question; just ask again.
            continue
        try:
            print(qa.run(question))
        except Exception as e:
            # Keep the session alive: report the failure and prompt again.
            print("An error occurred while processing your question.")
            print(str(e))


Using device cuda


In [6]:
# Strip stray whitespace from the pasted id so it does not end up in the
# download URL or in the local file names.
arxiv_id = input("Enter the id of the Arxiv paper you want to ask questions about: ").strip()
cli_ask_questions(arxiv_id)

Enter the id of the Arxiv paper you want to ask questions about: 1906.01820
Enter your question about the paper (or 'quit' to stop): What is a mesaoptimizer?
"A mesa-optimizer is a learned algorithm that is itself an optimizer."
Enter your question about the paper (or 'quit' to stop): Why may a mesoptimizer arise in an ML system?
A mesa-optimizer may arise in an ML system because of the phenomenon called mesa-optimization, which refers to the situation where a learned algorithm found by a base optimizer is itself an optimizer. This is more likely to occur in machine learning systems that are more advanced than those that exist today, according to the paper.
Enter your question about the paper (or 'quit' to stop): Why would advanced systems be optimizers?
Advanced systems would be optimizers because they internally search through a search space looking for those elements that score high according to some objective function that is explicitly represented within the system. This is true f