<a href="https://colab.research.google.com/github/papeye/qArXiv/blob/master/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install -q langchain \
langchain_openai \
arxiv \
PyMuPDF \
chromadb \
langchainhub

### Enter the DOI (arXiv identifier, e.g. 2202.10488) of your paper below, or choose one of the examples

In [None]:
# Colab form fields: the paper to build the quiz from and the chat model to use.
# The paper entry has the form "<identifier> - <title>"; free-text input is allowed.
doi = '2202.10488 - Charged Dark Matter in Supersymmetric Twin Higgs models' # @param ["2202.10488 - Charged Dark Matter in Supersymmetric Twin Higgs models"] {allow-input: true}
model = 'gpt-4' # @param ["gpt-3.5-turbo-16k", "gpt-4"]

# Keep only the first whitespace-separated token (the identifier) and drop the title.
# NOTE(review): an empty or whitespace-only form value raises IndexError here.
doi = doi.split()[0]

import time
import os
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.document_loaders import ArxivLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.llms import Ollama
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain.prompts import PromptTemplate

# API credentials are placed in the process environment; replace the
# placeholders with your own keys before running.
# NOTE(review): these assignments overwrite any value already present in the
# environment, and keys pasted here are stored in the notebook — consider
# getpass or Colab secrets instead.
# NOTE(review): the Hugging Face token is not used by any code visible in this
# file, and its placeholder text contains a typo ("TOKES").
os.environ["OPENAI_API_KEY"] = '<Your OPENAI_API_KEY>'
os.environ["HUGGINGFACEHUB_API_TOKEN"]='<Your HUGGINGFACEHUB_API_TOKES>'

# Load the documents matching the identifier through ArxivLoader (at most two).
# NOTE(review): load_max_docs=2 may pull in a second paper besides the requested one — confirm.
docs = ArxivLoader(query=doi, load_max_docs=2).load()

# Split the loaded documents into small overlapping chunks
# (chunk_size=200, chunk_overlap=20).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)
splits = text_splitter.split_documents(docs)

# Embed every chunk with OpenAIEmbeddings and index the result in a Chroma store.
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

# Retriever over the store; it supplies the "context" input of rag_chain below.
retriever = vectorstore.as_retriever()
# NOTE(review): `prompt` is not referenced anywhere else in the visible code
# (the chain is built from custom_rag_prompt), so this hub pull looks unused.
prompt = hub.pull("rlm/rag-prompt")


# Chat model chosen in the form at the top of the cell, with temperature 0.1.
llm=ChatOpenAI(temperature=0.1,model_name=model)
# Gemini can be used instead: https://python.langchain.com/docs/integrations/chat/google_generative_ai


def format_docs(docs):
    """Concatenate the text of the given documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in input order, separated by a blank line.
        An empty iterable yields an empty string.
    """
    separator = "\n\n"
    page_texts = [document.page_content for document in docs]
    return separator.join(page_texts)


def _quiz_item_schemas(number, ordinal):
    """Build the four response schemas describing one quiz question.

    Args:
        number: Question number used in the field names (e.g. ``1``).
        ordinal: Ordinal text used in the descriptions (e.g. ``"1st"``).

    Returns:
        A list with the question schema, the correct-answer schema and the
        two incorrect-answer schemas, in that order.
    """
    return [
        ResponseSchema(name=f"question{number}", description=f"question {number}"),
        ResponseSchema(
            name=f"correctAnswer{number}",
            description=f"correct answer to the {ordinal} generated question",
        ),
        ResponseSchema(
            name=f"incorrectAnswer{number}1",
            description=f"incorrect answer to the {ordinal} generated question",
        ),
        ResponseSchema(
            name=f"incorrectAnswer{number}2",
            description=f"second incorrect answer to the {ordinal} generated question",
        ),
    ]


# Flat list of schemas: all fields of question 1 followed by all fields of question 2.
response_schemas = [
    schema
    for number, ordinal in ((1, "1st"), (2, "2nd"))
    for schema in _quiz_item_schemas(number, ordinal)
]


# Parser for the structured reply, plus the formatting instructions that are
# injected into the prompt as {format_instructions}.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

# Quiz-generation prompt; edit the wording here to change what the quiz asks about.
# Placeholders: {context} and {quiz} are filled in by rag_chain at invocation
# time, while {format_instructions} is bound once below via partial_variables.
# NOTE(review): the prompt text contains the typo "ansewrs"; it is left
# untouched here because it is part of the string sent to the model.
template = """You will prepare a quiz based on the following context which is a scientific paper: {context}
You want to ask 2 questions, provided with both correct and incorrect ansewrs, about the content of this paper in form of a quiz in following format: {format_instructions}
You should ask about the results in the paper such as general conclusions but not about the structure of the paper.
When you will see word "Quiz!" you should output the quiz: {quiz}"""

# Prompt template with the parser's format instructions pre-filled.
custom_rag_prompt = PromptTemplate.from_template(template, partial_variables={"format_instructions": format_instructions},)

# The chain's single input string feeds both prompt variables: it is sent
# through the retriever (whose results are joined by format_docs) to produce
# "context", and it is passed through unchanged as "quiz".
_chain_inputs = {
    "context": retriever | format_docs,
    "quiz": RunnablePassthrough(),
}

# inputs -> prompt -> chat model -> structured-output parser
rag_chain = _chain_inputs | custom_rag_prompt | llm | output_parser


class Output:
    """Two-question quiz extracted from the parsed chain output.

    Each question carries one correct answer and two incorrect answers, stored
    as attributes named after the keys of the parsed output.
    """

    def __init__(self, output):
        """Copy the quiz fields out of the parsed chain output.

        Args:
            output: Mapping containing the keys ``question1``,
                ``correctAnswer1``, ``incorrectAnswer11``,
                ``incorrectAnswer12``, ``question2``, ``correctAnswer2``,
                ``incorrectAnswer21`` and ``incorrectAnswer22``.

        Raises:
            KeyError: If any of the expected keys is missing from ``output``.
        """
        self.question1 = output['question1']
        self.question2 = output['question2']
        self.correctAnswer1 = output['correctAnswer1']
        self.correctAnswer2 = output['correctAnswer2']
        self.incorrectAnswer11 = output['incorrectAnswer11']
        self.incorrectAnswer12 = output['incorrectAnswer12']
        self.incorrectAnswer21 = output['incorrectAnswer21']
        self.incorrectAnswer22 = output['incorrectAnswer22']

    def __str__(self):
        """Render both questions, each followed by its labelled answers.

        Returns:
            A multi-line string listing question 1 with its answers, then
            question 2 with its answers; every answer is tagged ``(correct)``
            or ``(incorrect)``.
        """
        # The second section must show question2; it previously repeated
        # question1, so the second question was never displayed.
        return f'''
{self.question1} \n
        {self.correctAnswer1} (correct) \n
        {self.incorrectAnswer11} (incorrect) \n
        {self.incorrectAnswer12} (incorrect) \n
        \n
{self.question2} \n
        {self.correctAnswer2} (correct)\n
        {self.incorrectAnswer21} (incorrect) \n
        {self.incorrectAnswer22} (incorrect) \n
        '''


# Generate the quiz, print it, and report how long the chain took.
# perf_counter is monotonic, so the elapsed time cannot be skewed by
# wall-clock adjustments.
start_time = time.perf_counter()

try:
    # "Quiz!" is the single chain input: it is used both as the retrieval
    # query and as the {quiz} value the prompt tells the model to react to.
    output = Output(rag_chain.invoke("Quiz!"))

    print(output)

    print("--- %s seconds ---" % (time.perf_counter() - start_time))
finally:
    # Drop the collection even when generation or parsing fails, so a failed
    # run does not leave the indexed chunks behind.
    vectorstore.delete_collection()


What is the SUSY breaking Majorana mass of ˜X referred to in the paper? 
 
        m ˜X (correct) 
 
        vS ˜S (incorrect) 

        2m ˜X ˜X ˜X (incorrect) 

        

What is the SUSY breaking Majorana mass of ˜X referred to in the paper? 
 
        Z (correct)

        X (incorrect) 

        S (incorrect) 

        
--- 5.322651147842407 seconds ---
