In [26]:
from langchain.document_loaders import PyPDFLoader
from langchain.chains.summarize import load_summarize_chain

# from langchain.embeddings import OpenAIEmbeddings
# from langchain.vectorstores import Chroma
# from langchain.chains import ChatVectorDBChain
# from langchain.llms import OpenAI
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import Ollama

# import pyalex
from pyalex import Works

# pyalex.config.email = "mail@example.com"

In [27]:
# DOI of the paper to summarise; the commented-out line is an alternative paper.
# DOI = "https://doi.org/10.48550/arXiv.2010.11929"
DOI = "https://doi.org/10.48550/arXiv.2308.02510"
# Look the work up through pyalex by its DOI; the result is accessed like a
# dict in the cells below.
paper_work_object = Works()[DOI]

In [28]:
# List the top-level fields available on the fetched work record.
paper_work_object.keys()

dict_keys(['id', 'doi', 'title', 'display_name', 'publication_year', 'publication_date', 'ids', 'language', 'primary_location', 'type', 'type_crossref', 'indexed_in', 'open_access', 'authorships', 'countries_distinct_count', 'institutions_distinct_count', 'corresponding_author_ids', 'corresponding_institution_ids', 'apc_list', 'apc_paid', 'has_fulltext', 'cited_by_count', 'cited_by_percentile_year', 'biblio', 'is_retracted', 'is_paratext', 'keywords', 'concepts', 'mesh', 'locations_count', 'locations', 'best_oa_location', 'sustainable_development_goals', 'grants', 'referenced_works_count', 'referenced_works', 'related_works', 'ngrams_url', 'abstract_inverted_index', 'cited_by_api_url', 'counts_by_year', 'updated_date', 'created_date'])

In [29]:
# The `open_access` record carries the URL of an open-access copy, if any.
open_access_info = paper_work_object["open_access"]
print(open_access_info)

full_text_url = open_access_info["oa_url"]
# Not every work has an open-access copy; stop here with a clear message
# instead of handing an empty URL to the PDF loader further down.
if not full_text_url:
    raise ValueError(f"No open-access full-text URL is available for {DOI}")
# NOTE(review): the URL is assumed to point at a PDF — confirm for non-arXiv works.
print(full_text_url)

{'is_oa': True, 'oa_status': 'green', 'oa_url': 'https://arxiv.org/pdf/2308.02510', 'any_repository_has_fulltext': True}
https://arxiv.org/pdf/2308.02510


In [45]:
# Load the paper from the open-access URL and split it into the documents
# (`pages`) that the summarisation chain consumes.
loader = PyPDFLoader(full_text_url)
pages = loader.load_and_split()
# NOTE(review): only needed for the commented-out debug print below.
from pprint import pprint

# Uncomment to inspect the loaded documents.
# pprint(pages)

In [31]:
# Mistral model served by Ollama; the Ollama server must be running for any
# call on `llm` to succeed. Generated tokens are streamed to stdout.
# Handlers are passed via `callbacks`; the older `callback_manager` argument
# is deprecated in LangChain.
llm = Ollama(
    model="mistral:latest",
    callbacks=[StreamingStdOutCallbackHandler()],
)

In [46]:
# Build a "map_reduce" summarisation chain on top of the local model and run
# it over the loaded pages. The result is a dict; the summary text lives
# under its "output_text" key.
chain = load_summarize_chain(llm=llm, chain_type="map_reduce")
summary = chain.invoke({"input_documents": pages})
# print(summary["output_text"])

KeyboardInterrupt: 

In [47]:
# Display the final summary text produced by the chain.
summary["output_text"]

' This summary covers research papers related to computer vision, neural networks, multimodal learning, and EEG signals:\n\n1. "NeuroVision: perceived image regeneration using cProGAN": Proposes a method for generating images based on human perception using conditional generative adversarial networks (cGAN).\n2. "ImageNet classification with deep convolutional neural networks" introduces ImageNet dataset and deep learning models to achieve high accuracy in image recognition.\n3. "Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models": Discusses a method for multimodal pre-training using large language models and frozen image encoders.\n4. "Geometric GAN" focuses on improving geometric consistency of generated images using Generative Adversarial Networks (GANs).\n5. "Mode seeking generative adversarial networks for diverse image synthesis": Presents a method for generating diverse images by exploiting modes in the data distribution.\n6. "

## Download PDF manually (deprecated)


In [16]:
# TODO: get pdf using open alex
pdf_path = "./paper_pdf/paper.pdf"
loader = PyPDFLoader(pdf_path)
pages = loader.load_and_split()
# print(pages[0].page_content)

In [21]:
# Quick smoke test of the local model. Uses `.invoke`, consistent with the
# summarisation cell; calling the LLM object directly is deprecated in
# LangChain. Fails with a ConnectionError if the Ollama server is not running.
llm.invoke("Tell me about the history of AI")

ConnectionError: HTTPConnectionPool(host='localhost', port=11434): Max retries exceeded with url: /api/generate/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x148a67440>: Failed to establish a new connection: [Errno 61] Connection refused'))

## Requires an OpenAI API key


In [13]:
# These names are commented out in the notebook's import cell, so they are
# imported here to keep this optional OpenAI section runnable on a fresh kernel.
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Embed the loaded pages and store them in a Chroma index that is persisted
# to the current directory.
embeddings = OpenAIEmbeddings()
vectordb = Chroma.from_documents(pages, embedding=embeddings, persist_directory=".")
vectordb.persist()

In [None]:
# These names are commented out in the notebook's import cell, so they are
# imported here to keep this optional OpenAI section runnable on a fresh kernel.
from langchain.chains import ChatVectorDBChain
from langchain.chat_models import ChatOpenAI

# Question answering over the vector index built in the previous cell.
# "gpt-3.5-turbo" is a chat model, so it is wrapped in ChatOpenAI rather than
# the completion-style OpenAI class.
pdf_qa = ChatVectorDBChain.from_llm(
    ChatOpenAI(temperature=0.9, model_name="gpt-3.5-turbo"),
    vectordb,
    return_source_documents=True,
)

query = "What is the VideoTaskformer?"
# No prior conversation: pass an empty history list rather than an empty string.
result = pdf_qa.invoke({"question": query, "chat_history": []})
print("Answer:")
print(result["answer"])

## Reference


- https://github.com/EnkrateiaLucca/summarization_with_langchain (langchain sum example)
- https://medium.aiplanet.com/implementing-rag-using-langchain-ollama-and-chainlit-on-windows-using-wsl-92d14472f15d (rag with langchain & ollama)
- https://www.youtube.com/watch?v=BLM3KDaOTJM&list=PLQIgLu3Wf-q_Ne8vv-ZXuJ4mztHJaQb_v&index=4 (langchain tutorial)
- https://python.langchain.com/docs/integrations/llms/ollama (ollama with langchain)
- https://www.youtube.com/watch?v=iMDBMTFT0ns (pdf sum using langchain)
- https://github.com/gkamradt/langchain-tutorials/blob/main/data_generation/5%20Levels%20Of%20Summarization%20-%20Novice%20To%20Expert.ipynb (5 ways of summarization)
- https://python.langchain.com/docs/use_cases/summarization (langchain summarization)
- https://www.youtube.com/watch?v=f9_BWhCI4Zo (workaround for token limit)
- https://github.com/J535D165/pyalex?tab=readme-ov-file (pyalex documentation)


## Problems


- How to search for a specific paper? (https://docs.openalex.org/api-entities/works/get-a-single-work)
- What if there is no full-text access? Many papers don't provide a PDF.
- Can I use the OpenAI API? Are there any restrictions?
- How to control the output length? (The default is too long.)
