In [42]:
from langchain import PromptTemplate
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import ChatVectorDBChain
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import Chroma
from langchain_community.llms import Ollama

# import pyalex
from pyalex import Works

from apikey import OPENAI_API_KEY

# pyalex.config.email = "mail@example.com"

In [43]:
import os

# Mirror the key into the process environment under the name "OPENAI_API_KEY".
# NOTE(review): presumably needed by OpenAI clients that are constructed
# without an explicit key argument — confirm.
os.environ.update({"OPENAI_API_KEY": OPENAI_API_KEY})

In [31]:
# OpenAI client (temperature fixed at 0) used for the OpenAI-backed summary.
llm_openai = OpenAI(
    openai_api_key=OPENAI_API_KEY,
    temperature=0,
)

In [2]:
# DOI (as a URL) of the paper to summarise.
# Previously tried: "https://doi.org/10.48550/arXiv.2010.11929"
DOI = "https://doi.org/10.48550/arXiv.2308.02510"

# Look the DOI up through pyalex; the returned record is read with
# dict-style keys in the following cells.
paper_work_object = Works()[DOI]

In [3]:
# Show which fields the fetched record exposes.
paper_work_object.keys()

dict_keys(['id', 'doi', 'title', 'display_name', 'publication_year', 'publication_date', 'ids', 'language', 'primary_location', 'type', 'type_crossref', 'indexed_in', 'open_access', 'authorships', 'countries_distinct_count', 'institutions_distinct_count', 'corresponding_author_ids', 'corresponding_institution_ids', 'apc_list', 'apc_paid', 'has_fulltext', 'cited_by_count', 'cited_by_percentile_year', 'biblio', 'is_retracted', 'is_paratext', 'keywords', 'concepts', 'mesh', 'locations_count', 'locations', 'best_oa_location', 'sustainable_development_goals', 'grants', 'referenced_works_count', 'referenced_works', 'related_works', 'ngrams_url', 'abstract_inverted_index', 'cited_by_api_url', 'counts_by_year', 'updated_date', 'created_date'])

In [4]:
# Inspect the open-access metadata and pull out the full-text URL.
print(paper_work_object["open_access"])
full_text_url = paper_work_object["open_access"]["oa_url"]

# A work without an open-access copy has no usable URL; fail here with a clear
# message rather than letting the PDF loader choke on an empty value later.
if not full_text_url:
    raise ValueError(
        f"No open-access full text available for {paper_work_object['doi']}"
    )
print(full_text_url)

{'is_oa': True, 'oa_status': 'green', 'oa_url': 'https://arxiv.org/pdf/2308.02510', 'any_repository_has_fulltext': True}
https://arxiv.org/pdf/2308.02510


In [17]:
# Load the PDF from the open-access URL and split it into documents.
loader = PyPDFLoader(file_path=full_text_url)
pages = loader.load_and_split()

# Number of documents produced by the split.
len(pages)

15

In [18]:
# Local "mistral:latest" model accessed through the Ollama wrapper.
# To stream tokens to stdout while generating, additionally pass:
#   callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])
llm = Ollama(model="mistral:latest")

In [10]:
# Prompt handed to the summarize chain as `map_prompt`; `{text}` is the
# single placeholder filled in by the chain.
map_prompt = """
Write a concise summary of the following:
"{text}"
CONCISE SUMMARY:
"""
map_prompt_template = PromptTemplate(
    input_variables=["text"],
    template=map_prompt,
)

In [11]:
# Prompt handed to the summarize chain as `combine_prompt`; it asks for a
# bullet-point summary of whatever is substituted for `{text}`.
combine_prompt = """
Write a concise summary of the following text delimited by triple backquotes.
Return your response in bullet points which covers the key points of the text.
```{text}```
BULLET POINT SUMMARY:
"""
combine_prompt_template = PromptTemplate(
    input_variables=["text"],
    template=combine_prompt,
)

In [34]:
# Baseline run: map-reduce summarisation with the OpenAI model.
# No custom prompt templates are passed to this chain.
chain = load_summarize_chain(llm=llm_openai, chain_type="map_reduce")
summary_openai = chain.invoke(pages)

In [36]:
# Display the summary text produced by the OpenAI-backed chain.
summary_openai["output_text"]

'\n\nThis paper discusses a new method, N EURO IMAGEN, for reconstructing high-resolution images from EEG signals. It uses multi-level semantics extraction and a latent diffusion model to improve accuracy and overcome challenges of using EEG data. The method is compared to previous approaches and found to be superior. It also discusses the use of language models and evaluation metrics in the process. The study validates the method and demonstrates its effectiveness in generating high-quality images from brain signals. Other related works in the field are also mentioned.'

In [19]:
# Map-reduce summarisation with the local Ollama model and the custom
# map/combine prompts. `verbose=True` can be added here while debugging.
chain = load_summarize_chain(
    llm=llm,
    chain_type="map_reduce",
    combine_prompt=combine_prompt_template,
    map_prompt=map_prompt_template,
)
summary = chain.invoke(pages)

In [25]:
# Print the summary text produced by the local-model chain.
print(summary["output_text"])

 * In 2012, Krizhevsky et al. introduced deep learning models for image classification in the ImageNet Large Scale Visual Recognition Challenge (ILSVRC).
* In 2015, Schroff et al. proposed a deep neural network for face recognition using Facenet.
* In 2016, Deng et al. presented NeuroVision: end-to-end deep image reconstruction from brain activity.
* In 2017, Spampinato et al. introduced DeepMindSight, a neural network system that can learn to decode high-level visual percepts from human brain signals.
* In 2018, Chen et al. presented Brain2Image: generating high-resolution images from EEG signals using generative adversarial networks (GANs).
* In 2019, Shen et al. introduced N EURO IMAGEN: a method for reconstructing images from EEG signals using a multi-modal deep learning approach.
* In 2020, Zeng et al. presented the Controllable Mind Visual Diffusion Model for generating high-quality images and controlling their semantics using brain activity.
* In 2020, Zhao et al. introduced DAR

## Download the PDF manually (deprecated)


In [16]:
# TODO: get pdf using open alex
# Fallback path: read a PDF that was downloaded by hand.
# Note that this rebinds `loader` and `pages`.
pdf_path = "./paper_pdf/paper.pdf"
loader = PyPDFLoader(file_path=pdf_path)
pages = loader.load_and_split()
# Uncomment to inspect the first document:
# print(pages[0].page_content)

In [21]:
# Smoke-test the local model with a single prompt.
# Uses `.invoke(...)`, the same call style as the chains in this notebook,
# instead of calling the LLM object directly (deprecated in recent LangChain releases).
llm.invoke("Tell me about the history of AI")

ConnectionError: HTTPConnectionPool(host='localhost', port=11434): Max retries exceeded with url: /api/generate/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x148a67440>: Failed to establish a new connection: [Errno 61] Connection refused'))

## Requires an OpenAI API key


In [13]:
# Embed the loaded documents with OpenAI embeddings and store them in Chroma.
# The key is passed explicitly (as for `llm_openai`) so this cell does not
# depend on the environment variable having been set by an earlier cell.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
vectordb = Chroma.from_documents(pages, embedding=embeddings, persist_directory=".")
vectordb.persist()

In [None]:
# Conversational question answering over the Chroma store held in `vectordb`.
# "gpt-3.5-turbo" is a chat model, so it is wrapped with `ChatOpenAI` rather
# than the completion-style `OpenAI` class.
pdf_qa = ChatVectorDBChain.from_llm(
    ChatOpenAI(
        temperature=0.9,
        model_name="gpt-3.5-turbo",
        openai_api_key=OPENAI_API_KEY,
    ),
    vectordb,
    return_source_documents=True,
)

# NOTE(review): this question looks unrelated to the paper summarised in this
# notebook — confirm it is the intended query.
query = "What is the VideoTaskformer?"
# `chat_history` holds earlier turns of the conversation; it is an empty list
# for the first question.
result = pdf_qa.invoke({"question": query, "chat_history": []})
print("Answer:")
print(result["answer"])

## Reference


- https://github.com/EnkrateiaLucca/summarization_with_langchain (langchain sum example)
- https://medium.aiplanet.com/implementing-rag-using-langchain-ollama-and-chainlit-on-windows-using-wsl-92d14472f15d (rag with langchain & ollama)
- https://www.youtube.com/watch?v=BLM3KDaOTJM&list=PLQIgLu3Wf-q_Ne8vv-ZXuJ4mztHJaQb_v&index=4 (langchain tutorial)
- https://python.langchain.com/docs/integrations/llms/ollama (ollama with langchain)
- https://python.langchain.com/docs/use_cases/summarization (langchain summarization)
- https://www.youtube.com/watch?v=f9_BWhCI4Zo (work around for token limit)
- https://github.com/J535D165/pyalex?tab=readme-ov-file (pyalex documentation)
- https://www.youtube.com/watch?v=qaPMdcCqtWk&t=246s (summarization on different length input)
- https://www.youtube.com/watch?v=zlUOsSCkQaU (how summarization works tutorial) (good)
- https://github.com/gkamradt/langchain-tutorials/blob/main/LangChain%20Cookbook%20Part%201%20-%20Fundamentals.ipynb (langchain core concepts)

- https://medium.com/@zaiinn440/a-comparative-analysis-of-llms-like-bert-bart-and-t5-a4a873251ff (BART & BERT)
- https://python.langchain.com/docs/expression_language/why (langchain documentation)


## TODOs


- Summarization per section (need preprocessing. how?)
- Various methods of large-text summarization (stuffing, map-reduce, refine, map-rerank)
- possible to run locally? which machine?
- Use Text splitter instead of pdf split (RecursiveCharacterTextSplitter)
- Embed documents -> clustering to select best documents (?)
- prompt engineering
- evaluation metrics
- read langchain documentation
- RAG?
- integrate citations
- Athina AI -> evaluation framework (https://github.com/athina-ai/ariadne?tab=readme-ov-file) + langchain evaluation


## Questions


- Nothing new here, just a pipeline. Is this what the company expects?
- I don't have a firm idea of what the input/output should be or where this will be used.
- How should we search for a specific paper? By DOI? What happens if the paper doesn't provide open access? (https://docs.openalex.org/api-entities/works/get-a-single-work)
- Local model vs. OpenAI (a local model may take a long time).
- Should we cache papers that have already been accessed?


## ETC


- Can't attend next meeting
- From now on: reading papers, learning LangChain, implementation
