# Load Libraries

In [1]:
!git clone https://github.com/michalkrawczyk/Arxiv_GPT_Summarizer.git
%cd Arxiv_GPT_Summarizer



Cloning into 'Arxiv_GPT_Summarizer'...
remote: Enumerating objects: 567, done.[K
remote: Counting objects: 100% (170/170), done.[K
remote: Compressing objects: 100% (106/106), done.[K
remote: Total 567 (delta 89), reused 137 (delta 64), pack-reused 397[K
Receiving objects: 100% (567/567), 1.89 MiB | 11.12 MiB/s, done.
Resolving deltas: 100% (294/294), done.
/content/Arxiv_GPT_Summarizer
Branch 'langchain' set up to track remote branch 'langchain' from 'origin'.
Switched to a new branch 'langchain'


In [2]:
!pip install .

from IPython.display import clear_output
clear_output()
print("Libraries Installed")

Libraries Installed


In [3]:
from tqdm import tqdm

import json
import os
from shutil import move
from time import sleep
import yaml
from google.colab import userdata

##LOAD you openai API key and run cell below

In [4]:
try:
    import openai

    with open("/content/Arxiv_GPT_Summarizer/openai_key.yaml", "r") as f:
        API_KEY = yaml.safe_load(f)["openai_api_key"]
        openai.api_key = API_KEY if API_KEY != "OPENAI_API_KEY" else  userdata.get("openai_key")

        OPENAI_AVAILABLE = bool(openai.api_key) and openai.api_key != "OPENAI_API_KEY"

except Exception as err:
    OPENAI_AVAILABLE = False
    print(err)

print("OPENAI loaded:", OPENAI_AVAILABLE)
assert OPENAI_AVAILABLE, "OpenAI not available - check the key"

OPENAI loaded: True


## Define Prompts
PromptHolder class is used to store or create predefined langchain prompts for later usage.

It is created to provide invidual set of prompts for different PaperDatasetLC instances

(among others for different models or purposes - e.g. for one dataset with medical topics and one for financial).
<br><br>
If user don't want to use multiple instances,
he can use default prompt holder (DEFAULT_PROMPT_REGISTER) from 'templates' module,
which is used by default when no PromptHolder is provided.

In [5]:
from templates import create_and_register_prompt, register_prompt, PromptHolder, DEFAULT_PROMPT_REGISTER

## Create and Register Prompts
Those register functions are provided to quickly register prompts to given prompt holder.

They're outside PromptHolder class, as they are also made as convenient shortcuts to add prompts to default prompt holder.

In [6]:
features_prompt = """Create short, specific summary for research paper. Identify the following items for given text:
  - Model Name
  - Model category(e.g Object Detection, NLP or image generation)
  - SOTA: if Model is State-of-the-Art
  - New Features: Introduced new model components, layers or other features, as keywords
  - New Strategies: New introduced learning strategies
  - Year: Year of publishing

  text: {text}

  {format_instructions}
  """

In [7]:
create_and_register_prompt(name="identify_features", template=features_prompt, input_variables=["text", "format_instructions"])

is equivalent to:

In [8]:
from langchain.prompts import PromptTemplate

prompt = PromptTemplate(template=features_prompt, input_variables=["text", "format_instructions"])
DEFAULT_PROMPT_REGISTER.load_defined_prompt(name="identify_features", prompt=prompt)



or

In [None]:
from langchain.prompts import PromptTemplate

prompt = PromptTemplate(template=features_prompt, input_variables=["text", "format_instructions"])
register_prompt(name="identify_features", prompt=prompt)



In [None]:
#TODO:PromptHolder other than default
#TODO: Add in project option to load prompt from yaml

# (Optional) Arxiv Utils

In [9]:
from arxiv_utils import download_paper_from_arxiv, download_recent_papers_by_querry
#TODO: Examples

# Paper Dataset
By default PaperDatasetLC is initialized with ChromaDB with OpenAI embeddings("text-embedding-ada-002") and text-davinci-003 model but can be initialized with any model and embedding loaded via langchain

In [10]:
from datasets import PaperDatasetLC

dataset = PaperDatasetLC()
# dataset = PaperDatasetLC(db=Chroma(embedding_function=embeddings), llm=model)

  warn_deprecated(
                    collection_metadata was transferred to model_kwargs.
                    Please confirm that collection_metadata is what you intended.
  warn_deprecated(


## Adding documents
Notes:
-  Every added document return id in database on creation, for easier later search
- Optional metadata, by now, only fills missing document metadata items

### Langchain documents
Adding Document class from langchain

In [None]:
# dataset.add_document
# dataset.add_documents

### PDF files

In [11]:
doc_ids = dataset.add_pdf_file("/content/Arxiv_GPT_Summarizer/sample_documents/2302.00386.pdf", metadata=None)
doc_ids

['5c6982b0-dbf9-11ee-98e5-0242ac1c000c',
 '5c698472-dbf9-11ee-98e5-0242ac1c000c',
 '5c698508-dbf9-11ee-98e5-0242ac1c000c',
 '5c698580-dbf9-11ee-98e5-0242ac1c000c',
 '5c6985f8-dbf9-11ee-98e5-0242ac1c000c',
 '5c698670-dbf9-11ee-98e5-0242ac1c000c',
 '5c6986e8-dbf9-11ee-98e5-0242ac1c000c',
 '5c698756-dbf9-11ee-98e5-0242ac1c000c',
 '5c6987c4-dbf9-11ee-98e5-0242ac1c000c',
 '5c698832-dbf9-11ee-98e5-0242ac1c000c',
 '5c6988aa-dbf9-11ee-98e5-0242ac1c000c',
 '5c698922-dbf9-11ee-98e5-0242ac1c000c',
 '5c698990-dbf9-11ee-98e5-0242ac1c000c',
 '5c6989fe-dbf9-11ee-98e5-0242ac1c000c',
 '5c698ab2-dbf9-11ee-98e5-0242ac1c000c',
 '5c698b20-dbf9-11ee-98e5-0242ac1c000c']

In [12]:
sample_text = ["Lorem impsum something something", "Some Other Text"]
sample_metas = [{"source": "sth", "v": True}, {"other": "sth"}]


dataset.add_texts(sample_text, sample_metas, skip_invalid=True) # only first text will be added due to missing 'source' value in metadata

Index of problematic record: '1'


['605f4620-dbf9-11ee-98e5-0242ac1c000c']

### Arxiv Papers

In [14]:
print("Added document splits:",
      len(dataset.add_arxiv_by_id(["1812.01187", "2207.02696"])))
#TODO: info about ExtendedArxivRetriever used id paper add

Loading files...: 2it [00:01,  1.09it/s]


Added document splits: 72


In [None]:
    # max_docs: int = 10
    # top_k_results: int = 3
    # sort_docs_by: SortCriterion = SortCriterion.Relevance
    # sort_order: SortOrder = SortOrder.Descending
    # load_all_available_meta: bool = False

    # doc_content_chars_max: Union[int, None] = None
    # ARXIV_MAX_QUERY_LENGTH: Union[int, None] = 300

    # save_pdf: bool = True
    # file_save_dir: str = "."
    # overwrite_existing: bool = False

In [15]:
print("Added document splits:",
      len(dataset.add_arxiv_by_query(query="Yolov7", max_docs=2)))

Loading files...: 2it [00:01,  1.19it/s]


Added document splits: 45


## Listing functions

In [16]:
dataset.unique_list_of_documents()

Listing documents: 100%|██████████| 134/134 [00:00<00:00, 126017.21it/s]


[('',
  '/content/Arxiv_GPT_Summarizer/sample_documents/2302.00386.pdf',
  '/content/Arxiv_GPT_Summarizer/sample_documents/2302.00386.pdf'),
 ('Lightweight Object Detection: A Study Based on YOLOv7 Integrated with ShuffleNetv2 and Vision Transformer',
  '[http://arxiv.org/abs/2403.01736v1] Lightweight Object Detection: A Study Based on YOLOv7 Integrated with ShuffleNetv2 and Vision Transformer',
  './2403.01736v1.Lightweight_Object_Detection__A_Study_Based_on_YOLOv7_Integrated_with_ShuffleNetv2_and_Vision_Transformer.pdf'),
 ('Bag of Tricks for Image Classification with Convolutional Neural Networks',
  '[http://arxiv.org/abs/1812.01187v2] Bag of Tricks for Image Classification with Convolutional Neural Networks',
  './1812.01187v2.Bag_of_Tricks_for_Image_Classification_with_Convolutional_Neural_Networks.pdf'),
 ('YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors',
  '[http://arxiv.org/abs/2207.02696v1] YOLOv7: Trainable bag-of-freebies sets new

In [17]:
dataset.list_documents_by_id()

[('7209a640-dbf9-11ee-98e5-0242ac1c000c',
  'Lightweight Object Detection: A Study Based on YOLOv7 Integrated with ShuffleNetv2 and Vision Transformer - page: 1, part: 4'),
 ('69f36176-dbf9-11ee-98e5-0242ac1c000c',
  'YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors - page: 13, part: 34'),
 ('69f361e4-dbf9-11ee-98e5-0242ac1c000c',
  'YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors - page: 14, part: 35'),
 ('69f3632e-dbf9-11ee-98e5-0242ac1c000c',
  'YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors - page: 14, part: 36'),
 ('69f363a6-dbf9-11ee-98e5-0242ac1c000c',
  'YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors - page: 14, part: 37'),
 ('7209a190-dbf9-11ee-98e5-0242ac1c000c',
  'Lightweight Object Detection: A Study Based on YOLOv7 Integrated with ShuffleNetv2 and Vision Transformer - page: 0, part: 0'),
 ('7209a3e8-db

In [18]:
dataset.list_available_fields()

Listing available fields: 100%|██████████| 134/134 [00:00<00:00, 161597.68it/s]


['authors',
 'total_pages',
 'modDate',
 'producer',
 'split_part',
 'source',
 'trapped',
 'date',
 'v',
 'file_path',
 'page',
 'title',
 'subject',
 'summary',
 'published',
 'author',
 'keywords',
 'creationDate',
 'format',
 'creator']

In [20]:
dataset.get_by_id(doc_ids[-1], include=["metadatas", "documents"])

{'ids': ['5c698b20-dbf9-11ee-98e5-0242ac1c000c'],
 'embeddings': None,
 'documents': ['Z. Li, and J. Sun, “YOLOX: exceeding YOLO\nseries in 2021,” CoRR, vol. abs/2107.08430, 2021. III-A, III\n[19] C. Wang, A. Bochkovskiy, and H. M. Liao, “Yolov7: Trainable bag-of-\nfreebies sets new state-of-the-art for real-time object detectors,” CoRR,\nvol. abs/2207.02696, 2022. III\n[20] C. Wang, H. M. Liao, Y. Wu, P. Chen, J. Hsieh, and I. Yeh, “Cspnet: A\nnew backbone that can enhance learning capability of CNN,” in 2020\nIEEE/CVF Conference on Computer Vision and Pattern Recognition,\nCVPR Workshops 2020, Seattle, WA, USA, June 14-19, 2020, pp. 1571–\n1580, Computer Vision Foundation / IEEE, 2020. III-B\n'],
 'metadatas': [{'source': '/content/Arxiv_GPT_Summarizer/sample_documents/2302.00386.pdf',
   'file_path': '/content/Arxiv_GPT_Summarizer/sample_documents/2302.00386.pdf',
   'page': 4,
   'total_pages': 5,
   'format': 'PDF 1.5',
   'title': '',
   'author': '',
   'subject': '',
   'keywor

In [21]:
dataset.get_containing_field("v",  include=["metadatas"])

{'ids': ['605f4620-dbf9-11ee-98e5-0242ac1c000c'],
 'embeddings': None,
 'documents': None,
 'metadatas': [{'source': 'sth',
   'v': True,
   'title': 'Unknown Text',
   'split_part': 0}]}

## Search Functions

In [22]:
dataset.similarity_search("yolov7", n_results=1)

[Document(page_content='2015, San Diego, CA, USA, May 7-9,\n2015, Conference Track Proceedings (Y. Bengio and Y. LeCun, eds.),\n2015. II-A\n[9] C. Szegedy, W. Liu, Y. Jia, P. Sermanet, S. E. Reed, D. Anguelov,\nD. Erhan, V. Vanhoucke, and A. Rabinovich, “Going deeper with\nconvolutions,” in IEEE Conference on Computer Vision and Pattern\nRecognition, CVPR 2015, Boston, MA, USA, June 7-12, 2015, pp. 1–9,\nIEEE Computer Society, 2015. II-A\n[10] H. Cai, C. Gan, T. Wang, Z. Zhang, and S. Han, “Once-for-all: Train one\nnetwork and specialize it for efﬁcient deployment,” in 8th International\nConference on Learning Representations, ICLR 2020, Addis Ababa,\nEthiopia, April 26-30, 2020, OpenReview.net, 2020. II-B\n[11] C. Li, L. Li, H. Jiang, K. Weng, Y. Geng, L. Li, Z. Ke, Q. Li, M. Cheng,\nW. Nie, Y. Li, B. Zhang, Y. Liang, L. Zhou, X. Xu, X. Chu, X. Wei,\nand X. Wei, “Yolov6: A single-stage object detection framework for\nindustrial applications,” CoRR, vol. abs/2209.02976, 2022. III-A\n[1

In [25]:
dataset.similarity_search_with_scores("yolov7", n_results=2, score_threshold=0.45)

[(Document(page_content='2015, San Diego, CA, USA, May 7-9,\n2015, Conference Track Proceedings (Y. Bengio and Y. LeCun, eds.),\n2015. II-A\n[9] C. Szegedy, W. Liu, Y. Jia, P. Sermanet, S. E. Reed, D. Anguelov,\nD. Erhan, V. Vanhoucke, and A. Rabinovich, “Going deeper with\nconvolutions,” in IEEE Conference on Computer Vision and Pattern\nRecognition, CVPR 2015, Boston, MA, USA, June 7-12, 2015, pp. 1–9,\nIEEE Computer Society, 2015. II-A\n[10] H. Cai, C. Gan, T. Wang, Z. Zhang, and S. Han, “Once-for-all: Train one\nnetwork and specialize it for efﬁcient deployment,” in 8th International\nConference on Learning Representations, ICLR 2020, Addis Ababa,\nEthiopia, April 26-30, 2020, OpenReview.net, 2020. II-B\n[11] C. Li, L. Li, H. Jiang, K. Weng, Y. Geng, L. Li, Z. Ke, Q. Li, M. Cheng,\nW. Nie, Y. Li, B. Zhang, Y. Liang, L. Zhou, X. Xu, X. Chu, X. Wei,\nand X. Wei, “Yolov6: A single-stage object detection framework for\nindustrial applications,” CoRR, vol. abs/2209.02976, 2022. III-A\n[

In [None]:
dataset.update_document_features(doc_ids[1])
dataset.get_containing_field("new_features", include=["metadatas"])["metadatas"]

Updating metadata: 1it [00:01,  1.96s/it]


[{'source': '/content/Arxiv_GPT_Summarizer/sample_documents/2302.00386.pdf',
  'file_path': '/content/Arxiv_GPT_Summarizer/sample_documents/2302.00386.pdf',
  'page': 0,
  'total_pages': 5,
  'format': 'PDF 1.5',
  'title': '',
  'author': '',
  'subject': '',
  'keywords': '',
  'creator': 'LaTeX with hyperref',
  'producer': 'pdfTeX-1.40.21',
  'creationDate': 'D:20230202013154Z',
  'modDate': 'D:20230202013154Z',
  'trapped': '',
  'split_part': 1,
  'new_features.model_name': 'RepVGG',
  'new_features.model_category': 'Object Detection',
  'new_features.sota': 0,
  'new_features.new_features': "['3x3 convolutional kernel', 'winograd algorithm']",
  'new_features.new_strategies': "['EfﬁcientRep backbone', 'Rep-PAN neck', 'Bep(Beer-mug) unit', 'BepC3(CSPStack-Rep) block']",
  'new_features.date': '2021',
  'new_features.cls._type': 'ShortInfoSummary',
  'new_features': 'metadata keys [new_features.cls._type, new_features.model_name, new_features.model_category, new_features.sota, new

In [26]:
from langchain.output_parsers import PydanticOutputParser
# from typing import List, Union
# from pydantic import BaseModel, Field
# class ShortInfoSummary(BaseModel):
#     model_name: str = Field(
#         description="Name of the model if provided"
#     )
#     model_category: str = Field(
#         description="Model Category (e.g Object Detection, NLP or image generation)",

#     )
#     sota: int = Field(description="Boolean - Is this model State-of-the-Art?")
#     new_features: Union[List[str],str] = Field(
#         description="Introduced new model components, layers or other features, as keywords, each seperated by commas",
#     )
#     new_strategies: Union[List[str],str]  = Field(
#         description="New strategies introduced, as keywords"
#     )
#     date: str = Field(description="Date of the paper")

from templates import ShortInfoSummary


parser = PydanticOutputParser(pydantic_object=ShortInfoSummary)

In [27]:
dataset.llm_doc_meta_updater(update_key="test", prompt="identify_features", document_ids=doc_ids[0], output_parser=parser)
dataset.get_by_id(doc_ids[0], include=["metadatas"])["metadatas"][0]

Updating metadata: 1it [00:01,  1.66s/it]


{'source': '/content/Arxiv_GPT_Summarizer/sample_documents/2302.00386.pdf',
 'file_path': '/content/Arxiv_GPT_Summarizer/sample_documents/2302.00386.pdf',
 'page': 0,
 'total_pages': 5,
 'format': 'PDF 1.5',
 'title': '',
 'author': '',
 'subject': '',
 'keywords': '',
 'creator': 'LaTeX with hyperref',
 'producer': 'pdfTeX-1.40.21',
 'creationDate': 'D:20230202013154Z',
 'modDate': 'D:20230202013154Z',
 'trapped': '',
 'split_part': 0,
 'test.model_name': 'EfﬁcientRep',
 'test.model_category': 'Object Detection',
 'test.sota': 0,
 'test.new_features': "['Repvgg-style architecture', 'Hardware-aware neural network design']",
 'test.new_strategies': "['Hardware-aware neural network design']",
 'test.date': '2021',
 'test.cls._type': 'ShortInfoSummary',
 'test': 'metadata keys [test.cls._type, test.model_name, test.model_category, test.sota, test.new_features, test.new_strategies, test.date]'}

In [None]:
from utils import get_document_name

result, source_docs = dataset.llm_search("tell me about yolov7", chain_type="map_reduce", return_source_documents=True)
print('\033[92m', "Answer:", '\033[0m', result, "\n")

print('\033[92m', "Source Documents:", '\033[0m' )
for doc in source_docs:
  print(get_document_name(doc))
# [doc.metadata for doc in source_docs]

[92m Answer: [0m YOLOv7 is a model developed by Chien-Yao Wang and Alexey Bochkovskiy et al. in 2022. It integrates strategies such as E-ELAN, model scaling, and model re-parameterization to achieve a balance between detection efficiency and precision. The network consists of four main modules: Input, Backbone, Head, and Prediction. The Input module uses mosaic and hybrid data enhancement techniques, while the Backbone network includes components like CBS, E-ELAN, and MP1. The Head network utilizes the Feature Pyramid Network architecture. YOLOv7 has shown superior performance in speed and accuracy compared to other object detectors, achieving the highest accuracy of 56.8% AP on test-dev / 56.8% AP min-val among real-time object detectors with 30 FPS or higher on GPU V100. 

[92m Source Documents: [0m
2302.00386.pdf - page: 3, part: 14
Underwater target detection based on improved YOLOv7 - page: 0, part: 5
YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time ob

In [None]:
dataset.search_by_field(
  field_name="source",
  search_value="*pdf",
  regex_match=True,
  include = ["embeddings"])
# summarize_paper

Searching documents: 100%|██████████| 136/136 [00:00<00:00, 164435.09it/s]


{'ids': [], 'embeddings': [], 'documents': None, 'metadatas': None}

In [None]:
[ get_document_name(d) for d in dataset.search_by_name('real-time object detectors', regex_match=True, include=["metadatas"])['metadatas']]

Searching documents: 100%|██████████| 136/136 [00:00<00:00, 127497.84it/s]


['YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors - page: 0, part: 37',
 'YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors - page: 0, part: 20',
 'YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors - page: 0, part: 21',
 'YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors - page: 0, part: 22',
 'YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors - page: 0, part: 23',
 'YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors - page: 0, part: 24',
 'YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors - page: 0, part: 25',
 'YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors - page: 0, part: 26',
 'YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time obje

In [None]:
# search document by part of name
found_docs = dataset.search_by_name("2302.00386", include=["metadatas"], regex_match=True)
for doc_id, doc_meta in zip(found_docs["ids"], found_docs["metadatas"]):
  print(doc_id, " : ", get_document_name(doc_meta))


Searching documents: 100%|██████████| 136/136 [00:00<00:00, 106244.24it/s]

6b4c8e36-d365-11ee-aee1-0242ac1c000c  :  2302.00386.pdf - page: 0, part: 2
6b4c8eb8-d365-11ee-aee1-0242ac1c000c  :  2302.00386.pdf - page: 1, part: 3
6b4c8f3a-d365-11ee-aee1-0242ac1c000c  :  2302.00386.pdf - page: 1, part: 4
6b4c8fb2-d365-11ee-aee1-0242ac1c000c  :  2302.00386.pdf - page: 1, part: 5
6b4c9020-d365-11ee-aee1-0242ac1c000c  :  2302.00386.pdf - page: 1, part: 6
6b4c9098-d365-11ee-aee1-0242ac1c000c  :  2302.00386.pdf - page: 1, part: 7
6b4c9106-d365-11ee-aee1-0242ac1c000c  :  2302.00386.pdf - page: 2, part: 8
6b4c9174-d365-11ee-aee1-0242ac1c000c  :  2302.00386.pdf - page: 2, part: 9
6b4c91ec-d365-11ee-aee1-0242ac1c000c  :  2302.00386.pdf - page: 2, part: 10
6b4c925a-d365-11ee-aee1-0242ac1c000c  :  2302.00386.pdf - page: 2, part: 11
6b4c92c8-d365-11ee-aee1-0242ac1c000c  :  2302.00386.pdf - page: 3, part: 12
6b4c9336-d365-11ee-aee1-0242ac1c000c  :  2302.00386.pdf - page: 3, part: 13
6b4c93a4-d365-11ee-aee1-0242ac1c000c  :  2302.00386.pdf - page: 3, part: 14
6b4c9412-d365-11ee-a




# Inference Examples

## (Optional) Save Paper and summary in Google Drive or download

### Download

In [None]:
from google.colab import files

for p in DOWNLOADED_PAPERS:
  zipfile = os.path.splitext(os.path.basename(p))[0].split('.')[-1] + ".zip"

  if os.path.isfile(zipfile):
    files.download(zipfile)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Move to Google drive

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

GOOGLE_DRIVE_OUTPUT_DIRECTORY = "/content/drive/MyDrive/"
for p in DOWNLOADED_PAPERS:
  zipfile = os.path.splitext(os.path.basename(p))[0].split('.')[-1] + ".zip"

  if os.path.isfile(zipfile):
    move(zipfile, os.path.join(GOOGLE_DRIVE_OUTPUT_DIRECTORY, zipfile))
    print(f"Moved {zipfile} to {GOOGLE_DRIVE_OUTPUT_DIRECTORY}")

## (Optional) Download short summary

In [None]:
from google.colab import files

files.download("short_summary.json")