In [None]:
!pip install openai==0.27.6
!pip install PyPDF2==3.0.1
!pip install arxiv==1.4.7
!pip install pyyaml==6.0

from IPython.display import clear_output
clear_output()
print("Libraries Installed")

Libraries Installed


# Libraries and Functions

In [None]:
import arxiv 
import openai
from PyPDF2 import PdfReader
from tqdm import tqdm
import yaml

from dataclasses import dataclass, field
from itertools import islice
import json
import os
from shutil import move
from time import sleep
from typing import Union, List, Literal, TypedDict

In [None]:
# Read the API key for ChatGPT from a local YAML file (kept out of the notebook)
with open('openai_key.yaml', 'r') as key_file:
  openai.api_key = yaml.safe_load(key_file)["openai_api_key"]

# Fail early when the key is missing or still the template placeholder
assert openai.api_key and openai.api_key != "YOUR_SECRET_KEY_HERE", (
  "OpenAI API key not provided - check openai_key.yaml"
)

# Read the prompts that will be used to make summaries
with open('prompts.yaml', 'r') as prompts_file:
  PROMPTS = yaml.safe_load(prompts_file)

In [None]:
def get_completion(prompt: str, model="gpt-3.5-turbo"):
    """ Send a single-message chat request to an OpenAI model

    Parameters
    ----------
    prompt: str
        Prompt (instruction) sent as the only user message
    model:
        Name of the OpenAI chat model to query

    Returns
    -------
    Content of the first choice in the model's response

    """
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,  # degree of randomness of the model's output
    )
    first_choice = completion.choices[0]
    return first_choice.message["content"]

In [None]:
def download_paper_from_arxiv(id_list: List[str]) -> List[str]:
    """ Download research papers from Arxiv by paper id

    Parameters
    ----------
    id_list: List[str]
        List with ids for each paper to download.
        An empty list returns immediately without contacting Arxiv.

    Returns
    -------
    papers_downloaded: List[str]
        List of paths for each downloaded file. Only paths that exist on disk
        after the download are included.

    """
    if not id_list:
        # Without ids the search would carry neither ids nor a query -
        # there is nothing meaningful to request
        return []

    search = arxiv.Search(id_list=id_list)
    papers_downloaded = []

    for paper in tqdm(search.results(), desc="Downloading files..."):
        file_path = paper.download_pdf()

        # Report only files that are really present after the download
        if os.path.isfile(file_path):
            papers_downloaded.append(file_path)

    return papers_downloaded

In [None]:
def download_recent_papers_by_querry(querry: str, limit: float = 10.0) -> List[str]:
    """ Download research papers from Arxiv by result of search querry

    Results are requested sorted by submission date.

    Parameters
    ----------
    querry: str
        Search querry (e.g. 'Deep Learning')

    limit: float
        Maximum number of papers to download.
        A value of zero or less returns immediately without contacting Arxiv.

    Returns
    -------
    papers_downloaded: List[str]
        List of paths for each downloaded file. Only paths that exist on disk
        after the download are included (same rule as download_paper_from_arxiv).

    """
    if limit <= 0:
        # Nothing was asked for - skip building and running the search
        return []

    search = arxiv.Search(
        query=querry,
        max_results=limit,
        sort_by=arxiv.SortCriterion.SubmittedDate)
    papers_downloaded = []

    for paper in tqdm(search.results(), desc="Downloading files..."):
        file_path = paper.download_pdf()

        # Report only files that are really present after the download
        if os.path.isfile(file_path):
            papers_downloaded.append(file_path)

    return papers_downloaded

In [None]:
@dataclass(frozen=True)
class PageData:
    """ Container for page(text) data

    Parameters
    ----------
    text : str
        Text of the page

    Attributes
    ----------
    words_count : int
        Number of whitespace-separated words in 'text' parameter.
        .. note:: Used as approximate number of tokens for chatGPT

    """
    text: str

    def __add__(self, other_page):
        # Build a new page from this page followed by another page or a plain
        # string; the two texts are separated by ' \n'.
        if isinstance(other_page, str):
            other_text = other_page
        else:
            other_text = getattr(other_page, "text", None)

        if not isinstance(other_text, str):
            # Unsupported operand: let Python raise the standard TypeError
            # instead of failing on a missing attribute
            return NotImplemented

        return self.__class__(self.text + ' \n' + other_text)

    def __repr__(self) -> str:
        # Show the size only - page texts are too long for a readable repr
        return f"PageData(number of words: {self.words_count})"

    @property
    def words_count(self) -> int:
        return len(self.text.split())

In [None]:
@dataclass
class PaperData:
    """ Research paper loaded from a PDF file: its path plus the text of every page

    Parameters
    ----------
    filepath : str
        Path to PDF file with research paper

    """
    filepath: str
    _pages: List[PageData] = field(init=False)

    def __post_init__(self):
        # Extract the text of every page once, right after construction
        pdf = PdfReader(self.filepath)
        extracted = []
        for pdf_page in pdf.pages:
            extracted.append(PageData(pdf_page.extract_text()))
        self._pages = extracted

    def __getitem__(self, i):
        # Pages are reachable by index (or slice), like a list
        return self._pages[i]

    def __repr__(self) -> str:
        page_total = len(self._pages)
        return f"PaperData(file: '{self.filepath}', number of pages: {page_total})"

    def join_pages_by_length(self, max_words: int = 1100) -> List[PageData]:
        """ Merge consecutive pages while the merged page stays within a word budget.

        .. note:: Used as token limiter for prompts filling

        Parameters
        ----------
        max_words: int
            Maximum number of words allowed in a page produced by merging.
            A single page that is already longer is kept as it is.

        Returns
        -------
        joined_pages: List[PageData]
            Pages after merging, in the original order

        """
        merged_pages = []
        pending = None  # merged page that is still being filled

        for page in self._pages:
            if pending is None:
                # First page opens the first merged page
                pending = page
            elif pending.words_count + page.words_count <= max_words:
                # Still within the budget - keep growing the pending page
                pending = pending + page
            else:
                # Budget would be exceeded - close the pending page and start a new one
                merged_pages.append(pending)
                pending = page

        if pending is not None:
            # Flush the page that was still being filled when the loop ended
            merged_pages.append(pending)

        return merged_pages
      

In [None]:
def create_batches(iterable, batch_size: int):
    """ Lazily yield consecutive lists of at most 'batch_size' items from iterable"""
    source = iter(iterable)
    while True:
        chunk = list(islice(source, batch_size))
        if not chunk:
            # Source exhausted (or batch_size is 0) - stop generating
            return
        yield chunk

In [None]:


def get_summary(text: Union[str, PaperData], max_tokens_per_prompt: int = 1200) -> str:
    """ Obtain Summary for research paper

    .. note:: Only the first batch of text is sent to the model; use
       get_summary_3 to process every batch.

    Parameters
    ----------
    text: Union[str, PaperData]
        String or PaperData object containing text to summarize
    max_tokens_per_prompt: int
        Limit of words fed to the prompt, to not exceed model's number of maximum tokens.

    Returns
    -------
    response: str
        Response from model, containing created summary

    Raises
    ------
    ValueError
        If no text batch could be built from 'text' (e.g. a paper without pages)

    """
    if isinstance(text, str):
        # Create batches from text, if it's too long
        text_batches = [' '.join(words) for words in
                        create_batches(text.split(' '), max_tokens_per_prompt)]
    else:
        # Make batches from pages by joining them up to the maximum number of words
        text_batches = [page.text for page in text.join_pages_by_length(max_tokens_per_prompt)]

    if not text_batches:
        raise ValueError("Nothing to summarize - no text found in input")

    # The text is wrapped in matching ''' delimiters on both sides
    base_prompt = f"""{PROMPTS["summary"]} '''{text_batches[0]}'''"""

    return get_completion(base_prompt)

In [None]:
def get_summary_3(text: Union[str, PaperData], max_tokens_per_prompt: int = 1200) -> List[str]:
    """ Obtain summary for research paper, extended batch by batch

    The first batch is summarized with the "summary" prompt. Every following
    batch is sent together with the previous response, asking the model to
    fill that summary with the missing information.

    Parameters
    ----------
    text: Union[str, PaperData]
        String or PaperData object containing text to summarize
    max_tokens_per_prompt: int
        Limit of words fed to the prompt, to not exceed model's number of maximum tokens.

    Returns
    -------
    responses: List[str]
        One model response per text batch, in order. The last element is the
        summary after all batches were processed.

    Raises
    ------
    ValueError
        If no text batch could be built from 'text' (e.g. a paper without pages)

    """
    prompts_per_minute = 3  # rate limit of GPT this function stays under

    if isinstance(text, str):
        # Create batches from text, if it's too long
        text_batches = [' '.join(words) for words in
                        create_batches(text.split(' '), max_tokens_per_prompt)]
    else:
        # Make batches from pages by joining them up to the maximum number of words
        text_batches = [page.text for page in text.join_pages_by_length(max_tokens_per_prompt)]

    if not text_batches:
        raise ValueError("Nothing to summarize - no text found in input")

    # The text is wrapped in matching ''' delimiters on both sides
    base_prompt = f"""{PROMPTS["summary"]} '''{text_batches[0]}'''"""

    responses = [get_completion(base_prompt)]
    prompts_used = 1  # Counter for ensuring not exceeding limit rate of GPT

    for text_batch in tqdm(text_batches[1:]):
        if prompts_used % prompts_per_minute == 0:
            # The allowed number of prompts was sent - wait before the next one
            sleep(60)

        continue_prompt = f"""
      You have to identify the following items from given text, delimited by triple backticks:
      - New Features: (listed every name for new features, functions and functionalities and corresponding components)
      - New Stategies: (listed new stategies and techniques)
      - Problems: (listed tackled problems and approaches)
      - Design: (network design)
      
      After that fill the summary in quotes with missing informations.
      text: '''{text_batch}'''

      summary to fill: {responses[-1]}
      
      """
        # Send the follow-up prompt (not the first-batch prompt again), so the
        # current batch and the previous summary actually reach the model
        responses.append(get_completion(continue_prompt))
        prompts_used += 1

    return responses

In [None]:
def get_description_json(text: Union[str, PaperData], max_tokens: int = 1100) -> dict:
    """ Get short description of paper in JSON format

    Parameters
    ----------
    text: Union[str, PaperData]
        Text to describe, or a paper whose leading pages are described
        .. warning:: A string with more than 'max_tokens' words is trimmed to that count.
           For a PaperData only the first group of joined pages is used.

    max_tokens: int
        Maximum number of tokens(words) from paper used in prompt

    Returns
    -------
    JSON (as dict) with short description of paper

    Raises
    ------
    ValueError
        If a PaperData without any page is given
    json.JSONDecodeError
        If the model's response is not valid JSON

    """
    # Fit text into max token amount
    if isinstance(text, str):
        # Keep the first 'max_tokens' words and glue them back into one string
        text_to_describe = ' '.join(text.split(' ')[:max_tokens])
    else:
        # Concatenate leading pages up to the maximum length to fit more of them into prompt
        joined_pages = text.join_pages_by_length(max_tokens)
        if not joined_pages:
            raise ValueError("Nothing to describe - paper contains no pages")
        text_to_describe = joined_pages[0].text

    prompt = f"""{PROMPTS["short_decription_json"]} '''{text_to_describe}'''"""

    response = get_completion(prompt)

    # The response is parsed as-is; any text around the JSON makes this fail
    return json.loads(response)


# Define files to summarize (choose one option)

In [None]:
#@markdown Specify settings for downloading papers by search querry
#@markdown ---
#@markdown Note: This will download most recent papers found with querry
SEARCH_QUERRY = "Artificial Intelligence" #@param {type:"string"}
assert SEARCH_QUERRY, "Empty search query"
#@markdown ---
#@markdown Number of papers to download
#@markdown ---

NUMBER_OF_PAPERS = 3 #@param {type:"integer"}
assert NUMBER_OF_PAPERS > 0, "Number of papers must be positive"
#@markdown <br>

# Fetch the most recently submitted papers matching the query
DOWNLOADED_PAPERS = download_recent_papers_by_querry(SEARCH_QUERRY, NUMBER_OF_PAPERS)
print("\nDownloaded papers:")
for p in DOWNLOADED_PAPERS:
  print(f"\t{p}")

Downloading files...: 3it [00:03,  1.27s/it]


Downloaded papers:
	./2305.05666v1.Policy_Gradient_Methods_in_the_Presence_of_Symmetries_and_State_Abstractions.pdf
	./2305.05665v1.ImageBind_One_Embedding_Space_To_Bind_Them_All.pdf
	./2305.05661v1.ShapeCoder_Discovering_Abstractions_for_Visual_Programs_from_Unstructured_Primitives.pdf





<h3><b>or</b></h3>

In [None]:
#@markdown Provide papers to download by Arxiv ID
#@markdown ---
#@markdown Each paper id must be separated by space (' ') or comma (',')
PAPERS_BY_ID = "2301.05586, 2305.04889" #@param {type:"string"}

# Treat spaces like commas, split, and drop the empty pieces left by ", "
PAPERS_BY_ID = [paper_id
                for paper_id in PAPERS_BY_ID.replace(' ', ',').split(',')
                if paper_id]
DOWNLOADED_PAPERS = download_paper_from_arxiv(PAPERS_BY_ID)
print("\nDownloaded papers:")
for paper_path in DOWNLOADED_PAPERS:
  print(f"\t{paper_path}")

Downloading files...: 2it [00:01,  1.19it/s]


Downloaded papers:
	./2301.05586v1.YOLOv6_v3_0_A_Full_Scale_Reloading.pdf
	./2305.04889v1.Improving_Real_Time_Bidding_in_Online_Advertising_Using_Markov_Decision_Processes_and_Machine_Learning_Techniques.pdf





# Inference Examples

## Single file example usage

In [None]:
# Fetch the example paper (Arxiv id 2301.05586v1) that the following cells read from disk
download_paper_from_arxiv(["2301.05586v1"])

In [None]:
# Load the downloaded PDF and request its short description;
# as the last expression of the cell, the returned dict is displayed
paper = PaperData("./2301.05586v1.YOLOv6_v3_0_A_Full_Scale_Reloading.pdf")
get_description_json(paper)

In [None]:
# Summarize the same PDF batch by batch; get_summary_3 returns a list of responses
paper = PaperData("./2301.05586v1.YOLOv6_v3_0_A_Full_Scale_Reloading.pdf")
summary = get_summary_3(paper)
# Show the first response of that list
print(summary[0])

100%|██████████| 5/5 [04:20<00:00, 52.07s/it]

New Features:
- YOLOv6 v3.0 has numerous novel enhancements on the network architecture and the training scheme.
- The neck of the detector is renewed with a Bi-directional Concatenation (BiC) module to provide more accurate localization signals.
- Anchor-aided training (AAT) strategy is proposed to enjoy the advantages of both anchor-based and anchor-free paradigms without touching inference efficiency.
- YOLOv6 is deepened to have another stage in the backbone and the neck, which reinforces it to hit a new state-of-the-art performance on the COCO dataset at a high-resolution input.
- A new self-distillation strategy is involved to boost the performance of small models of YOLOv6.

New Strategies:
- Anchor-aided training (AAT) strategy is proposed to enjoy the advantages of both anchor-based and anchor-free paradigms without touching inference efficiency.
- A new self-distillation strategy is involved to boost the performance of small models of YOLOv6.

Problems:
- The YOLO community h




## Make List with short Summaries

In [None]:
PROMPTS_PER_MINUTE = 3 # Rate limit of prompts this loop stays under
summaries = []

for i, p in enumerate(tqdm(DOWNLOADED_PAPERS)):
  try:
    # Each paper costs one prompt: pause once the allowed number was sent,
    # before sending the next one
    if i > 0 and i % PROMPTS_PER_MINUTE == 0:
      sleep(60)

    # Make short description for each paper and remember which file it belongs to
    paper = PaperData(p)
    desc = get_description_json(paper)
    desc["filename"] = os.path.basename(p)
    summaries.append(desc)

  except Exception as err:
    # Best effort: report the failing paper and continue with the next one
    print(f"""Failed to summarize: {p}
            - {err}""")

100%|██████████| 3/3 [00:13<00:00,  4.39s/it]


In [None]:
# Persist the collected short descriptions as pretty-printed JSON
with open("short_summary.json", 'w') as summary_file:
  json.dump(summaries, summary_file, indent=2)

## Make Packages with longer summary

In [None]:
# Only the first downloaded paper is processed here (note the [:1] slice)
for i, p in enumerate(tqdm(DOWNLOADED_PAPERS[:1])):
  # Coarse pause between papers; get_summary_3 additionally throttles its own prompts
  if i > 0 and i % 4 == 0:
    sleep(60)

  try:
    paper = PaperData(p)
    summary = get_summary_3(paper)

    # Package directory = title part of the file name
    # (text after the last '.' once the extension is removed)
    pdf_name = os.path.basename(p)
    paper_dir = os.path.splitext(pdf_name)[0].split('.')[-1]
    os.makedirs(paper_dir, exist_ok=True)

    # Put the PDF next to its summary; the last response is the final summary
    move(p, os.path.join(paper_dir, pdf_name))
    # Explicit UTF-8: the summary may contain characters outside the platform's default encoding
    with open(os.path.join(paper_dir, "summary.txt"), 'w', encoding="utf-8") as f:
      f.write(summary[-1])

  except Exception as err:
    # Best effort: report the failing paper and continue with the next one
    print(f"""Failed to summarize: {p}
            - {err}""")


  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:11<00:45, 11.44s/it][A
 40%|████      | 2/5 [00:21<00:31, 10.49s/it][A
 60%|██████    | 3/5 [00:34<00:23, 11.70s/it][A
 80%|████████  | 4/5 [01:44<00:34, 34.64s/it][A
100%|██████████| 5/5 [01:57<00:00, 23.54s/it]
100%|██████████| 1/1 [02:07<00:00, 127.43s/it]


In [None]:
# Pack every package directory that exists for the downloaded papers into a zip archive
for p in DOWNLOADED_PAPERS:
  # Directory name = title part of the file name (text after the last '.' once the extension is removed)
  paper_directory = os.path.splitext(os.path.basename(p))[0].split('.')[-1]
  if os.path.isdir(paper_directory):
    # Build the archive name in Python first, so the shell line only has to
    # substitute plain $variables
    zipname = paper_directory + ".zip"
    # zip options: -r recurses into the directory, -m moves the files into the archive
    !zip -rm $zipname $paper_directory 

  adding: YOLOv6_v3_0_A_Full_Scale_Reloading/ (stored 0%)
  adding: YOLOv6_v3_0_A_Full_Scale_Reloading/.ipynb_checkpoints/ (stored 0%)
  adding: YOLOv6_v3_0_A_Full_Scale_Reloading/2301.05586v1.YOLOv6_v3_0_A_Full_Scale_Reloading.pdf (deflated 17%)
  adding: YOLOv6_v3_0_A_Full_Scale_Reloading/summary.txt (deflated 56%)


## (Optional) Save Paper and summary in Google Drive or download

### Download

In [None]:
from google.colab import files

# Offer every existing package archive as a browser download
for paper_path in DOWNLOADED_PAPERS:
  paper_stem = os.path.splitext(os.path.basename(paper_path))[0]
  archive_name = paper_stem.split('.')[-1] + ".zip"

  if not os.path.isfile(archive_name):
    # No archive was created for this paper
    continue

  files.download(archive_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Move to Google drive

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

GOOGLE_DRIVE_OUTPUT_DIRECTORY = "/content/drive/MyDrive/"

# Move every existing package archive into the Drive output directory
for paper_path in DOWNLOADED_PAPERS:
  paper_stem = os.path.splitext(os.path.basename(paper_path))[0]
  archive_name = paper_stem.split('.')[-1] + ".zip"

  if not os.path.isfile(archive_name):
    # No archive was created for this paper
    continue

  destination = os.path.join(GOOGLE_DRIVE_OUTPUT_DIRECTORY, archive_name)
  move(archive_name, destination)
  print(f"Moved {archive_name} to {GOOGLE_DRIVE_OUTPUT_DIRECTORY}")

## (Optional) Download short summary

In [None]:
# Imported again so this optional cell also works when the package download cell was skipped
from google.colab import files

# Offer the JSON file with the short summaries as a browser download
files.download("short_summary.json") 