<a href="https://colab.research.google.com/github/martinapugliese/summarise-sci-literature/blob/main/notebooks/Gemini_summarise_AI_literature.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# this is to get time execution on each cell
!pip install ipython-autotime

%load_ext autotime

time: 353 µs (started: 2025-02-22 16:12:04 +00:00)


In [2]:
# standard library
import html
import json
import os
import re
import shutil
import urllib.request as urllib_req
from datetime import date, datetime
from inspect import cleandoc

# third-party
import numpy as np
import requests
from bs4 import BeautifulSoup
from google import genai
from google.colab import userdata, drive
from google.genai import Client, types
from pydantic import BaseModel, Field

  warn(


time: 3.32 s (started: 2025-02-22 16:12:05 +00:00)


## Configs

In [3]:
# Gemini API client; the key is read from the Colab secrets store rather than hardcoded
client = Client(api_key=userdata.get('GEMINI_API_KEY'))

time: 846 ms (started: 2025-02-22 16:12:10 +00:00)


In [4]:
# Configure which Gemini model to run (a single assignment, so the active choice is unambiguous).
# Alternative that was also tried: "gemini-2.0-flash-lite-preview-02-05"
model_id = "gemini-2.0-flash"

time: 517 µs (started: 2025-02-22 16:12:12 +00:00)


## Scrape ArXiv page for latest day's AI papers

Get the paper links and IDs, using the most recent day of publications available.

In [5]:
# Listing of recent cs.AI submissions; show=2000 asks for everything on one page, so no need to paginate
webpage = "https://arxiv.org/list/cs.AI/recent?skip=0&show=2000"

# a timeout stops the cell from hanging indefinitely if the server does not answer
r = requests.get(webpage, timeout=30)

# stop here on any 4xx/5xx response instead of parsing an error page in the cells below
r.raise_for_status()
r.status_code  # you want a 200 here

200

time: 195 ms (started: 2025-02-22 16:12:15 +00:00)


In [6]:
# Parse the downloaded listing page with Python's built-in HTML parser
soup = BeautifulSoup(markup=r.content, features="html.parser")

time: 3.96 s (started: 2025-02-22 16:12:17 +00:00)


In [7]:
# Heading of the most recent day: the first <h3> on the page
day_headings = soup.find_all("h3")
if not day_headings:
    raise ValueError("No <h3> day heading found on the listing page")
latest_day_str = day_headings[0].text

# The date is whatever precedes the first '(' in that heading.
# NOTE: surrounding whitespace is deliberately kept, because later code consumes `day` as-is.
day = latest_day_str.split('(')[0]

# The heading also carries the total number of entries for that day ("... of N entries ...").
# Raise instead of just printing on failure: otherwise `n_entries` would be undefined
# (or stale from an earlier run) and the following cells would slice with the wrong value.
match = re.search(r'of (\d+) entries', latest_day_str)
if match is None:
    raise ValueError(f"Failed to isolate latest day's info from heading: {latest_day_str!r}")

n_entries = int(match.group(1))
print(day, ' - ', n_entries, 'papers')

Fri, 21 Feb 2025   -  154 papers
time: 30.7 ms (started: 2025-02-22 16:12:21 +00:00)


In [8]:
# PDF anchors for the latest day only: the page lists days in order, so the
# first n_entries anchors are the ones belonging to the most recent day
paper_links = soup.find_all("a", attrs={"title": "Download PDF"})[:n_entries]

# The hrefs are site-relative; derive the absolute URL and the bare ID from each
pdf_hrefs = [anchor["href"] for anchor in paper_links]
paper_urls = ["https://arxiv.org" + href for href in pdf_hrefs]
# ID = last path segment, with anything from the first "v" onwards dropped
paper_ids = [href.split("/")[-1].split("v")[0] for href in pdf_hrefs]

time: 70.6 ms (started: 2025-02-22 16:12:24 +00:00)


In [9]:
# Titles sit in their own <div>s, apart from the PDF links (that is how the DOM is laid out).
# They appear in the same order as the links, so position is what ties a title to its paper.
paper_title_divs = soup.find_all("div", attrs={"class": "list-title mathjax"})[:n_entries]

# The second child node of each div holds the title text: take its second line, left-stripped
paper_titles = [
    div.contents[1].split('\n')[1].lstrip()
    for div in paper_title_divs
]

time: 75.6 ms (started: 2025-02-22 16:12:27 +00:00)


In [31]:
# sanity check: all four collections should have the same length
tuple(len(items) for items in (paper_urls, paper_ids, paper_links, paper_titles))

(154, 154, 154, 154)

time: 3.4 ms (started: 2025-02-22 16:35:24 +00:00)


In [32]:
# Map each paper ID to its PDF URL and persist the mapping as JSON
paper_metadata = {id_: {'url': url_} for id_, url_ in zip(paper_ids, paper_urls)}

# the context manager closes (and therefore flushes) the file as soon as the dump is done
with open('paper_metadata.json', 'w') as f:
    json.dump(paper_metadata, f)

time: 3.83 ms (started: 2025-02-22 16:35:47 +00:00)


## Download all papers locally

In [18]:
# exist_ok makes this cell re-runnable: an existing folder is reused and the
# downloads below simply overwrite files with the same name
os.makedirs('pdfs', exist_ok=True)

# fetch every PDF, saving it under its arXiv ID; report progress every 10 files
for i, (id_, url_) in enumerate(zip(paper_ids, paper_urls), start=1):
    urllib_req.urlretrieve(url_, f"pdfs/{id_}.pdf")

    if i % 10 == 0:
        print(f"Downloaded {i} papers")

Downloaded 10 papers
Downloaded 20 papers
Downloaded 30 papers
Downloaded 40 papers
Downloaded 50 papers
Downloaded 60 papers
Downloaded 70 papers
Downloaded 80 papers
Downloaded 90 papers
Downloaded 100 papers
Downloaded 110 papers
Downloaded 120 papers
Downloaded 130 papers
Downloaded 140 papers
Downloaded 150 papers
time: 21.9 s (started: 2025-02-22 16:13:39 +00:00)


In [19]:
# just to check the count: number of entries now in the download folder
downloaded_pdfs = os.listdir('pdfs')
len(downloaded_pdfs)

154

time: 3.99 ms (started: 2025-02-22 16:14:01 +00:00)


## Make Gemini summarise each paper

In [21]:
# Pydantic model describing the structured answer requested from Gemini
# (it is passed as `response_schema` in the generation call).
# All four fields are required; each `description` is part of the schema itself.
# NOTE(review): documented with comments rather than a class docstring, since pydantic
# is expected to copy a docstring into the generated schema - confirm before adding one.
class PaperInfo(BaseModel):
    title: str = Field(
        ...,
        description="Title of the paper",
    )
    summary: str = Field(
        ...,
        description="Summary of the paper, in 3 lines",
    )
    examples: list[str] = Field(
        ...,
        description="Relevant examples aiding comprehension, taken from the paper, if there are.",
    )
    category: str = Field(
        ...,
        description='Category of the paper',
    )

time: 6.62 ms (started: 2025-02-22 16:14:48 +00:00)


In [22]:
# Prompt texts, written indented for readability; cleandoc (below) removes the
# shared indentation and the leading/trailing blank lines.
_sys_instruct_raw = """
    You are an experienced reader of academic literature and
    an expert in distilling important findings in a way that is understandable and clear.
    """

_prompt_raw = """This is a paper on AI.
    Parse its title, summarise its results, extract examples and produce a category.
    For the summary, be concise and avoid obscure jargon.
    If there are valuable examples that aid understanding, report them in a nutshell.
    For the category, think about what the results refer to, e.g. cognitive science, medicine, foundational AI etc.
    """

# system instruction: the role the model is asked to play
sys_instruct = cleandoc(_sys_instruct_raw)

# per-paper request: what to extract from the attached document
prompt = cleandoc(_prompt_raw)

time: 656 µs (started: 2025-02-22 16:14:49 +00:00)


In [23]:
# create a dir for model responses (text)
# exist_ok: re-running the cell must not fail when the folder is already there
os.makedirs('responses', exist_ok=True)

time: 607 µs (started: 2025-02-22 16:14:50 +00:00)


In [24]:
# Containers for data & metadata, filled by the summarisation loop (keyed by paper ID)
d_response = {}  # parsed JSON answer of the model
d_usage = {}     # token counts of the call
d_latency = {}   # seconds taken by the call

time: 501 µs (started: 2025-02-22 16:14:52 +00:00)


In [25]:
# Ask Gemini for a structured summary of every downloaded paper.
# Only *.pdf entries are processed, so anything else that ends up in the folder
# (e.g. a notebook checkpoints directory) is never uploaded.
pdf_filenames = [name for name in os.listdir('pdfs') if name.endswith('.pdf')]

for i, filename in enumerate(pdf_filenames):

    print(filename, i)

    # this passes the file as is to Gemini, no need to read its text content first
    file_ = client.files.upload(file=f'pdfs/{filename}')
    id_ = filename.split('.pdf')[0]

    # latency is measured around the generation call only (the upload is excluded)
    start_time = datetime.now()
    response = client.models.generate_content(
        model=model_id,
        config=types.GenerateContentConfig(
            system_instruction=sys_instruct,
            temperature=0,                       # use greedy decoding
            response_mime_type='application/json',
            response_schema=PaperInfo
            ),
        contents=[prompt, file_])
    end_time = datetime.now()

    # This is to handle the safety filter if triggered: report and move on to the next paper
    if response.prompt_feedback is not None:
        print('This paper failed with feedback: ', response.prompt_feedback)
        continue

    d_response[id_] = json.loads(response.text)
    d_usage[id_] = {
        'prompt_token_count': response.usage_metadata.prompt_token_count,
        'candidates_token_count': response.usage_metadata.candidates_token_count,
        'cached_content_token_count': response.usage_metadata.cached_content_token_count}
    d_latency[id_] = (end_time - start_time).total_seconds()

    # create file of JSON response (context managers close each file right after writing)
    with open(f'responses/{id_}.json', 'w') as f:
        json.dump(d_response[id_], f)

    # also dump usage and latency at each execution, so partial results survive an interrupted run
    with open('usage.json', 'w') as f:
        json.dump(d_usage, f)
    with open('latency.json', 'w') as f:
        json.dump(d_latency, f)

2502.14416.pdf 0
2502.13969.pdf 1
2502.14074.pdf 2
2502.14000.pdf 3
2502.14361.pdf 4
2502.14581.pdf 5
2502.14276.pdf 6
2502.14043.pdf 7
2502.14191.pdf 8
2502.14499.pdf 9
2502.14456.pdf 10
2502.14645.pdf 11
2502.14037.pdf 12
2502.14838.pdf 13
2502.14080.pdf 14
2502.14563.pdf 15
2502.14765.pdf 16
2502.14558.pdf 17
2502.14553.pdf 18
2502.14807.pdf 19
2502.14281.pdf 20
2502.13994.pdf 21
2502.14442.pdf 22
2502.14334.pdf 23
2502.14176.pdf 24
2502.14457.pdf 25
2502.14247.pdf 26
2502.14777.pdf 27
2502.14760.pdf 28
2502.14070.pdf 29
2502.14260.pdf 30
2502.14293.pdf 31
2502.14486.pdf 32
2502.14525.pdf 33
2502.14197.pdf 34
2502.13983.pdf 35
2502.14132.pdf 36
2502.14572.pdf 37
2502.14834.pdf 38
2502.14619.pdf 39
2502.13991.pdf 40
2502.14698.pdf 41
2502.14487.pdf 42
2502.14380.pdf 43
2502.14218.pdf 44
2502.14113.pdf 45
2502.14768.pdf 46
2502.14001.pdf 47
2502.14856.pdf 48
2502.14318.pdf 49
2502.14704.pdf 50
2502.14458.pdf 51
2502.14183.pdf 52
2502.14753.pdf 53
2502.14010.pdf 54
2502.14272.pdf 55
25

## Run some stats

In [26]:
# Headline numbers of the run: coverage, token usage and latency.
# Everything is computed over the papers present in d_usage (those summarised successfully).
prompt_tokens = [usage['prompt_token_count'] for usage in d_usage.values()]
output_tokens = [usage['candidates_token_count'] for usage in d_usage.values()]
latencies = [d_latency[key] for key in d_usage]

n_summarised = len(os.listdir('responses'))
latency_p50, latency_p90 = np.percentile(latencies, [50, 90])

print('Summaries for ', day)
print('Num papers published: ', n_entries)
print('Num papers summarised: ', n_summarised)
print('Median input/output tokens',
      np.percentile(prompt_tokens, 50),
      np.percentile(output_tokens, 50))
print('Median/P90 latency per paper: ', latency_p50, latency_p90)

Summaries for  Fri, 21 Feb 2025 
Num papers published:  154
Num papers summarised:  154
Median input/output tokens 4271.0 198.5
Median/P90 latency per paper:  5.4023425 9.537124200000003
time: 10.3 ms (started: 2025-02-22 16:33:17 +00:00)


## Create HTML page with all summaries

In [33]:
# this part below was originally contributed by Gemini after a prompt!
# prompt: Create HTML document listing all texts in folder response one after the other. Use the data in paper_metadata to create titles for each entry and an href for the link

# `paper_metadata` is the ID -> URL mapping built earlier in the notebook
# (it is also saved to paper_metadata.json and can be reloaded from there).

# Page fragments are collected in a list and joined once at the end.
html_parts = ["""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Paper Summaries</title>
</head>
<body>
"""]

for filename in os.listdir('responses'):
    if '.ipynb' in filename:  # a notebook checkpoints folder may get created in here
        continue

    paper_id = filename.split('.json')[0]
    if paper_id not in paper_metadata:
        continue

    with open(os.path.join('responses', filename), 'r') as f:
        d_paper = json.load(f)

    # Every field of d_paper is text generated by the model from arbitrary PDFs:
    # escape it so that stray <, >, & or quotes cannot break (or inject) markup.
    title = html.escape(d_paper['title'])
    url = html.escape(paper_metadata[paper_id]['url'])  # also escapes quotes, as needed inside href='...'
    summary = html.escape(d_paper['summary'])
    category = html.escape(d_paper['category'])

    # `examples` is a list: render it as a bullet list rather than as a Python list repr
    examples = ''.join(f"<li>{html.escape(str(example))}</li>\n" for example in d_paper['examples'])

    html_parts.append(f"<h1><a href='{url}'>{title}</a></h1>\n")
    html_parts.append(f"<p>{summary}</p>\n<hr>\n")
    if examples:
        html_parts.append(f"<p>Examples:</p>\n<ul>\n{examples}</ul>\n<hr>\n")
    else:
        html_parts.append("<p>Examples: none</p>\n<hr>\n")
    html_parts.append(f"<p>Category: {category}</p>\n<hr>\n")

html_parts.append("""</body>
</html>""")

html_content = ''.join(html_parts)

# Write HTML to file; the encoding matches the <meta charset> declared in the page
with open('paper_summaries.html', 'w', encoding='utf-8') as f:
    f.write(html_content)


time: 22.3 ms (started: 2025-02-22 16:36:05 +00:00)


## Save data to GDrive

Zip the data and copy it to Drive, with the file name timestamped by the day the papers refer to.

In [36]:
# zip responses folder and all JSON and HTML files
!zip -r data.zip responses *.json *.html

updating: responses/ (stored 0%)
updating: responses/2502.14572.json (deflated 43%)
updating: responses/2502.14361.json (deflated 43%)
updating: responses/2502.14837.json (deflated 37%)
updating: responses/2502.14048.json (deflated 49%)
updating: responses/2502.14302.json (deflated 52%)
updating: responses/2502.13969.json (deflated 44%)
updating: responses/2502.14086.json (deflated 45%)
updating: responses/2502.14560.json (deflated 50%)
updating: responses/2502.14001.json (deflated 48%)
updating: responses/2502.14400.json (deflated 46%)
updating: responses/2502.14280.json (deflated 44%)
updating: responses/2502.14499.json (deflated 45%)
updating: responses/2502.14768.json (deflated 45%)
updating: responses/2502.14457.json (deflated 45%)
updating: responses/2502.14862.json (deflated 45%)
updating: responses/2502.14264.json (deflated 47%)
updating: responses/2502.14276.json (deflated 43%)
updating: responses/2502.14281.json (deflated 48%)
updating: responses/2502.13983.json (deflated 46%

In [37]:
# needs to mount the Drive first
drive.mount('/content/drive')

# `day` is the scraped heading date (printed earlier as e.g. "Fri, 21 Feb 2025 ");
# strip it so that parsing does not hinge on the exact surrounding whitespace
day_object = datetime.strptime(day.strip(), "%a, %d %b %Y")
formatted_day = day_object.strftime("%Y-%m-%d")

# copy exactly data.zip (not every *.zip lying around) to Drive, directly under its dated name
drive_zip_path = f'/content/drive/MyDrive/{formatted_day}_data_papers.zip'
shutil.copy('data.zip', drive_zip_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
time: 1.6 s (started: 2025-02-22 16:40:29 +00:00)


In [20]:
# you could also send that summary HTML via email if you want
# or publish it somewhere
# I may just keep it as is for now while I test this for a few days
# currently testing how categories get created

# TODOs
# org/univ it came from - maybe whether it is academic or not too
# these^ to be shown in html and put together by theme, with TOC at top
# check that zipped file is actually permanent in drive (not sure)

time: 450 µs (started: 2025-02-22 16:14:41 +00:00)
