<a href="https://colab.research.google.com/github/martinapugliese/summarise-sci-literature/blob/main/Gemini_summarise_AI_literature.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
# this is to get time execution on each cell
!pip install ipython-autotime

%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 3.93 s (started: 2025-02-15 11:39:26 +00:00)


In [68]:
import html
import json
import os
import re
import urllib.request as urllib_req
from datetime import date, datetime

import numpy as np
import requests
from bs4 import BeautifulSoup
from google import genai
from google.colab import userdata
from google.genai import Client, types

time: 694 µs (started: 2025-02-15 12:39:00 +00:00)


## Configs

In [2]:
# Gemini API client; the key is fetched through google.colab's userdata under the
# name GEMINI_API_KEY, so it is never written in the notebook itself
client = genai.Client(api_key=userdata.get('GEMINI_API_KEY'))

In [3]:
# configure which Gemini model to run
# alternative tried previously: "gemini-2.0-flash-lite-preview-02-05"
model = "gemini-2.0-flash"

## Scrape ArXiv page for latest day's AI papers

Get the paper links and IDs. Use the most recent day of publications available.

In [9]:
# show=2000 asks for the whole "recent" listing in one page, so no need to paginate
webpage = "https://arxiv.org/list/cs.AI/recent?skip=0&show=2000"

# the timeout stops the cell from hanging indefinitely if the server does not answer
r = requests.get(webpage, timeout=30)
# fail here, loudly, rather than parsing an error page in the cells below
r.raise_for_status()
r.status_code  # you want a 200 here

200

In [10]:
# initialise the parser on the raw bytes of the listing page,
# using the "html.parser" backend
soup = BeautifulSoup(r.content, "html.parser")

In [11]:
# pick the phrasing of the most recent day: it is the first <h3> on the page
day_headings = soup.find_all("h3")
if not day_headings:
    raise ValueError("No <h3> day heading found on the listing page")
latest_day_str = day_headings[0].text

# match what's this latest day: everything before the opening parenthesis,
# without the whitespace that precedes that parenthesis
day = latest_day_str.split('(')[0].strip()

# and the total number of entries for that day
match = re.search(r'of (\d+) entries', latest_day_str)
if match is None:
    # stop here: the cells below slice on n_entries, so carrying on would either
    # raise a NameError or silently reuse a stale value from an earlier run
    raise ValueError(f"Failed to isolate latest day's info from heading: {latest_day_str!r}")

n_entries = int(match.group(1))
print(day, ' - ', n_entries, 'papers')

Fri, 14 Feb 2025   -  159 papers


In [22]:
# PDF anchors of the latest day only: keep the first n_entries found on the page
paper_links = soup.find_all("a", {"title": "Download PDF"})[:n_entries]

# hrefs are site-relative, so prefix the host to get absolute URLs
paper_urls = ["https://arxiv.org" + anchor["href"] for anchor in paper_links]

# the ID is the last path segment of the href, cut at the first "v"
# (presumably to drop a version suffix, when there is one)
paper_ids = [anchor["href"].split("/")[-1].split("v")[0] for anchor in paper_links]

# # Print the IDs and links (or process them as needed)
# for i in range(len(paper_ids)):
#     print(f"Paper ID: {paper_ids[i]}, URL: {paper_urls[i]}")

In [103]:
# titles live in their own divs, apart from the links (this is due to how the DOM is structured);
# they are matched to the links by position, so the order must be preserved
paper_title_divs = soup.find_all("div", {"class": "list-title mathjax"})[:n_entries]

# the second child node of each div carries the text, and the title is its second line
paper_titles = [
    div.contents[1].split('\n')[1].lstrip()
    for div in paper_title_divs
]

time: 39.8 ms (started: 2025-02-15 14:44:08 +00:00)


In [104]:
# the four counts should be equal: one URL, ID, link and title per paper
tuple(len(items) for items in (paper_urls, paper_ids, paper_links, paper_titles))

(159, 159, 159, 159)

time: 3.91 ms (started: 2025-02-15 14:44:23 +00:00)


In [121]:
# create json linking ID, title and URL
# (the three lists are parallel: position i refers to the same paper in each)
paper_metadata = {
    paper_ids[i]: {'title': paper_titles[i], 'url': paper_urls[i]}
    for i in range(len(paper_ids))
}

# the with-block closes the file, and so flushes it to disk, as soon as the dump is done
with open('paper_metadata.json', 'w') as f:
    json.dump(paper_metadata, f)

time: 4.57 ms (started: 2025-02-15 14:55:42 +00:00)


## Download all papers locally

In [47]:
# exist_ok lets this cell be re-run without failing because the folder is already there
os.makedirs('pdfs', exist_ok=True)

# count from 1 so that i is the number of papers downloaded so far
for i, (id_, url_) in enumerate(zip(paper_ids, paper_urls), start=1):
    urllib_req.urlretrieve(url_, f"pdfs/{id_}.pdf")

    # progress message every 10 papers
    if i % 10 == 0:
        print(f"Downloaded {i} papers")

Downloaded 10 papers
Downloaded 20 papers
Downloaded 30 papers
Downloaded 40 papers
Downloaded 50 papers
Downloaded 60 papers
Downloaded 70 papers
Downloaded 80 papers
Downloaded 90 papers
Downloaded 100 papers
Downloaded 110 papers
Downloaded 120 papers
Downloaded 130 papers
Downloaded 140 papers
Downloaded 150 papers
time: 22.2 s (started: 2025-02-15 12:07:20 +00:00)


In [51]:
# sanity check: the number of files in pdfs/ should match the number of papers
len(os.listdir('pdfs'))  # just to check count

159

time: 3.52 ms (started: 2025-02-15 12:08:13 +00:00)


## Make Gemini summarise each paper

In [52]:
# prompt
# sys_instruct is passed to Gemini as the system instruction (it sets the persona).
# NOTE: the line breaks and indentation inside the triple quotes are part of the
# strings, so they are sent to the model as they are
sys_instruct = """
                You are an experienced reader of academic literature and
                an expert in distilling important findings in a way that is understandable and clear.
               """

# prompt is the request sent alongside each paper's PDF
prompt = """This is a paper on AI.
            Summarise its results in 3 lines, avoiding obscure jargon and going to the point.
            If there are valuable examples that aid understanding, report them in a nutshell.
            """

time: 359 µs (started: 2025-02-15 12:08:15 +00:00)


In [111]:
# create a dir for model responses (text);
# exist_ok makes the cell safe to re-run once the folder exists
os.makedirs('responses', exist_ok=True)

time: 733 µs (started: 2025-02-15 14:50:28 +00:00)


In [61]:
# create some dicts for metadata, both keyed by paper ID:
# d_usage holds the token counts of each call, d_latency its duration in seconds
d_usage, d_latency = {}, {}

time: 546 µs (started: 2025-02-15 12:15:30 +00:00)


In [115]:
# how many PDFs to summarise in this run: a small number is handy for a trial run,
# None processes every file in pdfs/
n_papers_to_summarise = 3

for i, filename in enumerate(os.listdir('pdfs')[:n_papers_to_summarise]):

    print(filename, i)

    # this passes the file as is to Gemini, no need to read its text content first
    file_ = client.files.upload(file=f'pdfs/{filename}')

    # only the generation call is timed (the upload above is not counted)
    start_time = datetime.now()
    response = client.models.generate_content(
        model=model,
        config=types.GenerateContentConfig(system_instruction=sys_instruct),
        contents=[prompt, file_])
    end_time = datetime.now()

    id_ = filename.split('.pdf')[0]
    usage = response.usage_metadata
    d_usage[id_] = {
        'prompt_token_count': usage.prompt_token_count,
        'candidates_token_count': usage.candidates_token_count,
        'cached_content_token_count': usage.cached_content_token_count}
    d_latency[id_] = (end_time - start_time).total_seconds()

    # create file of text response
    # NOTE(review): response.text is expected to be None when the model returns no
    # text part (e.g. a blocked answer) - confirm against the SDK; writing None would raise
    summary = response.text
    if summary is None:
        print(f"No text returned for {id_}: no summary file written")
    else:
        with open(f'responses/{id_}.txt', 'w') as f:
            f.write(summary)

    # also dump usage and latency at each execution, closing each file right away
    with open('usage.json', 'w') as f:
        json.dump(d_usage, f)
    with open('latency.json', 'w') as f:
        json.dump(d_latency, f)

2502.08689.pdf 0
2502.08821.pdf 1
2502.09051.pdf 2
time: 17.1 s (started: 2025-02-15 14:51:11 +00:00)


## Run some stats

In [117]:
# headline counts: the day, how many papers it had, how many summaries are on disk
print('Summaries for ', day)
print('Num papers published: ', n_entries)
print('Num papers summarised: ', len(os.listdir('responses')))

# token counts per paper, in the key order of d_usage
input_tokens = [usage_entry['prompt_token_count'] for usage_entry in d_usage.values()]
output_tokens = [usage_entry['candidates_token_count'] for usage_entry in d_usage.values()]
print('Median input/output tokens',
      np.percentile(input_tokens, 50),
      np.percentile(output_tokens, 50))

# latencies are looked up through the keys of d_usage
latencies = [d_latency[paper_key] for paper_key in d_usage]
print('Median/P90 latency per paper: ',
      np.percentile(latencies, 50),
      np.percentile(latencies, 90))

Summaries for  Fri, 14 Feb 2025 
Num papers published:  159
Num papers summarised:  159
Median input/output tokens 3691.0 130.0
Median/P90 latency per paper:  4.945586 8.389815600000006
time: 10.6 ms (started: 2025-02-15 14:52:24 +00:00)


In [75]:
# zip all responses
!zip -r responses.zip responses

  adding: responses/ (stored 0%)
  adding: responses/2502.09233.pdf.txt (deflated 41%)
  adding: responses/2502.09218.pdf.txt (deflated 41%)
  adding: responses/2502.08652.pdf.txt (deflated 41%)
  adding: responses/2502.09387.pdf.txt (deflated 45%)
  adding: responses/2502.08828.pdf.txt (deflated 43%)
  adding: responses/2502.09051.pdf.txt (deflated 39%)
  adding: responses/2502.09460.pdf.txt (deflated 43%)
  adding: responses/2502.09211.pdf.txt (deflated 48%)
  adding: responses/2502.08920.pdf.txt (deflated 44%)
  adding: responses/2502.09307.pdf.txt (deflated 43%)
  adding: responses/2502.09183.pdf.txt (deflated 46%)
  adding: responses/2502.09604.pdf.txt (deflated 46%)
  adding: responses/2502.09365.pdf.txt (deflated 42%)
  adding: responses/2502.08884.pdf.txt (deflated 45%)
  adding: responses/2502.09601.pdf.txt (deflated 43%)
  adding: responses/2502.09038.pdf.txt (deflated 44%)
  adding: responses/2502.08916.pdf.txt (deflated 43%)
  adding: responses/2502.09247.pdf.txt (deflated 

## Create HTML page with all summaries

In [80]:
# reload the metadata saved earlier; the with-block closes the file after reading
with open('paper_metadata.json', 'r') as f:
    paper_metadata = json.load(f)

# peek at the first few URLs: only the last expression of a cell gets displayed
paper_urls[:3]

['https://arxiv.org/pdf/2502.09601',
 'https://arxiv.org/pdf/2502.09596',
 'https://arxiv.org/pdf/2502.09565']

time: 29.1 ms (started: 2025-02-15 14:27:14 +00:00)


In [122]:
# this part below was contributed by Gemini after a prompt!
# prompt: Create HTML document listing all texts in folder response one after the other. Use the data in paper_metadata to create titles for each entry and an href for the link

# Load paper metadata (the with-block closes the file after reading)
with open('paper_metadata.json', 'r') as f:
    paper_metadata = json.load(f)

# Create HTML content: collect the pieces in a list and join them once at the end
html_parts = ["""<!DOCTYPE html>
<html>
<head>
<title>Paper Summaries</title>
</head>
<body>
"""]

# sorted() makes the order of the entries reproducible (os.listdir guarantees no order)
for filename in sorted(os.listdir('responses')):
    paper_id = filename.split('.txt')[0]
    if paper_id not in paper_metadata:
        # without metadata there is no title or link to show, so leave the file out
        continue

    # titles and URLs are scraped and summaries are model-generated: escape them so
    # that characters such as <, & or quotes cannot break (or inject) markup
    title = html.escape(paper_metadata[paper_id]['title'])
    url = html.escape(paper_metadata[paper_id]['url'], quote=True)
    with open(os.path.join('responses', filename), 'r') as f:
        summary = html.escape(f.read())

    html_parts.append(f"<h1><a href='{url}'>{title}</a></h1>\n")
    html_parts.append(f"<p>{summary}</p>\n<hr>\n")

html_parts.append("""</body>
</html>""")
html_content = "".join(html_parts)

# Write HTML to file
with open('paper_summaries.html', 'w') as f:
    f.write(html_content)


time: 12.6 ms (started: 2025-02-15 14:56:22 +00:00)


In [123]:
# now you could send that summary HTML via email if you want
# or publish it somewhere
# I may just keep it as is for now while I test this for a few days

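# TODO: the whole job could be made to run automatically, e.g. on a daily schedule (the original note was left unfinished - confirm intent)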

time: 314 µs (started: 2025-02-15 14:57:50 +00:00)
