In [100]:
import time
import json
from openai import OpenAI
from dotenv import dotenv_values

In [101]:
# Settings are read from the local .env file so that no credentials are
# hard-coded in the notebook.
env = dotenv_values()

# Both keys are required; a missing one raises KeyError right here.
API_KEY, MODEL = env['OPENAI_API_KEY'], env['OPENAI_MODEL']

# JSON file that the generated summaries are stored in.
filename = "01_summaries.json"

In [102]:
# Shared API client; every request in this notebook goes through it.
client = OpenAI(
    api_key=API_KEY,
)

# Pipeline

In [94]:
# Register the assistant that produces the summaries. It is given the
# file_search tool so it can read the documents attached to each thread.

assistant = client.beta.assistants.create(
    model=MODEL,
    name="asst_bachelor_summaries",
    temperature=0.7,
    tools=[{"type": "file_search"}],
    instructions="You are an expert document summarization assistant that creates summaries of provided scholarly PDF documents..",
)

In [95]:
# Main summary function to get the summary of a document

# User message sent with every document. The literal is split across lines
# for readability only; the pieces concatenate to one unbroken prompt.
prompt_content = (
    "Summarize the provided document into a single paragraph of about "
    "350 to 450 tokens (1400-1800 characters), in the primary language "
    "that is used the most throughout the document. "
    "Do not use any numbered lists or bullet points, do not cite sources. "
    "Return just plain text. "
    "Talk only about the document contents, not the document itself or its structure."
)


def get_summary(file):
    """Summarize one document with the assistant and return the summary text.

    Uploads ``file``, indexes it in a temporary vector store, sends
    ``prompt_content`` to the module-level ``assistant`` in a fresh thread and
    reads back the reply. The uploaded file, the vector store and the thread
    are deleted again before returning, whether or not the summary succeeded.

    Args:
        file: The document to upload; passed unchanged to
            ``client.files.create``. The callers in this notebook pass a PDF
            opened in binary mode. The handle is not closed here.

    Returns:
        The text of the assistant's reply, or ``None`` when the run did not
        complete or its reply could not be read.

    Raises:
        Errors raised while uploading, indexing or creating the thread are
        not caught and propagate to the caller (after cleanup).
    """
    uploaded_file = client.files.create(
        file=file,
        purpose="assistants",
    )

    # Created lazily below; tracked so that the cleanup only deletes what exists.
    vector_store = None
    thread = None
    response_content = None

    try:
        vector_store = client.beta.vector_stores.create(
            file_ids=[uploaded_file.id],
        )

        # Wait until indexing has finished. The condition has to be evaluated
        # against the freshly retrieved object on every pass; otherwise a store
        # that starts out "in_progress" would be polled forever.
        while (
            vector_store.status == "in_progress"
            or vector_store.file_counts.in_progress > 0
        ):
            time.sleep(5)
            vector_store = client.beta.vector_stores.retrieve(vector_store.id)

        thread = client.beta.threads.create(
            tool_resources={
                "file_search": {
                    "vector_store_ids": [vector_store.id]
                }
            }
        )

        client.beta.threads.messages.create(
            thread_id=thread.id,
            content=prompt_content,
            attachments=[{
                "file_id": uploaded_file.id,
                "tools": [{"type": "file_search"}]
            }],
            role="user"
        )

        try:
            run = client.beta.threads.runs.create_and_poll(
                thread_id=thread.id,
                assistant_id=assistant.id,
                max_completion_tokens=1500
            )

            if run.status != "completed":
                print("DEBUG:", run)
                raise RuntimeError(f"Run failed: {run.status}")

            # The reply is taken from the first message the listing returns.
            messages = list(client.beta.threads.messages.list(thread.id))
            response_content = messages[0].content[0].text.value
        except Exception as error:
            # A failed run is an expected outcome (the notebook retries such
            # documents later), so report it and signal it with None.
            # KeyboardInterrupt is deliberately not caught here.
            print("Summary failed:", error)
            response_content = None
    finally:
        # Best-effort cleanup of the remote resources; a failed deletion must
        # not hide the summary or the original error.
        try:
            client.files.delete(uploaded_file.id)
        except Exception:
            print("File deletion failed")

        if vector_store is not None:
            try:
                client.beta.vector_stores.delete(vector_store.id)
            except Exception:
                print("Vector store deletion failed")

        if thread is not None:
            try:
                client.beta.threads.delete(thread.id)
            except Exception:
                print("Thread deletion failed")

    return response_content

In [99]:
# Load the list of documents to summarize from the merged dataset index.
# The list is stored under the name the summarization loop iterates over.

with open("../../../dataset/_merged.json") as merged_file:
    invalid_documents = json.load(merged_file)

# Echo the loaded entries, keeping non-ASCII titles readable.
print(json.dumps(invalid_documents, ensure_ascii=False, indent=2))

[
  {
    "pk": "8288e199-5927-4eb1-8c40-07387287264f",
    "filename": "8288e199-5927-4eb1-8c40-07387287264f.pdf",
    "title": "Odporúčanie pre softvérových inžinierov"
  },
  {
    "pk": "9e5b6ea5-2728-4496-9e49-82732bbcabc7",
    "filename": "9e5b6ea5-2728-4496-9e49-82732bbcabc7.pdf",
    "title": "Úvod do matematickej logiky"
  },
  {
    "pk": "55045d27-c5f1-4c31-b1ed-fccd2eb0d676",
    "filename": "55045d27-c5f1-4c31-b1ed-fccd2eb0d676.pdf",
    "title": "Algebra a diskrétna  matematika"
  },
  {
    "pk": "dd4675da-d675-4d11-ad8c-10860daab977",
    "filename": "dd4675da-d675-4d11-ad8c-10860daab977.pdf",
    "title": "Základy digitálnych mien a blockchain sietí"
  },
  {
    "pk": "c6566bb1-6d34-4c8b-a967-785e4ccf9492",
    "filename": "c6566bb1-6d34-4c8b-a967-785e4ccf9492.pdf",
    "title": "Quantum Computing for Everyone"
  },
  {
    "pk": "5add311d-cc6e-4d67-9b18-8301a1e7d070",
    "filename": "5add311d-cc6e-4d67-9b18-8301a1e7d070.pdf",
    "title": "Sprievodca licenciami Cre

In [253]:
# Generate a summary for every document in the list and collect the results.

results = []

for i, document in enumerate(invalid_documents):
    title = document['title']
    # Deliberately not named `filename`: that module-level name holds the path
    # of the summaries JSON file, which the save cell opens after this loop.
    doc_filename = document['filename']

    print(f"[{i}/{len(invalid_documents) - 1}]: {title}")

    try:
        file = open(f"../../../dataset/{doc_filename}", "rb")
    except FileNotFoundError:
        print(f"File not found")
        print()
        continue
    except OSError as error:
        # Any other I/O problem (permissions, ...) is reported as what it is.
        print(f"Could not open file: {error}")
        print()
        continue

    # Close the PDF handle as soon as the summary attempt is over.
    with file:
        summary = get_summary(file)

    print(summary)
    print()

    # A summary of None is stored as null; the retry section picks those up.
    results.append({
        "id": document['pk'],
        "title": document['title'],
        "summary": summary,
    })

    # Sometimes a delay is needed to avoid too many tokens-per-minute
    # when large documents are uploaded.
    # time.sleep(65)

[0/45]: Melting Hadrons, Boiling Quarks: From Hagedorn Temperature to Ultra-Relativistic Heavy-Ion Collisions at CERN: With a Tribute to Rolf Hagedorn
The document discusses the concept of the Hagedorn temperature, a critical limit in the study of hadronic matter, and its significance in understanding the transition to a quark-gluon plasma (QGP). Hagedorn proposed that as energy is added to a system of strongly interacting particles, the temperature does not rise indefinitely; instead, it reaches a plateau at the Hagedorn temperature, approximately between 150 MeV and 160 MeV. This phenomenon implies an exponential increase in the number of hadronic states, necessitating a revision of statistical physics equations to accommodate numerous massive hadron resonances. The Statistical Bootstrap Model (SBM) is introduced to explain how hadrons can become excited into heavier resonances, forming a complex hierarchy of particles. As the temperature approaches the Hagedorn limit, matter transit

In [254]:
# Save generated summaries to a json file.
# Expects an existing json file with a (empty or non-empty) list.
# Note: re-running this cell appends `results` to the file again.

with open(filename, "r") as f_read:
    summaries = json.load(f_read)

summaries.extend(results)

# Serialize before reopening the file: opening with "w" truncates it, so any
# error raised after that point would wipe the summaries saved so far.
serialized = json.dumps(
    summaries,
    indent=2,
    ensure_ascii=False
)

with open(filename, "w") as f_write:
    f_write.write(serialized)

In [93]:
# Remove the assistant once all summaries have been generated.
# get_summary needs the assistant for every run, so the retry section further
# down can only be used while the assistant still exists.

deletion_result = client.beta.assistants.delete(assistant.id)
print(deletion_result)

AssistantDeleted(id='asst_Chti2DHxnkbvZ8gfhlo7mkhb', deleted=True, object='assistant.deleted')


# Retry

After the first run, manually review the saved summaries: some completions are actually error messages rather than summaries. Replace each of those with None/null in the JSON file; the retry logic below then regenerates only those entries.


In [103]:
# Extract the entries whose summary is None/null; these are the documents
# that still need a summary.

with open(filename, "r") as summaries_file:
    summaries = json.load(summaries_file)

invalid_documents = [
    {
        "id": document["id"],
        "title": document["title"],
        # The PDF name is derived from the id. Single quotes inside the
        # f-string keep this valid on Python versions before 3.12.
        "filename": f"{document['id']}.pdf",
    }
    for document in summaries
    if document["summary"] is None
]

invalid_documents

[{'id': '2c6911e8-3192-4c6e-a038-8bb0b3e109f3',
  'title': 'The Social Life of Economic inequalities in Contemporary Latin America : Decades of Change',
  'filename': '2c6911e8-3192-4c6e-a038-8bb0b3e109f3.pdf'},
 {'id': '2222137f-e740-4ab2-9fd3-a2333e04e9f9',
  'title': 'Model-Based Demography : Essays On integrating Data, Technique and Theory',
  'filename': '2222137f-e740-4ab2-9fd3-a2333e04e9f9.pdf'},
 {'id': 'b3332145-46d1-4584-b661-97aa4bb95c66',
  'title': 'Talent Development in European Higher Education : Honors Programs in The Benelux, Nordic and German-Speaking Countries',
  'filename': 'b3332145-46d1-4584-b661-97aa4bb95c66.pdf'},
 {'id': 'b13f73fc-cdfc-4ba4-a22c-b9dc1d235c2a',
  'title': 'Mercury Pollution in Minamata',
  'filename': 'b13f73fc-cdfc-4ba4-a22c-b9dc1d235c2a.pdf'},
 {'id': 'f8c4853c-f3e9-46f7-9575-67453d801211',
  'title': 'Advances in Discrete Differential Geometry',
  'filename': 'f8c4853c-f3e9-46f7-9575-67453d801211.pdf'},
 {'id': 'de473528-bd87-40f7-aa25-5e1be

In [98]:
# Produce summaries for the invalid documents, retrying each one a few times.

MAX_ATTEMPTS = 3           # attempts per document before giving up
RETRY_DELAY_SECONDS = 60   # pause between two attempts for the same document

results = []

for i, document in enumerate(invalid_documents):
    title = document['title']
    doc_filename = document['filename']

    print(f"[{i}/{len(invalid_documents) - 1}]: {title}")

    try:
        file = open(f"../../../dataset/{doc_filename}", "rb")
    except FileNotFoundError:
        print(f"File not found")
        print()
        continue
    except OSError as error:
        # Any other I/O problem (permissions, ...) is reported as what it is.
        print(f"Could not open file: {error}")
        print()
        continue

    final_summary = None

    # Keep the handle open for all attempts and close it afterwards.
    with file:
        for attempt in range(1, MAX_ATTEMPTS + 1):
            # Rewind so that every attempt uploads the document from its
            # start, regardless of how far a previous attempt read.
            file.seek(0)

            summary = get_summary(file)
            print(summary)

            if summary is not None:
                final_summary = summary
                print()
                break

            # Wait before trying again, but not after the last attempt.
            if attempt < MAX_ATTEMPTS:
                time.sleep(RETRY_DELAY_SECONDS)

    # Documents that still failed are recorded with a None summary.
    results.append({
        "id": document['id'],
        "title": document['title'],
        "summary": final_summary,
    })

[0/49]: The Social Life of Economic inequalities in Contemporary Latin America : Decades of Change
File not found

[1/49]: Model-Based Demography : Essays On integrating Data, Technique and Theory
File not found

[2/49]: Talent Development in European Higher Education : Honors Programs in The Benelux, Nordic and German-Speaking Countries
File not found

[3/49]: Mercury Pollution in Minamata
File not found

[4/49]: Advances in Discrete Differential Geometry
File not found

[5/49]: The Illusion of Risk Control : What Does It Take To Live With Uncertainty?
File not found

[6/49]: The Eu and China in African Authoritarian Regimes : Domestic Politics and Governance Reforms
File not found

[7/49]: Bridging Educational Leadership, Curriculum Theory and Didaktik : Non-Affirmative Theory of Education
File not found

[8/49]: Understanding The Bigger Energy Picture : Desertec and Beyond
File not found

[9/49]: Uses of Technology in Lower Secondary Mathematics Education : a Concise Topical Survey


In [70]:
# Merge the newly generated summaries into the summaries json file.
# Only entries that received a summary are touched; the rest stay as they are.

with open(filename, "r") as f_read:
    summaries = json.load(f_read)

for document in results:
    new_text = document["summary"]
    if new_text is None:
        # Still no summary for this document; keep the stored null.
        continue

    # First stored entry with the same id, if there is one.
    target = next(
        (entry for entry in summaries if entry["id"] == document["id"]),
        None,
    )
    if target is not None:
        target["summary"] = new_text

with open(filename, "w") as f_write:
    json.dump(summaries, f_write, indent=2, ensure_ascii=False)