In [2]:
import time
import json
from openai import OpenAI
from dotenv import dotenv_values


In [3]:
# Settings loaded through python-dotenv; the client cell reads OPENAI_API_KEY from this mapping.
env = dotenv_values()

In [4]:
# Shared API client used by every cell in this notebook.
client = OpenAI(api_key=env['OPENAI_API_KEY'])
# NOTE(review): hard-coded assistant ID. The pipeline cells visible in this notebook use
# `assistant.id` from a freshly created assistant instead — confirm this value is still needed.
assistant_id = "asst_VdhqgK4Hy3hwkUdr6NBSsas7"


# Helpers


## Uploaded files (list and delete)


In [229]:
# Materialise the listing so the same `files` list can be reused by the cells that follow.
files_cursor = client.files.list(limit=100)
files = list(files_cursor)

# One "<id> <filename>" line per listed file.
for file in files:
  print(file.id, file.filename)


file-VYEU7NsGqim4K6T9KRLreS f9fb2ac7-a262-4650-be40-8cf2e46a8fdc.pdf
file-5e9px6frkpHDopN6VWFQSw f9fb2ac7-a262-4650-be40-8cf2e46a8fdc.pdf
file-ATMpjNtUmctSy14DVqCPTz 0c0fcd32-e3a5-4585-a5f7-7f347c787d97.pdf


In [230]:
# Destructive: deletes every file held in `files` (built by the listing cell) and prints each response.
for file in files:
  res = client.files.delete(file.id)
  print(res)


FileDeleted(id='file-VYEU7NsGqim4K6T9KRLreS', deleted=True, object='file')
FileDeleted(id='file-5e9px6frkpHDopN6VWFQSw', deleted=True, object='file')
FileDeleted(id='file-ATMpjNtUmctSy14DVqCPTz', deleted=True, object='file')


## Vector stores


In [6]:
# Snapshot the listing before deleting anything, matching the files and messages helpers,
# so entries are not removed while the lazily evaluated listing is still being iterated.
vector_stores_cursor = client.beta.vector_stores.list(limit=100)
vector_stores = list(vector_stores_cursor)

# Destructive: deletes every listed vector store and prints each response.
for vector_store in vector_stores:
  res = client.beta.vector_stores.delete(vector_store.id)
  print(res)


## Messages


In [192]:
# NOTE(review): `thread_id` is not assigned anywhere in the visible notebook — set it before running this cell.
messages_cursor = client.beta.threads.messages.list(thread_id)
messages = list(messages_cursor)

# Show each message id followed by the text of its first content part.
for message in messages:
  print(message.id, message.content[0].text.value)


msg_eRFSTD20yUgBVeN2bTf3xqXG The document is a comprehensive introduction to mathematical logic, aimed primarily at first-year students at the Slovak Technical University in Bratislava. It outlines the fundamental concepts and techniques of both propositional and predicate logic, including the construction and interpretation of logical formulas. The text is organized into chapters that cover various topics, such as truth tables, tautologies, formal systems, semantic trees, and the resolution method, along with applications in electronics and neural networks. It elaborates on the Gentzen calculus, a syntactic method utilizing sequents for proof derivation, and delves into modal and many-valued logics, indicating different types of modal logic, including alethic, epistemic, temporal, and deontic. Each chapter serves as a lecture on its own, structured to facilitate self-study and includes numerous exercises to solidify understanding. The authors emphasize the necessity of a basic underst

In [193]:
# Destructive: removes every message held in `messages` from the thread identified by `thread_id`.
for message in messages:
  client.beta.threads.messages.delete(
    message_id=message.id,
    thread_id=thread_id
  )


# Pipeline

In [30]:
# Create the assistant (with the file_search tool enabled) that get_summary() runs via `assistant.id`.
# The final cell of the notebook deletes it again.
assistant = client.beta.assistants.create(
  name="asst_bachelor_summaries",
  instructions="You are a helpful document summarization assistant that follows user's instructions.",
  model="gpt-4o-mini",
  tools=[{ "type": "file_search" }]
)


In [31]:
# Earlier prompt variant, kept for reference only:
# prompt_content = "Summarize the provided document into a single paragraph of about 300 words, in the primary language that is most used throughout the document. Do not use any numbered lists or bullet points, do not cite sources. Return just plain text."

# Prompt that get_summary() posts as the user message for every document.
prompt_content = "Summarize the provided document into a single paragraph of about 250 to 350 words, in the primary language that is most used throughout the document. Do not use any numbered lists or bullet points, do not cite sources. Return just plain text. Talk only about the contents, not the document itself."

def get_summary(filename):
  """Summarize one dataset document with the assistant and return the reply text.

  Uploads ``../../../dataset/<filename>``, creates a vector store and a thread
  for it, posts ``prompt_content`` as a user message with the file attached,
  runs ``assistant`` on the thread and returns the text of the first content
  part of the first message in the thread listing.

  The uploaded file, vector store and thread are deleted before returning,
  including when an error occurs after the upload.

  Args:
    filename: File name relative to the dataset directory.

  Returns:
    The text of the assistant's reply.

  Raises:
    RuntimeError: If the run ends with a status other than "completed".
  """
  # Close the local handle as soon as the upload has finished.
  with open(f"../../../dataset/{filename}", "rb") as document:
    file = client.files.create(
      file=document,
      purpose="assistants",
    )

  # Track what has been created so the cleanup only touches existing resources.
  vector_store = None
  thread = None

  try:
    vector_store = client.beta.vector_stores.create(
      file_ids=[file.id],
    )

    thread = client.beta.threads.create(
      tool_resources={
        "file_search": {
          "vector_store_ids": [vector_store.id]
        }
      }
    )

    client.beta.threads.messages.create(
      thread_id=thread.id,
      content=prompt_content,
      attachments=[{
        "file_id": file.id,
        "tools": [{ "type": "file_search" }]
      }],
      role="user"
    )

    run = client.beta.threads.runs.create_and_poll(
      thread_id=thread.id,
      assistant_id=assistant.id,
    )

    if run.status != "completed":
      raise RuntimeError(f"Run failed: {run.status}")

    # NOTE(review): taking index 0 assumes the listing returns the newest message first — confirm.
    messages = list(client.beta.threads.messages.list(thread.id))
    return messages[0].content[0].text.value
  finally:
    # Always release the remote resources, even when the run failed or a call raised.
    client.files.delete(file.id)
    if vector_store is not None:
      client.beta.vector_stores.delete(vector_store.id)
    if thread is not None:
      client.beta.threads.delete(thread.id)


In [32]:
# Single document

# Read as UTF-8 explicitly: the titles contain non-ASCII characters and the
# platform default encoding is not guaranteed to be UTF-8.
with open("../../../dataset/_merged.json", encoding="utf-8") as f:
  documents = json.load(f)

# Keep only the first entry for a quick trial run.
documents = [documents[0]]

print(documents)

[{'pk': '8288e199-5927-4eb1-8c40-07387287264f', 'filename': '8288e199-5927-4eb1-8c40-07387287264f.pdf', 'title': 'Odporúčanie pre softvérových inžinierov'}]


In [None]:
# Batch of documents

# Same dataset location as the single-document cell and get_summary(); read as UTF-8
# explicitly because the titles contain non-ASCII characters.
with open("../../../dataset/_merged.json", encoding="utf-8") as f:
  documents = json.load(f)

# The whole list is kept here (no truncation to the first entry); show only its size.
print(f"{len(documents)} documents loaded")


In [33]:
# Summarize every entry of `documents` and collect the results for export.
results = []

for i, document in enumerate(documents):
  title = document['title']
  filename = document['filename']
  summary = get_summary(filename)

  print(f"[{i + 1}/{len(documents)}]: {title}")
  print(summary)
  # Blank line between documents (the bare name `print` was a no-op).
  print()

  results.append({
    "id": document['pk'],
    "title": title,
    "summary": summary,
  })

  # Optional pause between documents, presumably to stay under API rate limits:
  # time.sleep(65)


[1/1]: Odporúčanie pre softvérových inžinierov
Odporúčacie systémy v oblasti softvérového inžinierstva zohrávajú kľúčovú úlohu pri podpore vývojárov pri rôznych úlohách, ako sú znovupoužitie zdrojového kódu a efektívne písanie chybových hlásení. Tieto systémy analýzujú zdrojový kód a iné artefakty, aby poskytli relevantné odporúčania týkajúce sa používania aplikačných rozhraní (API) a knižníc, ako aj pomoc pri tvorbe nových metód, opravách chýb a zlepšovaní znovupoužiteľnosti. Vývoj odporúčacieho systému je proces, ktorý si vyžaduje starostlivé plánovanie a rozhodovanie. Existujú dva hlavné prístupy, ktoré využívač odporúčacích systémov aplikuje: filtrovanie na základe obsahu, kde sa odporúča na základe podobnosti vlastností aktuálneho dokumentu, a kolaboratívne filtrovanie, ktoré sa opiera o preferencie ostatných používateľov. Systémy sú navrhnuté tak, aby riešili komplexné problémy, ako sú adaptácia nováčikov v projektoch a efektívne rozdelenie úloh vo veľkých projektoch. Taktiež sa 

In [None]:
# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened as UTF-8 explicitly instead of relying on the platform default encoding.
with open('01_summaries.json', 'w', encoding='utf-8') as f:
  json.dump(results, f, indent=2, ensure_ascii=False)


In [34]:
# Clean up: delete the assistant created for this pipeline and show the API response.
res = client.beta.assistants.delete(assistant.id)
print(res)


AssistantDeleted(id='asst_oqvaFiHn4c085L0RMOJre7Ak', deleted=True, object='assistant.deleted')
