In [1]:
# importing necessities
import os
import json
import time
from pathlib import Path
from dotenv import load_dotenv
from openai import OpenAI

In [2]:
# loading API
# load_dotenv() is called first so that OPENAI_API_KEY can be supplied through
# a local .env file (python-dotenv) as well as the real environment.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# setting paths
DATA_DIR = Path("/Users/nataliepegues/data4380.np/llm_workspace/data")
INPUT_JSON = DATA_DIR / "papers.json"
OUTPUT_JSON = DATA_DIR / "summaries.json"

# loading papers data
# The encoding is pinned to UTF-8 so the paper text decodes the same way on
# every platform instead of depending on the locale's default encoding.
with open(INPUT_JSON, "r", encoding="utf-8") as f:
    papers = json.load(f)

# one {"filename", "abstract", "summary"} record is appended per paper
summaries = []

# all used to manage token usage and minimize overall cost
MAX_TOKENS = 150  # max completion tokens requested for each summary
RETRY_DELAY = 10  # seconds to wait before retrying after a rate limit
MAX_RETRIES = 3   # attempts per request when rate limited
CHUNK_SIZE = 3000  # approx chars per chunk

# function to split the body of each paper into chunks so that no single request exceeds the token allotment
def chunk_text(text, max_length=None):
    """Split *text* into stripped chunks of at most *max_length* characters.

    Each chunk covers a window of up to ``max_length`` characters. When the
    window does not reach the end of the text, the cut is moved back to the
    last newline inside the window (if there is one after the window's first
    character) so chunks tend to end on line boundaries; otherwise the text
    is cut at exactly ``max_length`` characters.

    Args:
        text: The string to split.
        max_length: Maximum window size in characters. ``None`` (the
            default) uses the module-level ``CHUNK_SIZE``.

    Returns:
        A list of non-empty, whitespace-stripped chunks in document order.
        Windows that contain only whitespace are dropped so they never
        trigger a summarization request.

    Raises:
        ValueError: If ``max_length`` is not positive (the scan could never
            advance through the text).
    """
    if max_length is None:
        max_length = CHUNK_SIZE
    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length}")

    chunks = []
    text_len = len(text)
    start = 0
    while start < text_len:
        end = start + max_length
        if end < text_len:
            # Prefer breaking on a newline. Requiring newline_pos > start
            # (which also rules out the -1 "not found" result) guarantees
            # the window is never empty, so the loop always makes progress.
            newline_pos = text.rfind("\n", start, end)
            if newline_pos > start:
                end = newline_pos
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        start = end
    return chunks

# main summarizing function
def summarize_text(text):
    """Ask the chat model for a one-paragraph summary of *text*.

    The request is retried up to ``MAX_RETRIES`` times, waiting
    ``RETRY_DELAY`` seconds between attempts, but only when the error message
    mentions a rate limit. Any other error is printed and ends the attempt
    immediately.

    Args:
        text: The text to summarize.

    Returns:
        The summary with surrounding whitespace stripped, or ``""`` when
        *text* is blank, the model returns no content, a non-rate-limit
        error occurs, or every retry is rate limited.
    """
    # Blank input would still cost a request and make the model invent a
    # summary out of nothing, so bail out before touching the API.
    if not text or not text.strip():
        return ""

    retries = MAX_RETRIES
    while retries > 0:
        try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",  # model of choice, can be changed
                messages=[
                    {
                        "role": "user",
                        "content": (
                            "Please summarize the following psychology research paper text in one concise paragraph:\n\n"
                            + text
                        ),
                    }
                ],
                max_tokens=MAX_TOKENS,
                temperature=0.7,
            )
            # content can be None; treat that as "no summary" rather than
            # letting .strip() raise and be reported as a generic error.
            content = response.choices[0].message.content
            return content.strip() if content else ""
        except Exception as e:
            error_msg = str(e).lower()
            if "rate limit" not in error_msg:
                print(f"Error during summarization: {e}")
                break
            retries -= 1
            if retries > 0:
                print(f"Rate limit hit, retrying in {RETRY_DELAY} seconds...")
                time.sleep(RETRY_DELAY)
            else:
                # No attempt follows, so waiting here would only waste time.
                print("Rate limit hit, no retries left.")
    return ""

def _save_summaries():
    """Write every summary collected so far to OUTPUT_JSON."""
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(summaries, f, indent=2)


for paper in papers:
    # Read every required field up front so a malformed record fails before
    # any paid API calls are made for it.
    filename = paper["filename"]
    body_text = paper["body"]
    abstract = paper["abstract"]
    print(f"Processing {filename}...")

    # chunking body text
    chunks = chunk_text(body_text)
    print(f" - Split into {len(chunks)} chunks.")

    # summarizing each chunk; failed chunks are reported and left out so
    # empty strings never end up in the combined text
    chunk_summaries = []
    for i, chunk in enumerate(chunks, start=1):
        print(f"   Summarizing chunk {i} / {len(chunks)}")
        chunk_summary = summarize_text(chunk)
        if not chunk_summary:
            print(f"   Failed to summarize chunk {i}")
            continue
        chunk_summaries.append(chunk_summary)

    # combining chunk summaries
    combined_summary_text = " ".join(chunk_summaries).strip()

    if not combined_summary_text:
        # Nothing was summarized; asking the model to summarize empty text
        # would only produce an invented summary.
        print(f"No chunk summaries available for {filename}. Storing an empty summary.")
        final_summary = ""
    else:
        print(" - Combining chunk summaries into final summary...")
        final_summary = summarize_text(combined_summary_text)
        if not final_summary:
            print(f"Failed to generate final summary for {filename}. Using combined chunk summaries instead.")
            final_summary = combined_summary_text

    summaries.append({
        "filename": filename,
        "abstract": abstract,
        "summary": final_summary,
    })

    # Checkpoint after each paper so an interruption or crash later in the
    # run does not discard summaries that were already paid for.
    _save_summaries()

# saving summaries to json (also covers the case where papers is empty)
_save_summaries()

print(f"All done! Summaries saved to {OUTPUT_JSON}")

Processing psychpaper14.pdf...
 - Split into 26 chunks.
   Summarizing chunk 1 / 26
   Summarizing chunk 2 / 26
   Summarizing chunk 3 / 26
   Summarizing chunk 4 / 26
   Summarizing chunk 5 / 26
   Summarizing chunk 6 / 26
   Summarizing chunk 7 / 26
   Summarizing chunk 8 / 26
   Summarizing chunk 9 / 26
   Summarizing chunk 10 / 26
   Summarizing chunk 11 / 26
   Summarizing chunk 12 / 26
   Summarizing chunk 13 / 26
   Summarizing chunk 14 / 26
   Summarizing chunk 15 / 26
   Summarizing chunk 16 / 26
   Summarizing chunk 17 / 26
   Summarizing chunk 18 / 26
   Summarizing chunk 19 / 26
   Summarizing chunk 20 / 26
   Summarizing chunk 21 / 26
   Summarizing chunk 22 / 26
   Summarizing chunk 23 / 26
   Summarizing chunk 24 / 26
   Summarizing chunk 25 / 26
   Summarizing chunk 26 / 26
 - Combining chunk summaries into final summary...
Processing psychpaper15.pdf...
 - Split into 24 chunks.
   Summarizing chunk 1 / 24
   Summarizing chunk 2 / 24
   Summarizing chunk 3 / 24
   Summa