In [21]:
import os 
from dotenv import load_dotenv
# Populate the process environment via python-dotenv before the key is read below.
load_dotenv()

# API key handed to the chat model in a later cell.
# os.getenv returns None when the variable is not set, so a missing key is not
# reported here.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

In [22]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model client used by the summarisation step further down.
llm = ChatGoogleGenerativeAI(
    temperature=0,  # lowest sampling temperature, presumably for repeatable output
    model="gemini-2.5-pro",
    api_key=GOOGLE_API_KEY  # read from the environment in an earlier cell
)

In [23]:
from pathlib import Path
from langchain_community.document_loaders import PyMuPDFLoader

def load_pdf_documents(directory, loader_factory=None):
    """Load every ``*.pdf`` directly inside ``directory`` into one flat list.

    Args:
        directory: Folder to scan (``str`` or ``Path``). Sub-folders are not searched.
        loader_factory: Callable that takes a file path string and returns an
            object with a ``load()`` method yielding a list of documents.
            Defaults to ``PyMuPDFLoader``.

    Returns:
        list: Documents from all files that loaded successfully, in file-name
        order. A file that fails to load is reported and skipped (best effort).
    """
    # glob() yields files in filesystem order; sort so the page order (and thus
    # the text sent to the model) is the same on every run and machine.
    pdf_paths = sorted(Path(directory).glob("*.pdf"))
    if not pdf_paths:
        print(f"Warning: no PDF files found in {Path(directory).resolve()}")
    elif loader_factory is None:
        loader_factory = PyMuPDFLoader

    loaded = []
    for file in pdf_paths:
        try:
            # Pass a plain string path: accepted by every loader version.
            loader = loader_factory(str(file))
            loaded.extend(loader.load())
            print(f"file loaded successfully: {file.name}")
        except Exception as e:
            # Deliberate best effort: one unreadable PDF must not abort the rest.
            print(f"Error loading {file.name}: {e}")
    return loaded


data_path = Path("articles/")
docs = load_pdf_documents(data_path)
print(f"{len(docs)} documents loaded ")

file loaded successfully: 2505.08672v2.pdf
file loaded successfully: Teaching high school students about generative AI  Cases of teacher lesson design.pdf
44 documents loaded 


In [24]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded pages into chunks with a target size of 2000 and an overlap of
# 200 between neighbouring chunks (character counts under the splitter's default
# length function — TODO confirm).
splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
chunks = splitter.split_documents(docs)

In [25]:
# System prompt for the summarisation call. Part 2 is spelled out in full: the
# model only sees this text, so a "same as before" placeholder gives it nothing
# to follow.
summarize_prompt = """
# Persona
You are an expert Research Assistant with a specialization in analyzing academic papers at the intersection of Artificial Intelligence and Education.

# Context
Your purpose is to assist a first-year software engineering student who is researching Generative AI in education. You will be provided with one or more relevant academic articles via a retrieval system. Your task is to process the retrieved document(s) and generate a structured, two-part report that synthesizes the key information.

# Task & Instructions
Based on the retrieved context from the provided article(s), generate a report with two distinct parts.

## Part 1: Comparative Analysis
First, generate a comparative analysis. For EACH article, create a separate section with the following structure using Markdown headings:

### Article Title: [Insert Full Title of the Article]
- **Core Problem:** The main research question or problem the authors are addressing.
- **Methodology:** The research methods used (e.g., literature review, case study, experiment, survey).
- **Key Findings:** The most important conclusions or results of the study.
- **Relevance to GenAI in Education:** A brief statement on how this article contributes to the overall research topic.

--- (Use a horizontal rule to separate the articles)

## Part 2: Detailed Article Summaries
After the comparative analysis, provide a detailed summary for EACH article separately. For EACH article, create a separate section with the following structure using Markdown headings:

### Article Title: [Insert Full Title of the Article]
- **Background and Objectives:** The context of the study and what the authors set out to achieve.
- **Approach:** How the study was designed and carried out, including participants, data, or tools where reported.
- **Results:** The main results, with the supporting details reported in the article.
- **Limitations:** Limitations acknowledged by the authors or evident from the study design.
- **Implications:** What the findings imply for teaching, learning, or future research.

--- (Use a horizontal rule to separate the articles)

# Constraints
- **Tone:** The language must be formal, technical, and analytical.
- **Format:** Strictly adhere to the two-part structure (Comparative Analysis first, then Detailed Summaries). Use Markdown for all formatting.
- **Scope:** Your analysis must be based *only* on the information contained within the retrieved document(s). Do not introduce external knowledge.
"""

In [26]:
from pydantic import BaseModel, Field

# Schema for the structured report requested from the model: two required
# Markdown strings, one per part of the report.
# NOTE(review): documented with comments rather than a class docstring on purpose —
# a docstring may be surfaced in the schema description sent to the model; confirm
# before adding one.
class DocumentSummary(BaseModel):
    # Part 1 of the report. Despite the name, the description asks for
    # per-article sections, not a table.
    comparative_table: str = Field(..., description="A comparative analysis of the articles, with each article in its own section.")
    # Part 2 of the report; the expected layout is defined in the prompt text.
    detailed_summaries: str = Field(..., description="Detailed summaries for each article as specified in the prompt.")


In [27]:
from langchain_core.messages import SystemMessage, HumanMessage

# Model wrapper bound to the DocumentSummary schema; presumably invoke() then
# yields a DocumentSummary instead of free text — confirm against the LangChain docs.
summarize_chain = llm.with_structured_output(DocumentSummary)

def summarize(retrieved_docs: list) -> "DocumentSummary":
    """Generate the two-part report from a list of documents.

    The text of every document is joined (blank-line separated) into one context
    block and sent to ``summarize_chain`` together with ``summarize_prompt``.

    Args:
        retrieved_docs: Objects exposing a ``page_content`` string
            (e.g. LangChain documents or chunks).

    Returns:
        The structured response produced by ``summarize_chain``.

    Raises:
        ValueError: If no document contains any non-blank text; calling the model
            with an empty context would only invite a made-up report.
        RuntimeError: If the chain returns ``None`` instead of a structured result.
    """
    # Pages without extractable text (e.g. scanned images) add nothing but noise.
    texts = [
        doc.page_content
        for doc in retrieved_docs
        if doc.page_content and doc.page_content.strip()
    ]
    if not texts:
        raise ValueError("summarize() needs at least one document with non-empty page_content")

    context = "\n\n".join(texts)
    messages = [
        SystemMessage(content=summarize_prompt),
        HumanMessage(content=f"Context:\n{context}\n\nPlease generate the report as specified.")
    ]
    response = summarize_chain.invoke(messages)
    if response is None:
        # Fail here with a clear message rather than with an AttributeError later.
        raise RuntimeError("The model did not return a structured summary; retry the call.")
    return response

# Runs the model call over the whole corpus (every chunk is sent in one request).
# NOTE(review): `chunks` were split with chunk_overlap=200 and summarize() joins all
# of them, so overlapping text reaches the model twice; passing `docs` instead may
# be what is wanted — confirm.
summary = summarize(chunks)

In [28]:
import re

def tidy_markdown(text: str) -> str:
    """Normalise whitespace in generated Markdown.

    - Converts CRLF / CR line endings to LF, so text-mode writes on Windows do
      not end up with doubled carriage returns.
    - Strips leading and trailing whitespace from the whole text.
    - Collapses any run of two or more blank lines into a single blank line,
      where a line holding only spaces or tabs counts as blank.

    Indentation of non-blank lines and trailing spaces on them are left alone.

    Args:
        text: Markdown text to clean.

    Returns:
        The cleaned text.
    """
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = text.strip()
    # One newline ends the current line; two or more following lines made up only
    # of spaces/tabs are squeezed down to exactly one empty line.
    text = re.sub(r"\n(?:[ \t]*\n){2,}", "\n\n", text)
    return text

# Clean both report parts in place; the model object itself is reused downstream.
summary.comparative_table = tidy_markdown(summary.comparative_table)
summary.detailed_summaries = tidy_markdown(summary.detailed_summaries)


In [29]:
import datetime

def summary_to_markdown(summary: "DocumentSummary", output_path: str = "summaries"):
    """Write the report to a timestamped Markdown file.

    Args:
        summary: Object with ``comparative_table`` and ``detailed_summaries``
            string attributes.
        output_path: Directory for the file; created if it does not exist.

    Returns:
        The path of the written file, or ``None`` when the directory could not be
        created or the file could not be written. Callers can therefore guard
        follow-up steps with ``if path:``.
    """
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    filename = os.path.join(output_path, f"summary_{timestamp}.md")

    try:
        # Inside the try so a bad output directory is reported like any other
        # write failure instead of escaping as an unhandled exception.
        os.makedirs(output_path, exist_ok=True)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("# AI generated document summary:\n\n")
            f.write("## Comparative Analysis\n\n")
            f.write(summary.comparative_table)
            f.write("\n\n## Detailed Summaries\n\n")
            f.write(summary.detailed_summaries)
    except Exception as e:
        print(f"Error saving summary to file: {e}")
        # No usable file exists: signal failure rather than returning its path.
        return None

    print(f"Summary saved successfully to {filename}")
    return filename


In [30]:
import mistune
from weasyprint import HTML

def markdown_to_pdf(mdfile: str):
    """Render a Markdown file to a PDF stored next to it.

    The PDF path is the Markdown path with its trailing ``.md`` extension
    swapped for ``.pdf``. If the input has a different extension (or none),
    ``.pdf`` is appended, so the source file is never the write target.

    Args:
        mdfile: Path of the Markdown file to convert.

    Returns:
        The path of the generated PDF, or ``None`` if reading or conversion
        failed (the error is printed).
    """
    mdfile = os.fspath(mdfile)
    # Only the final extension is touched; a ".md" elsewhere in the path
    # (e.g. a folder called "notes.md.d") must stay as it is.
    root, ext = os.path.splitext(mdfile)
    pdf_file = (root if ext.lower() == ".md" else mdfile) + ".pdf"

    try:
        with open(mdfile, 'r', encoding='utf-8') as f:
            md_content = f.read()
        # Convert after the source file is closed; rendering does not need the handle.
        html_content = mistune.html(md_content)
        HTML(string=html_content).write_pdf(pdf_file)
        print(f"PDF file saved successfully to: {pdf_file}")
        return pdf_file
    except Exception as e:
        print(f"Error converting markdown to PDF: {e}")
        return None

# Write the Markdown report, then convert it to PDF when a file path came back.
markdown_filename = summary_to_markdown(summary)
if markdown_filename:
    markdown_to_pdf(markdown_filename)

Summary saved successfully to summaries\summary_2025-09-16_18-14-46.md
PDF file saved successfully to: summaries\summary_2025-09-16_18-14-46.pdf
