In [1]:
!pip install langchain langchain-google-genai docling



In [68]:
# 0. Import libraries
from docling.document_converter import DocumentConverter
from google.colab import userdata
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate
from pathlib import Path

import google.generativeai as genai
import json
import os
import re

In [3]:
# 1. Configure LLM settings (Gemini with API)
# Default file name for extracted metadata.
OUTPUT_JSON_PATH = 'extracted_data.json'

try:
    # Read the key from the Colab secret store, export it through the process
    # environment, and hand the same value to the google.generativeai client.
    os.environ['GOOGLE_API_KEY'] = userdata.get('GOOGLE_API_KEY')
    genai.configure(api_key=os.environ['GOOGLE_API_KEY'])
except Exception as e:
    print(f"❌ | Error configuring Gemini API: {e}")
    # Re-raise rather than calling exit(): in a notebook exit() tears down the
    # kernel session (losing all state) instead of simply failing this cell.
    # Re-raising also stops a "Run all" at the point of failure.
    raise

In [81]:
# 2. Read a PDF file and convert to Markdown (To be parsed later)
def convert_markdown(source, markdown_path, num_pages=10):
  """Convert the leading pages of a document to Markdown and save it to disk.

  Args:
    source: Document to convert; passed unchanged to DocumentConverter.convert.
    markdown_path: File that receives the exported Markdown.
    num_pages: Upper bound of the (1, num_pages) page range that is converted.

  Returns:
    The conversion result on success. None if conversion, export or writing
    failed; the error is printed instead of being raised.
  """
  converter = DocumentConverter()
  try:
    result = converter.convert(source,
                               page_range=(1,num_pages))

    print("✅ | Sucess on extracting data from PDF...")
    # Render the Markdown BEFORE opening the target file. Opening with "w"
    # creates/truncates the file immediately, so a failing export would leave
    # an empty file behind that a later "does the markdown exist?" check would
    # mistake for a valid cached conversion.
    markdown_text = result.document.export_to_markdown()
    with open(markdown_path, "w") as f:
      f.write(markdown_text)
    return result
  except Exception as e:
    print(f"❌ | Error extracting data from PDF: {e}")
    return None

In [99]:
# 3. Invoke LLM question using Markdown as context to create the json file
def generate_json_from_text(text, json_path):
  """Ask Gemini to extract paper metadata from `text` and save it as JSON.

  Args:
    text: Text of the paper; substituted into the prompt as the document body.
    json_path: Destination path passed to `save_to_json` for the parsed result.

  Returns:
    None. On success the parsed object is handed to `save_to_json`. Any failure
    (request error, invalid JSON) is printed and swallowed.
  """
  llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2)

  prompt_template = """
  You are a professional academic paper parser designed to extract metadata into a JSON format.

  Based on the following text from a dissertation or thesis, please extract the required information.
  The source text may contain character encoding errors (e.g., "SIMULAC¸ ˜AO" instead of "SIMULAÇÃO").
  Please correct these errors to produce clean, readable text in your final JSON output.

  If a specific piece of information is not found, leave the corresponding field blank ("").
  Return *only* the raw JSON object, without any surrounding text, explanations, or markdown formatting.
  Important! The fields that are in another language other than english MUST be translated to english.

  JSON fields to extract:
  1. paper_name: The full title of the paper.
  2. author_name: The full name of the main author.
  3. main_supervisor_name: The full name of the primary supervisor, advisor, or orientador.
  4. keywords: A list of relevant keywords, often found in an abstract or keyword section.
  5. year_release: The year the paper was published or defended.
  6. pgp_name: The name of the post-graduate program (e.g., Master's in "FIELD_HERE", PhD in "FIELD_HERE"... always in this format).

  Here is the paper:
  ---
  {document_text}
  ---
  """
  prompt = PromptTemplate.from_template(prompt_template)
  chain = prompt | llm

  # Holds the model's raw reply so the JSON error handler can show exactly what
  # was received; stays None if the request itself never returned.
  raw_output = None
  try:
    print("🧠 | Sending request to Gemini Flash...")
    response = chain.invoke({"document_text": text})
    raw_output = response.content
    # Drop Markdown code fences (```json ... ```) in case the model wraps the
    # object in them despite the prompt asking for raw JSON.
    cleaned_response = re.sub(r'```json\s*|\s*```', '', raw_output.strip(), flags=re.MULTILINE)

    print("🔄 | Received response. Attempting to parse JSON...")
    save_to_json(json.loads(cleaned_response), json_path)

  except json.JSONDecodeError:
    print("❌ | Failed to decode JSON. The model's response was not valid JSON.")
    # Print the same `content` attribute that was parsed above, so the
    # diagnostic shows the actual reply text.
    print("Model Output:\n", raw_output)
  except Exception as e:
    print(f"❌ | An error occurred while calling the Gemini API: {e}")

In [95]:
# 4. Save the response as a JSON file
def save_to_json(data, file_path):
    """Write `data` to `file_path` as indented UTF-8 JSON and echo it to stdout.

    Non-ASCII characters are written literally rather than escaped. Any error
    raised while writing or printing is reported on stdout and not propagated.

    Args:
        data: JSON-serialisable object to persist.
        file_path: Path of the file to create or overwrite.

    Returns:
        None.
    """
    # Shared by the file dump and the console echo so both render identically.
    json_options = {"ensure_ascii": False, "indent": 4}
    try:
        with open(file_path, 'w', encoding='utf-8') as out_file:
            json.dump(data, out_file, **json_options)
        print(f"✅ | Done! JSON Data saved to '{file_path}'")
        print(json.dumps(data, **json_options))
    except Exception as err:
        print(f"❌ | Error saving data to JSON file: {err}")

In [103]:
# 5. (Static) Testing json output with example pdf file
# Input PDF; the Markdown cache and JSON output reuse its base name and are
# written relative to the current working directory.
pdf_path = '/content/tese_zamberlan.pdf'
markdown_path = f"{Path(pdf_path).stem}.md"
json_path = f"{Path(pdf_path).stem}.json"

if not os.path.exists(markdown_path):
    # No cached Markdown yet: convert the PDF (first 30 pages) and take the
    # text from the conversion result; a failed conversion yields None.
    print(f"🔄 | Converting PDF '{pdf_path}' to markdown...")
    pdf_result = convert_markdown(pdf_path, markdown_path, num_pages=30)
    pdf_text = pdf_result.document.export_to_markdown() if pdf_result else None
else:
    # A previous run already produced the Markdown; reuse it.
    print(f"🔄 | Reading markdown from file '{markdown_path}'")
    with open(markdown_path, "r") as f:
        pdf_text = f.read()
    print(f"✅ | Read {len(pdf_text)} characters markdown from '{markdown_path}'")

# Only query the model when there is some text to send.
if pdf_text:
    generate_json_from_text(pdf_text, json_path)

🔄 | Converting PDF '/content/tese_zamberlan.pdf' to markdown...
✅ | Sucess on extracting data from PDF...
🧠 | Sending request to Gemini Flash...
🔄 | Received response. Attempting to parse JSON...
✅ | Done! JSON Data saved to 'tese_zamberlan.json'
{
    "paper_name": "Multi-agent system for evaluating the agglomeration effect in polymeric nanoparticles",
    "author_name": "Alexandre de Oliveira Zamberlan",
    "main_supervisor_name": "Solange Binotto Fagan",
    "keywords": [
        "Nanoscience",
        "Characterization of Nanoparticles",
        "Computer Science",
        "Event-Oriented Simulation",
        "Agents"
    ],
    "year_release": "2018",
    "pgp_name": "PhD in Nanosciences"
}
