In [15]:
# set up deps

import dotenv
import os
import sys
from google import genai
# Load variables from the local .env file. override=True asks python-dotenv to
# let values from .env replace variables already present in the environment.
dotenv.load_dotenv(".env", override=True)
# Indexing os.environ raises KeyError right here if GEMINI_API_KEY is not set.
client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
# Put the 'app' directory at the front of the import path. The membership check
# keeps re-runs of this cell from stacking duplicate sys.path entries.
app_dir = os.path.abspath('app')
if app_dir not in sys.path:
    sys.path.insert(0, app_dir)


In [2]:
# convert pdf to txt

import os 
import PyPDF2

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One string holding the extracted text of all pages, with no separator
        inserted between pages. A page for which the reader yields no text
        contributes an empty string.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ''` guards against extract_text() handing back None for a page,
        # which would make the concatenation raise TypeError. The join is
        # evaluated inside the `with` block, while the file is still open.
        return ''.join(page.extract_text() or '' for page in reader.pages)

# Convert every PDF under data/pdf into a text file under data/txt and keep the
# extracted text in memory, keyed by the original PDF file name.
data_dir = "data"
pdf_dir = os.path.join(data_dir, "pdf")
txt_dir = os.path.join(data_dir, "txt")
# Make sure the output directory exists so a fresh checkout can run this cell.
os.makedirs(txt_dir, exist_ok=True)

product_text_dict = {}
# os.listdir() returns entries in arbitrary order; sorting makes the dictionary
# order (and any later "first N documents" sample) reproducible across runs.
for filename in sorted(os.listdir(pdf_dir)):
    stem, ext = os.path.splitext(filename)
    # Skip anything that is not a PDF (e.g. hidden files dropped by the OS).
    if ext.lower() != ".pdf":
        continue
    # Extract before opening the output file, so a failing PDF does not leave
    # an empty .txt file behind.
    text = extract_text_from_pdf(os.path.join(pdf_dir, filename))
    # Only the extension is swapped (str.replace("pdf", "txt") would also
    # rewrite "pdf" inside the name itself). The encoding is explicit because
    # the text contains non-ASCII characters.
    with open(os.path.join(txt_dir, stem + ".txt"), "w", encoding="utf-8") as f:
        f.write(text)
    product_text_dict[filename] = text
    print("=====")
    print(text)
        


=====
Produktfamilien-Vorteile
_Kurzbogen mit sehr hoher Leuchtdichte für hellere Leinwandausleuchtung
_Konstante Farbtemperatur von 6.000 K über die gesamte Lebensdauer der Lampe
_Einfach zu warten
_Hohe Lichtbogenstabilität
_Sofort Licht auf der Leinwand dank Heißwiederzündungsfunktion
_Breiter Dimmbereich
Produktfamilien-Eigenschaften
_Farbtemperatur: ca. 6.000 K (Daylight)
_Leistung: 450…10.000 W
_Sehr guter Farbwiedergabeindex: Ra >
Produktdatenblatt
 
XBO 10000 W/HS OFR
XBO for cinema projection | Xenon-Kurzbogenlampen 450…10.000 W
 Anwendungsgebiete
_Klassische 35-mm-Filmprojektion
_Digitale Film- und Videoprojektion
_Architektur- und Effektlicht („Light Finger“)
_Sonnensimulation
__
7. Januar 2024, 20:18:27 © 2024, OSRAM GmbH. Alle Rechte vorbehalten.
XBO 10000 W/HS OFR Seite 1 von 4Technische Daten
Elektrische Daten
Nennstrom 195,00 A
Stromsteuerbereich 160…200 A
Nennleistung 10000,00 W
Nennspannung 50,0 V
Abmessungen & Gewicht
 
Durchmesser 90,0 mm
Länge 436,0 mm
Länge mit So

In [None]:
# from a sample of 5 create pydantic model. This took multiple shots to get the types right.

# Take the first five (file name, text) pairs, in dictionary order, as the
# sample the model infers the schema from.
product_texts = dict(list(product_text_dict.items())[:5])

# The sample dictionary is interpolated into the prompt via its repr.
prompt = f"""I am supplying a set of documents of products. 
I would like to have all information transformed to structured data. 
Please identify all fields of the products.
Include any fields that is present in atleast 2 products.
Output should include a pydantic model with English variable names and German annotations(comments).
Make sure to use number formats whenever possible, and include quantitative unit in variable names.
Make sure to represent ranges in two separate fields instead of one field.
Also provide an example object initialization.
{product_texts}"""

# Send the prompt to the model and show the raw text of its answer.
response = client.models.generate_content(
    contents=prompt,
    model="gemini-2.5-pro-exp-03-25",
)
print(response.text)

In [None]:
# create database schema based on model (accidentally I did it in my browser)

In [11]:
# create db population logic

from app.models import XenonLamp

# Read the whole init script as one string so it can be embedded in the prompt
# (readlines() would give a list, which interpolates as a Python list repr).
with open("init_db.sql", "r") as f:
    init_db_text = f.read()

# Fix: the prompt announces "the init db script and the json schema", but the
# script was previously read and never included. Both are now supplied.
prompt = f"""Based on the init db script and the json schema I provide, implement a python function that can insert an array of jsons into the database
     init db script: {init_db_text}
     json schema: {XenonLamp.model_json_schema()}
"""
print(prompt)
response = client.models.generate_content(
    model="gemini-2.5-pro-exp-03-25",
    contents=prompt
)

print(response.text)

In [25]:
# extract structured data (as JSON) from the document text
import json
# For every extracted document, ask the model for a JSON object constrained by
# the XenonLamp schema, attach the raw text under "content", and write the
# result to data/json/<document name>.json.
json_dir = "data/json"
# Make sure the output directory exists before the first write.
os.makedirs(json_dir, exist_ok=True)

for file_name, doc in product_text_dict.items():
    prompt = f"""
    Extract structured product information from the following text.
    The text is from a technical product description document in German.
    Extract data for the following fields (with their German names in parentheses):
    Return ONLY a valid JSON object that matches supplied model schema:
    Use null for missing values. Include only fields that have information in the document.
    Make sure to strip leading "_" characters consistently.
    Dates should be formatted in the following format: yyyy-mm-dd
    Text:
    {doc}
    Format your response as a valid JSON object that matches the above schema. Do not include any explanation or extra text.
    """
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        config={
            "response_mime_type": "application/json",
            "response_schema": XenonLamp,
        },
        contents=prompt
    )
    # An answer without text cannot be parsed; report it and move on so the
    # remaining documents are still processed.
    if not response.text:
        print(f"Warning: empty response for {file_name}; skipping.")
        continue
    # Parse before opening the output file: opening with "w" truncates, so a
    # parse error would otherwise leave an empty .json file behind.
    product_json = json.loads(response.text)
    product_json["content"] = doc
    # Swap only the extension; str.replace(".pdf", ".json") would also rewrite
    # a ".pdf" occurring inside the name.
    output_path = os.path.join(json_dir, os.path.splitext(file_name)[0] + ".json")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(product_json, indent=2))

In [6]:
# count tokens. I decided not to chunk the documents.

import tiktoken

def count_tokens(text, encoding_name="cl100k_base"):
    """Return the number of tokens `text` encodes to under a tiktoken encoding.

    Args:
        text: The string to tokenize.
        encoding_name: Name of the tiktoken encoding to use. If the name is
            not recognised, a warning is printed and "cl100k_base" is used
            instead.

    Returns:
        The token count as an int.
    """
    try:
        encoding = tiktoken.get_encoding(encoding_name)
    except (KeyError, ValueError):
        # tiktoken.get_encoding reports an unknown name with ValueError, so
        # catching only KeyError left this fallback unreachable. KeyError is
        # still caught to stay backward-compatible.
        print(f"Warning: encoding {encoding_name} not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")

    return len(encoding.encode(text))

# Print one token count per extracted document, in dictionary order.
for document_text in product_text_dict.values():
    print(count_tokens(document_text))

1194
1162
1128
1139
1159
1205
1141
1181
1176
1185
1221
1132
1213
1147
1136
1138
1204
1162
1079
1152
1128


In [None]:
# I implemented embedding the documents without chunking; however, for this dataset I do not find it very useful, as the documents are very similar.
# My intuition tells me that I will rely on the SQL queries to retrieve the correct document if it's needed.

from app.embedding import XenonLampEmbeddingSystem
import psycopg2

from app.populate_db import get_db_params_from_env

# Embed a free-text query and fetch the five stored documents ranked closest
# to it by the `<=>` operator (cosine distance per the pgvector docs).
embedding_system = XenonLampEmbeddingSystem()
query_embedding = embedding_system.generate_embedding("4500 W/HS")
# Convert once; the same list is bound to both %s placeholders below.
query_vector = query_embedding.tolist()

similarity_sql = """
    SELECT id, content, 1 - (embedding <=> %s::vector) AS similarity
    FROM xenon_lamps
    ORDER BY embedding <=> %s::vector
    LIMIT 5;
"""

conn = psycopg2.connect(**get_db_params_from_env())
try:
    # The cursor context manager closes the cursor when the block exits, and
    # the finally clause closes the connection even if the query raises, so
    # neither is leaked on error.
    with conn.cursor() as cur:
        cur.execute(similarity_sql, (query_vector, query_vector))
        results = cur.fetchall()
finally:
    conn.close()

for row in results:
    print(row)

(7, 'Produktfamilien-Vorteile\n_Kurzbogen mit sehr hoher Leuchtdichte für hellere Leinwandausleuchtung\n_Konstante Farbtemperatur von 6.000 K über die gesamte Lebensdauer der Lampe\n_Einfach zu warten\n_Hohe Lichtbogenstabilität\n_Sofort Licht auf der Leinwand dank Heißwiederzündungsfunktion\n_Breiter Dimmbereich\nProduktfamilien-Eigenschaften\n_Farbtemperatur: ca. 6.000 K (Daylight)\n_Leistung: 450…10.000 W\n_Sehr guter Farbwiedergabeindex: Ra >\nProduktdatenblatt\n \nXBO 10000 W/HS OFR\nXBO for cinema projection | Xenon-Kurzbogenlampen 450…10.000\xa0W\n Anwendungsgebiete\n_Klassische 35-mm-Filmprojektion\n_Digitale Film- und Videoprojektion\n_Architektur- und Effektlicht („Light Finger“)\n_Sonnensimulation\n__\n7. Januar 2024, 20:18:27 © 2024, OSRAM GmbH. Alle Rechte vorbehalten.\nXBO 10000 W/HS OFR Seite 1 von 4Technische Daten\nElektrische Daten\nNennstrom 195,00 A\nStromsteuerbereich 160…200 A\nNennleistung 10000,00 W\nNennspannung 50,0 V\nAbmessungen & Gewicht\n \nDurchmesser 90,