## 10. Init

In [1]:
import os
from dotenv import load_dotenv
from pymongo import MongoClient
import hashlib

# Connection setup -----------------------------------
# Pull settings from a .env file (python-dotenv) into the process environment.
load_dotenv()

# MONGO_URI_PV supplies the connection string; os.getenv yields None when it is unset.
mongoClient = MongoClient(os.getenv('MONGO_URI_PV'))

# Handles shared by the later cells: the database and its two collections.
database = mongoClient["pv_data_db"]
coll_ausgaben = database["ausgaben"]  # one document per imported issue
coll_artikel = database["artikel"]    # one document per text chunk

## 20. Import PDF-files, extract text, save to txt-file.

In [None]:
import pymupdf as fitz
import os

# Folder paths: source PDFs and destination for the extracted text files
pdf_folder = 'ausgaben'
text_folder = 'ausgaben_txt'

# Create the output folder if it does not exist yet
# (exist_ok avoids the separate check-then-create step).
os.makedirs(text_folder, exist_ok=True)

# Walk over all PDF files in the source folder
for filename in os.listdir(pdf_folder):
    if not filename.endswith('.pdf'):
        continue

    pdf_path = os.path.join(pdf_folder, filename)
    text_path = os.path.join(text_folder, f"{os.path.splitext(filename)[0]}.txt")

    # Open the PDF as a context manager so the document (and its file handle)
    # is closed again; previously every opened document stayed open.
    with fitz.open(pdf_path) as pdf_document:
        # Concatenate the text of all pages in page order
        text = "".join(
            pdf_document.load_page(page_num).get_text()
            for page_num in range(pdf_document.page_count)
        )

    # Write the extracted text to a UTF-8 text file named after the PDF
    with open(text_path, 'w', encoding='utf-8') as text_file:
        text_file.write(text)

print("Text aus allen PDF-Dateien wurde extrahiert und gespeichert.")

## 30. Import Ausgaben im TXT Format

In [None]:
txt_folder = "ausgaben_txt"

for filename in os.listdir(txt_folder):
    # Only extracted text files are issues; other directory entries would
    # break the filename parsing below.
    if not filename.endswith(".txt"):
        continue

    # Read the file as UTF-8, the encoding the extraction step wrote it with
    # (the platform default encoding may differ and garble umlauts).
    with open(os.path.join(txt_folder, filename), "r", encoding="utf-8") as file:
        content = file.read()

    # Build doknr from the space-separated filename (extension stripped):
    # characters 1-2 of the third token are the issue number, the fourth
    # token is the year.
    name_parts = filename[:-4].split(" ")
    ausgabe = name_parts[2][1:3]
    jahrgang = name_parts[3]
    doknr = f"{jahrgang}.{ausgabe}"

    # Check whether an issue with this doknr already exists in the database
    if coll_ausgaben.find_one({"doknr": doknr}):
        print(f"File {filename} already exists in database")
    else:
        # Insert new document into database
        coll_ausgaben.insert_one({
            "doknr": doknr,
            "jahrgang": int(jahrgang),
            "ausgabe": int(ausgabe),
            "text": content,
            })
        print(f"File {filename} added to database")

## 40. In Chunks zerlegen

In [4]:
def chunk_text_to_dataframe(text:str, chunk_size:int, overlap:int=0) -> list:
    """
    Splits a text into fixed-size character chunks.

    Despite the historical name, no DataFrame is involved: the result is a
    plain list of dicts.

    Args:
        text: The input text string.
        chunk_size: The desired size of each chunk (number of characters).
        overlap: The number of characters shared between consecutive chunks.
            Defaults to 0 (no overlap).

    Returns:
        A list of dicts, one per chunk, with the keys 'start' (inclusive
        offset), 'end' (exclusive offset) and 'text' (text[start:end]).
        Returns an empty list if the input text is None or empty, so callers
        can always iterate over the result of a valid call.
        Returns None if chunk_size is invalid (<= 0) or overlap is negative
        or greater than or equal to chunk_size.
    """

    if not text:  # Handle None or empty input
        return []

    if chunk_size <= 0:
        return None

    if overlap < 0 or overlap >= chunk_size:
        return None

    text_length = len(text)
    step = chunk_size - overlap  # > 0 because overlap < chunk_size

    chunks = []
    start = 0
    while start < text_length:
        end = min(start + chunk_size, text_length)  # Never run past the end of the text
        chunks.append({'start': start, 'end': end, 'text': text[start:end]})
        if end == text_length:
            # The text is fully covered. Continuing would only emit a chunk
            # that lies entirely inside the overlap of the one just added.
            break
        start += step

    return chunks

## 50. Chunks in Collection ARTIKEL einfügen

In [None]:
# Chunking parameters, in characters
CHUNK_SIZE = 5000
CHUNK_OVERLAP = 500

cursor = coll_ausgaben.find()
for ausgabe in cursor:
    # Skip issues that already have chunks so that re-running this cell does
    # not insert duplicates. NOTE: an issue whose chunking was interrupted
    # half-way is skipped as well; delete its chunks to redo it.
    if coll_artikel.find_one({"doknr": ausgabe['doknr']}):
        continue

    # The chunker can return None (e.g. for an issue without text);
    # treat that as "no chunks" instead of failing in the loop below.
    chunks = chunk_text_to_dataframe(ausgabe['text'], CHUNK_SIZE, CHUNK_OVERLAP) or []
    for chunk in chunks:
        coll_artikel.insert_one({
            "doknr": ausgabe['doknr'],
            "start": chunk['start'],
            "end": chunk['end'],
            "text": chunk['text'],
            # Empty list marks "embedding still to be generated"
            "embeddings": [],
            # "hash": hashlib.md5(chunk['text'].encode()).hexdigest()
        })


In [None]:
# Overview of every stored chunk: document number plus its character range.
for artikel in coll_artikel.find():
    chunk_label = f"{artikel['doknr']} [{artikel['start']}-{artikel['end']}]"
    print(chunk_label)


## 60. Embeddings in Collection ARTIKEL generieren

In [7]:
import os
import torch
from transformers import BertTokenizer, BertModel

# Checkpoint used for both the tokenizer and the encoder
model_name = "bert-base-german-cased" # 768 dimensions
# model_name = "bert-base-multilingual-cased"

# Set before the tokenizer is created; presumably meant to silence the
# tokenizers fork/parallelism warning (confirm).
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

# Load the pre-trained tokenizer and model for that checkpoint
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Embeddings -------------------------------------------------            
def create_embeddings(text: str) -> list:
    """
    Embed a text with the module-level BERT tokenizer and model.

    The text is tokenized (with truncation enabled), run through the model
    without gradient tracking, and the last hidden state is averaged over
    the token dimension.

    Args:
        text: The text to embed.

    Returns:
        The mean-pooled last hidden state, squeezed and converted to a
        (nested) Python list of floats.
    """
    # NOTE(review): truncation=True cuts the input at the model's maximum
    # length (presumably 512 tokens for this checkpoint), so long chunks are
    # only partially represented. Confirm this is acceptable.
    model_inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

    # Inference only, so no autograd bookkeeping is needed
    with torch.no_grad():
        hidden_states = model(**model_inputs).last_hidden_state

    pooled = hidden_states.mean(dim=1)  # average over dim 1 of the hidden states
    return pooled.squeeze().tolist()

def generate_embeddings(input_field: str, output_field: str, 
                        max_iterations: int = 0) -> None:
    """
    Generate and store embeddings for all records that do not have one yet.

    Selects the documents in coll_artikel whose output_field is an empty
    list, embeds the content of input_field with create_embeddings and
    writes the result back to output_field.

    Args:
        input_field: Name of the document field holding the text to embed.
        output_field: Name of the document field that receives the embedding;
            an empty list in this field marks a record as pending.
        max_iterations: Maximum number of records to process in this call;
            0 (default) means no limit.

    Returns:
        None. Prints the number of processed records.
    """
    cursor = coll_artikel.find({output_field: []})
    if max_iterations != 0:
        cursor = cursor.limit(max_iterations)

    # Materialise the result so its length is available for the summary below
    cursor_list = list(cursor)
    for record in cursor_list:
        article_text = record[input_field]
        if not article_text:
            # Embed a placeholder for empty texts. Previously the placeholder
            # was assigned but never used, so such records were skipped, kept
            # their empty embedding forever and were still counted below.
            article_text = "Fehler: Kein Text vorhanden."
        embeddings = create_embeddings(text=article_text)
        coll_artikel.update_one({"_id": record['_id']}, {"$set": {output_field: embeddings}})
    print(f"\nGenerated embeddings for {len(cursor_list)} records.")

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Embed the "text" field of every chunk whose "embeddings" field is still an
# empty list; max_iterations is left at its default of 0, i.e. no limit.
generate_embeddings("text", "embeddings")


Generated embeddings for 0 records.
