## Import PMG XML to DataFrame

In [1]:
# %pip install lxml
import pandas as pd
# import numpy as np
from lxml import etree


In [2]:
# Collect the file names found in the local PMG export directory into `files`.

import os

print("Current working directory:", os.getcwd())
path = '~/Dropbox/dev/DVV/dvv_content/pmg/'
# Expand "~" to the current user's home directory.
full_path = os.path.expanduser(path)
# os.listdir() returns entries in arbitrary order; sorting makes the
# processing order reproducible between runs and machines.
files = sorted(os.listdir(full_path))

# NOTE: nothing is inserted at this point; the number reported is the count
# of directory entries that were found.
print(f"Inserted {len(files)} files")


Current working directory: /Users/mweber/dev/chatdvv
Inserted 3398 files


In [3]:
import os
from dotenv import load_dotenv

from pymongo import MongoClient
from bson import ObjectId

# Init MongoDB Client
# load_dotenv() loads environment variables from a .env file, if present.
load_dotenv()
mongo_uri = os.environ.get('MONGO_URI_DVV')
if not mongo_uri:
    # Fail fast: MongoClient(None) would not raise but fall back to its
    # default host, so a missing variable would go unnoticed.
    raise RuntimeError("Environment variable MONGO_URI_DVV is not set")
mongoClient = MongoClient(mongo_uri)
database = mongoClient.dvv_content_pool
collection = database.dvv_artikel

In [4]:
# Sanity check: show the collection handle that the following cells use.
# NOTE(review): the printed repr includes the cluster host names - consider
# clearing this output before sharing the notebook.
print(collection)

Collection(Database(MongoClient(host=['ac-fzhvzuu-shard-00-00.aev3rng.mongodb.net:27017', 'ac-fzhvzuu-shard-00-01.aev3rng.mongodb.net:27017', 'ac-fzhvzuu-shard-00-02.aev3rng.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin', replicaset='atlas-gxqdk3-shard-0', tls=True), 'dvv_content_pool'), 'dvv_artikel')


In [19]:
# Scratch check: print the last three characters of a sample file name.
# NOTE(review): this assigns the module-level name `file`, which the import
# loop also uses as its loop variable.
file = "martin.xml"
print(file[-3:])

xml


In [None]:
#---------------------------------------------
# 22.06.2024 added fields quelle_id_embeddings, text_length
# 25.10.2024 added schlagworte
# 18.03.2024 added safe_int
#---------------------------------------------

from datetime import datetime

def safe_int(x: str = "") -> int:
    """Convert ``x`` to an integer, falling back to 0 when that is not possible.

    Args:
        x: Value to convert, typically the text of an XML element. ``None``,
            empty strings and non-numeric strings are tolerated.

    Returns:
        ``int(x)`` when the conversion succeeds, otherwise ``0``.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are mapped to 0; anything else (for
        # example KeyboardInterrupt) must propagate to the caller.
        return 0


# Rows of all articles parsed in this run; one list per article, in the field
# order expected by the DataFrame that is built from it.
data_collection = []

# Fetch the names of already imported files once, instead of issuing one
# find_one() round trip per file.
imported_files = set(collection.distinct("dateiname"))

for file in files:

    # Sources skipped on purpose (original notes):
    # nsh - no attribute text
    # raili - no attribute jahrgang
    if file[:3] in ["nsh", "rai"]:
        continue

    # Only XML files are parsed.
    if not file.endswith(".xml"):
        continue

    # Skip files whose articles are already stored in MongoDB.
    if file in imported_files:
        continue

    # os.path.join works regardless of whether full_path ends with a separator.
    tree = etree.parse(os.path.join(full_path, file))
    root = tree.getroot()

    data = []
    for elem in root.findall('artikel'):
        # Read metadata
        artikel_id = elem.find('metadaten/artikel-id').text
        print(f"Processing {file} {artikel_id}")

        quelle_id = elem.find('metadaten/quelle/quelle-id').text
        jahrgang = safe_int(elem.find('metadaten/quelle/jahrgang').text)
        nummer = safe_int(elem.find('metadaten/quelle/nummer').text)
        datum = elem.find('metadaten/quelle/datum').text
        seite_start = safe_int(elem.find('metadaten/quelle/seite-start').text)
        seite_ende = safe_int(elem.find('metadaten/quelle/seite-ende').text)

        # Read content
        title = elem.find('inhalt/titel-liste/titel').text
        untertitel_elem = elem.find('inhalt/titel-liste/untertitel')
        # "leer" marks articles that have no subtitle element.
        untertitel = untertitel_elem.text if untertitel_elem is not None else "leer"
        text_struktur = elem.find('inhalt/text')
        # Concatenate the text of the element and all of its descendants.
        text_content = "".join(text_struktur.itertext())
        text_length = len(text_content)

        # Additional fields: parsed date plus placeholders that start empty.
        ki_abstract = ""
        # datum is parsed as DDMMYYYY (format string "%d%m%Y").
        date = datetime.strptime(datum, "%d%m%Y")
        text_embeddings = []
        ki_embeddings = []
        quelle_id_embeddings = []
        schlagworte = []

        data.append([
            file, artikel_id, quelle_id, jahrgang, nummer, datum,
            seite_start, seite_ende, title, untertitel, text_content,
            ki_abstract, date, text_embeddings, ki_embeddings,
            quelle_id_embeddings, text_length, schlagworte,
        ])

    # Rows are added per file, only after the whole file parsed without error.
    data_collection.extend(data)

# Turn the collected article rows into a DataFrame. The column names are
# listed in the same order as the values of each row in data_collection.
article_columns = [
    'dateiname', 'artikel_id', 'quelle_id', 'jahrgang', 'nummer', 'datum',
    'seite_start', 'seite_ende', 'titel', 'untertitel', 'text',
    'ki_abstract', 'date', 'text_embeddings', 'ki_embeddings',
    'quelle_id_embeddings', 'text_length', 'schlagworte',
]
df = pd.DataFrame(data_collection, columns=article_columns)
print(f"Inserted {len(df)} records into DataFrame")

# Number of articles per quelle_id, largest count first.
quelle_counts = df.groupby('quelle_id').size()
print(quelle_counts.sort_values(ascending=False))


Inserted 0 records into DataFrame
Series([], dtype: int64)


## MongoDB: Import Articles

In [None]:
# Insert the DataFrame rows into MongoDB using insert_many.
data_input = df.to_dict(orient='records')

try:
    if data_input:
        collection.insert_many(data_input)
        print(f"Inserted {len(data_input)} records into MongoDB")
    else:
        # insert_many() rejects an empty document list, which is what an
        # import run without new files produces.
        print("No new records to insert into MongoDB")
finally:
    # Close the connection even when the insert fails.
    mongoClient.close()

## Add embeddings & keywords

In [1]:
import os
from dotenv import load_dotenv

from pymongo import MongoClient
from bson import ObjectId

# Init MongoDB Client
# load_dotenv() loads environment variables from a .env file, if present.
load_dotenv()
mongo_uri = os.environ.get('MONGO_URI_DVV')
if not mongo_uri:
    # Fail fast: MongoClient(None) would not raise but fall back to its
    # default host, so a missing variable would go unnoticed.
    raise RuntimeError("Environment variable MONGO_URI_DVV is not set")
mongoClient = MongoClient(mongo_uri)
database = mongoClient.dvv_content_pool
collection = database.dvv_artikel

In [6]:
# Show up to five articles whose `schlagworte` list is still empty.
# `collection` already refers to dvv_artikel; attribute access on a collection
# (collection.dvv_artikel) would address the sub-collection
# "dvv_artikel.dvv_artikel" instead and therefore match nothing.
cursor = collection.find({'schlagworte': []}).limit(5)
cursor_list = list(cursor)
print(len(cursor_list))
for record in cursor_list:
    print(f"ID: {record['_id']}")

0


In [1]:
import modules.ask_mongo as ask_mongo
# from .autonotebook import tqdm as notebook_tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Run one enrichment step from modules.ask_mongo; the alternative steps are
# kept commented out so they can be switched on by hand.
# NOTE(review): presumably generate_keywords derives keywords from
# `ki_abstract` and stores them in `schlagworte` for at most `max_iterations`
# records - confirm against modules/ask_mongo.
# ask_mongo.generate_embeddings(input_field="text", output_field="text_embeddings", max_iterations=50000)
ask_mongo.generate_keywords(input_field="ki_abstract", output_field="schlagworte", max_iterations=5)
# ask_mongo.generate_entities(input_field="ki_abstract", output_field="schlagworte", max_iterations=10)


## Manage Collection

In [7]:
import modules.ask_mongo as ask_mongo

def safe_int(x: str = "") -> int:
    """Convert ``x`` to an integer, falling back to 0 when that is not possible.

    Args:
        x: Value to convert, for example a stored field value. ``None``,
            empty strings and non-numeric strings are tolerated.

    Returns:
        ``int(x)`` when the conversion succeeds, otherwise ``0``.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are mapped to 0; anything else (for
        # example KeyboardInterrupt) must propagate to the caller.
        return 0

# Normalise the numeric metadata fields of every stored article to integers.
# Values that cannot be converted (e.g. "BLUF") become 0 via safe_int.
int_fields = ("jahrgang", "nummer", "seite_start", "seite_ende")

# Alternative filter that only selects records with a non-numeric `nummer`:
# cursor = ask_mongo.collection.find({"nummer": {"$type": "string", "$not": {"$regex": "^[0-9]+$"}}})

# Fetch only the fields used below (_id is returned by default) instead of
# the complete documents.
cursor = ask_mongo.collection.find(
    {},
    {"dateiname": 1, "quelle_id": 1, **{field: 1 for field in int_fields}},
)
results = list(cursor)

print(f"Found {len(results)} records")

for result in results:
    # Single quotes inside the f-string keep this valid on Python < 3.12.
    print(f"{result['dateiname']} {result['quelle_id']} {result['nummer']}")

    normalized = {field: safe_int(result[field]) for field in int_fields}
    # Only fields that are not already stored as that exact integer need a write.
    changed = {
        field: value
        for field, value in normalized.items()
        if type(result[field]) is not int or result[field] != value
    }
    result.update(normalized)

    if changed:
        # $set only the converted fields rather than rewriting the whole document.
        ask_mongo.collection.update_one({"_id": result["_id"]}, {"$set": changed})



Found 77467 records
7592079-THB-08082018.xml THB 152
7592079-THB-27052019.xml THB 101
7592079-THB-08082018.xml THB 152
7592079-THB-27052019.xml THB 101
7592079-DVZT-03112021.xml DVZT 0
7592079-THB-27052019.xml THB 101
7592079-THB-27052019.xml THB 101
7592079-DVZT-03112021.xml DVZT BLUF
7592079-THB-27052019.xml THB 101
7592079-DVZT-03112021.xml DVZT BLUF
7592079-DVZT-03112021.xml DVZT BLUF
7592079-DVZT-11102023.xml DVZT BSGT
7592079-DVZT-11102023.xml DVZT BSGT
7592079-THB-27052019.xml THB 101
7592079-THB-08082018.xml THB 152
7592079-THB-08082018.xml THB 152
7592079-DVZT-03112021.xml DVZT BLUF
7592079-THB-08082018.xml THB 152
7592079-THB-08082018.xml THB 152
7592079-THB-08082018.xml THB 152
7592079-DVZT-03112021.xml DVZT BLUF
7592079-THB-27052019.xml THB 101
7592079-THB-08082018.xml THB 152
7592079-THB-08082018.xml THB 152
7592079-THB-27052019.xml THB 101
7592079-DVZT-11102023.xml DVZT BSGT
7592079-THB-27052019.xml THB 101
7592079-THB-08082018.xml THB 152
7592079-DVZT-11102023.xml DVZT B