In [6]:
from pydantic_ai import Agent
import nest_asyncio
from pathlib import Path
from dotenv import load_dotenv
import chromadb
import pandas as pd
import re
import openpyxl
from src.utilities.constants import USER_QUERY_COLLECTION_NAME, SPARQL_QUERY_COLLECTION_NAME
import json
nest_asyncio.apply()
load_dotenv()

True

In [4]:
def update_collection(create_collections: bool) -> str:
    """Load question/SPARQL pairs from ``querySparql.xlsx`` into two Chroma collections.

    The sheet ``Foglio1`` of ``querySparql.xlsx`` (looked up in the current working
    directory) is read with its two columns named ``question`` and ``sparql_query``.
    Newlines and tabs in both columns are replaced by single spaces, and each row is
    stored under the id ``ID_<row index>`` in both collections, so the same id links
    a question to its SPARQL query.

    Args:
        create_collections: When true, the two collections are created if they do
            not exist yet (existing ones are reused, so re-running the cell does not
            fail). When false, both collections are only looked up.

    Returns:
        A status string that includes the record counts of both collections.
    """
    file_path = Path.cwd() / "querySparql.xlsx"
    df = pd.read_excel(file_path, sheet_name="Foglio1", names=["question", "sparql_query"])

    # One pass over the text instead of separate "\n" and "\t" substitutions.
    line_breaks_and_tabs = re.compile(r"[\n\t]")

    ids = []
    user_queries = []
    sparql_queries = []
    for row in df.itertuples():
        # itertuples() yields (index, question, sparql_query).
        ids.append(f"ID_{row[0]}")
        user_queries.append(line_breaks_and_tabs.sub(" ", row[1]))
        sparql_queries.append(line_breaks_and_tabs.sub(" ", row[2]))

    client = chromadb.HttpClient(host='localhost', port=8000)
    if create_collections:
        # get_or_create keeps the call idempotent: create_collection fails when the
        # collection is already there, which broke re-running this cell.
        u_query_collection = client.get_or_create_collection(name=USER_QUERY_COLLECTION_NAME)
        s_query_collection = client.get_or_create_collection(name=SPARQL_QUERY_COLLECTION_NAME)
    else:
        u_query_collection = client.get_collection(name=USER_QUERY_COLLECTION_NAME)
        s_query_collection = client.get_collection(name=SPARQL_QUERY_COLLECTION_NAME)

    # Send the records in batches rather than two requests per spreadsheet row.
    # With no rows the range is empty, so nothing is sent.
    batch_size = 500
    for start in range(0, len(ids), batch_size):
        stop = start + batch_size
        u_query_collection.add(documents=user_queries[start:stop], ids=ids[start:stop])
        s_query_collection.add(documents=sparql_queries[start:stop], ids=ids[start:stop])

    return f"Collections updated\nCollections size: {u_query_collection.count(), s_query_collection.count()}"

# Load the spreadsheet rows into both collections. With create_collections=False
# the function does not create the collections; it only looks them up by name.
update_collection(create_collections=False)

'Collections updated\nCollections size: (90, 90)'

In [8]:
def decorator(func):
    """Wrap ``func`` so that its positional arguments are printed on every call.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that prints ``args = (<positional args>)`` and then calls ``func``
        with the same positional and keyword arguments, returning its result.
    """
    # Function-scope import so this cell does not depend on another cell's imports.
    from functools import wraps

    @wraps(func)  # keep the wrapped function's __name__ and __doc__
    def wrapper(*args, **kwargs):
        print(f"args = {args}")
        # Forward the call; without this the decorated function never runs.
        return func(*args, **kwargs)
    return wrapper


@decorator
def test(query):
    """Empty placeholder used to try out ``decorator``; the body does nothing."""

In [2]:
import asyncio
from SPARQLWrapper import SPARQLWrapper2, JSON
import requests
import json

In [3]:
# Ask the SPARQL endpoint for the creators of five powla:Document resources (JSON).
# NOTE(review): the query declares no PREFIX for powla:/dcterms: — presumably the
# endpoint predefines them; confirm before pointing this at another endpoint.
query = "select ?author where {?doc a powla:Document; dcterms:creator ?author} limit 5"
params = {"query": query, "format": "json"}
endpoint = "https://lila-erc.eu/sparql/lila_knowledge_base/sparql"
# requests has no default timeout, so an unresponsive server would block the kernel.
response = requests.get(endpoint, params=params, timeout=30)
# Fail on an HTTP error status instead of trying to JSON-decode an error page.
response.raise_for_status()
res = response.json()


In [None]:
res

{'head': {'link': [], 'vars': ['author']},
 'results': {'distinct': False,
  'ordered': True,
  'bindings': [{'author': {'type': 'uri',
     'value': 'http://lila-erc.eu/data/corpora/id/authors/33616ad69195643c3e0ea58463e5029a'}},
   {'author': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q1067'}},
   {'author': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q549041'}},
   {'author': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q7198'}},
   {'author': {'type': 'literal', 'value': 'Gaius'}}]}}

In [23]:
import re

# Matches values that start with an http(s)://www.wikidata address.
pattern = re.compile(r"https?://www\.wikidata")

# Read the values straight from the parsed SPARQL JSON instead of regex-scanning
# str(res): the text form depends on how Python quotes each string, so a value
# containing an apostrophe would make a quote-terminated match run past its end.
wiki_uris = [
    term["value"]
    for binding in res.get("results", {}).get("bindings", [])
    for term in binding.values()
    if isinstance(term.get("value"), str) and pattern.match(term["value"])
]
# The values carry no surrounding quotes, so no further clean-up is needed.
clean_wiki_uris = list(wiki_uris)


In [38]:
# Fetch one author resource and the Egloghe corpus page.
# timeout: requests waits forever by default; raise_for_status: stop here on an
# HTTP error rather than parsing an error page in the following cells.
dante = requests.get("http://lila-erc.eu/data/corpora/id/authors/33616ad69195643c3e0ea58463e5029a", timeout=30)
dante.raise_for_status()
egloghe = requests.get("https://lila-erc.eu/data/corpora/UDante/id/corpus/Egloghe", timeout=30)
egloghe.raise_for_status()

In [32]:
from bs4 import BeautifulSoup

# Parse the body of the `dante` response with the built-in HTML parser.
content = BeautifulSoup(markup=dante.content, features="html.parser")

In [35]:

# Parse the `egloghe` response, then read the text of the first <span> found
# inside the first <h1>, with surrounding whitespace stripped.
html_content = BeautifulSoup(markup=egloghe.content, features="html.parser")
title = html_content.find(name="h1")
span_element = title.find(name="span")
heading_txt = span_element.get_text(strip=True)

In [36]:
heading_txt

'Egloghe'

In [21]:
import re

# NOTE(review): this cell reads `lab` and `pref` from every binding in `res`, so
# `res` must come from a query that selects those variables — confirm which cell
# produces it before running on a fresh kernel.

# Maps each label (hyphens removed) to "prefix:<n>", where <n> is the first run of
# digits in the matching URI. The variable name is kept for other cells.
suffixes = {}
for r in res["results"]["bindings"]:
    lab = r["lab"]["value"]
    uri = r["pref"]["value"]
    label = lab.replace("-", "")
    number = re.findall(r'\d+', uri)
    if not number:
        raise ValueError(f"no numeric id found in URI: {uri!r}")
    # Build the value directly. Substituting with the URI itself as the regex
    # pattern misbehaves when the URI contains characters such as '?', '+' or '('.
    prefix = f"prefix:{number[0]}"
    suffixes[label] = prefix

with open("prefixes.json", "w") as f:
    json.dump(suffixes, f, indent=4)



In [23]:
# Reload the JSON file from disk into `prefixes`.
with open("prefixes.json", "r") as f:
    prefixes = json.loads(f.read())

# Show the entry stored under the key "re".
prefixes.get("re")

'prefix:3'

In [10]:
prefixes

[{'a(b)': 'prefix:1'},
 {'in (entering)': 'prefix:2'},
 {'re': 'prefix:3'},
 {'sub': 'prefix:4'},
 {'ad': 'prefix:5'},
 {'con': 'prefix:6'},
 {'e(x)': 'prefix:7'},
 {'ob': 'prefix:8'},
 {'dis': 'prefix:9'},
 {'am(b)(i)': 'prefix:10'},
 {'bi': 'prefix:11'},
 {'ante': 'prefix:12'},
 {'prae': 'prefix:13'},
 {'per': 'prefix:14'},
 {'por': 'prefix:15'},
 {'pro': 'prefix:16'},
 {'archi': 'prefix:17'},
 {'circum': 'prefix:18'},
 {'se/sed/so': 'prefix:19'},
 {'in (negation)': 'prefix:20'},
 {'de': 'prefix:21'},
 {'contra': 'prefix:22'},
 {'ec': 'prefix:23'},
 {'extra': 'prefix:24'},
 {'indu/endo/indo': 'prefix:25'},
 {'inter': 'prefix:26'},
 {'tra(ns)': 'prefix:27'},
 {'intro': 'prefix:28'},
 {'multi': 'prefix:29'},
 {'ne': 'prefix:30'},
 {'post': 'prefix:31'},
 {'praeter': 'prefix:32'},
 {'pseudo': 'prefix:33'},
 {'quadri': 'prefix:34'},
 {'tri': 'prefix:35'},
 {'semi': 'prefix:36'},
 {'subter': 'prefix:37'},
 {'super': 'prefix:38'},
 {'ue': 'prefix:39'},
 {'retro': 'prefix:42'}]