In [1]:
import requests
from bs4 import BeautifulSoup
from src.wikidataEntityDB import WikidataProperty, Session
from tqdm.auto import tqdm

#### Get the list of sorted properties & groups from Wikibase-SortedProperties page

In [2]:
# Download the rendered Wikibase-SortedProperties page. The timeout stops the
# cell from hanging on a stalled connection, and raise_for_status() prevents an
# HTTP error page from being parsed as if it were the property list.
r = requests.get(
    "https://www.wikidata.org/wiki/MediaWiki:Wikibase-SortedProperties",
    timeout=30,
)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

# Locate the container whose direct children are the headings and lists.
# NOTE(review): <meta> is normally a void element, so it may expose no children
# at all; if the check after the loop fails, re-verify this selector against
# the current page markup.
content_div = soup.find('div', class_='mw-content-ltr')
content = content_div.find('meta') if content_div is not None else None
if content is None:
    raise RuntimeError(
        "Unexpected page layout: could not locate the sorted-properties content container"
    )

# Maps "Heading > Sub-heading" titles to the list of property IDs found in the
# lists that follow that heading, in page order.
property_ids = {}

# Current heading path, keyed by heading level (2 for h2, 3 for h3, ...).
# Keys always stay in ascending order, so joining the values yields the path
# from the outermost heading to the innermost one.
hierarchy = {}

# Walk the direct children of the container in document order.
for section in content.children:
    if section.name is None:
        # Bare text nodes (whitespace between elements) carry no structure.
        continue

    if 'mw-heading' in (section.get('class') or []):
        # Heading wrapper: find the actual h2..h6 tag inside it.
        heading_tag = section.find(['h2', 'h3', 'h4', 'h5', 'h6'])
        if heading_tag:
            level = int(heading_tag.name[1])  # h2 -> 2, h3 -> 3, ...
            title_text = heading_tag.get_text(strip=True)

            # Record the new heading, then drop every deeper level: those
            # belonged to the previous branch of the outline.
            hierarchy[level] = title_text
            hierarchy = {k: v for k, v in hierarchy.items() if k <= level}

    elif section.name in ['ul', 'ol']:
        # The property ID is the first whitespace-separated token of each list
        # item; items without any text are skipped instead of raising
        # IndexError.
        pids = []
        for li in section.find_all("li"):
            tokens = li.text.split()
            if tokens:
                pids.append(tokens[0])

        # Several lists may appear under the same heading path; append to the
        # existing entry rather than overwrite it.
        hierarchical_title = " > ".join(hierarchy.values())
        property_ids.setdefault(hierarchical_title, []).extend(pids)

# Fail here with a clear message rather than with a KeyError in a later cell.
if not property_ids:
    raise RuntimeError(
        "No property groups were extracted; the page layout has probably changed"
    )

#### Manually select which property groups should be included in the embeddings

In [None]:
# Manual filtering choice for every property group scraped into `property_ids`.
#
# - Keys must match the keys of `property_ids` exactly: the heading titles of a
#   group joined with " > ", from the outermost heading down to the group's own
#   heading.
# - Values are the strings written to `WikidataProperty.property_filter` by the
#   push cell below; only 'full' and 'remove' are used here.
#   NOTE(review): presumably 'full' keeps the group in the embeddings and
#   'remove' excludes it — confirm against the code that reads property_filter.
# - Entry order matters: the push cell walks this dict in insertion order to
#   number `WikidataProperty.property_sort`, so reordering entries changes the
#   stored sort index.
property_types = {
    'Classification': 'full',
    'Images (1)': 'remove',
    'People (1)': 'full',
    'Names and titles': 'full',
    'Names and titles > Transliterations': 'full',
    'Names and titles > Pronunciation': 'full',
    'People (2)': 'full',
    'Personal details': 'full',
    'Mountains': 'full',
    'Bodies of water': 'full',
    'Biology': 'full',
    'Non-person entities': 'full',
    'Non-person entities > Economy': 'full',
    'Non-person entities > Tourism': 'full',
    'Works of art': 'full',
    'Language': 'full',
    'Events': 'full',
    'Events > Elections': 'full',
    'Events > Space flights': 'full',
    'Science': 'full',
    'Math': 'full',
    'Medicine': 'full',
    'Hardware, engineering': 'full',
    'Software': 'full',
    'Media content ratings': 'full',
    'Constructions': 'full',
    'Banking': 'full',
    'Dimensions': 'full',
    'Addressing': 'full',
    'Web': 'full',
    'Names': 'full',
    'Laws, treaties': 'full',
    'Court cases': 'full',
    'Encyclopedic articles': 'full',
    'Images (2)': 'remove',
    'Other properties by datatype > Other properties with datatype "item"': 'full',
    'Other properties by datatype > Other properties with datatype "Commons media file"': 'remove',
    'Other properties by datatype > Other properties with datatype "geographic coordinates"': 'full',
    'Other properties by datatype > Other properties with datatype "monolingual text"': 'full',
    'Other properties by datatype > Other properties with datatype "point in time"': 'full',
    'Other properties by datatype > Other properties with datatype "quantity"': 'full',
    'Other properties by datatype > Other properties with datatype "string"': 'full',
    'Other properties by datatype > Other properties with datatype "tabular data"': 'remove',
    'Other properties by datatype > Other properties with datatype "url"': 'full',
    'Other properties by datatype > Other properties with datatype "wikibase-lexeme"': 'remove',
    'Other properties by datatype > Other properties with datatype "wikibase-form"': 'remove',
    'Other properties by datatype > Other properties with datatype "math"': 'full',
    'Other properties by datatype > Other properties with datatype "wikibase-sense"': 'remove',
    'Other properties by datatype > Other properties with datatype "musical-notation"': 'remove',
    'Other properties by datatype > Other properties with datatype "wikibase-property"': 'full',
    'Other properties by datatype > Other properties with datatype "geo-shape"': 'remove',
    'Other properties by datatype > Other properties with datatype "entity-schema"': 'remove',
    'IDs with datatype "string"': 'full',
    'Classification (2)': 'full',
    'Commons, categories, templates, lists, portals': 'remove',
    'Properties not used as main statements on items and lexemes > Qualifier only': 'full',
    'Properties not used as main statements on items and lexemes > Qualifier and reference only': 'full',
    'Properties not used as main statements on items and lexemes > Reference only': 'remove',
    'Properties not used as main statements on items and lexemes > Property documentation': 'full',
    'Properties not used as main statements on items and lexemes > Sandbox properties': 'remove',
    'IDs with datatype "external-id" > ISO IDs': 'remove',
    'IDs with datatype "external-id" > ISO IDs > Humans and organisations': 'remove',
    'IDs with datatype "external-id" > ISO IDs > Languoids and scripts': 'remove',
    'IDs with datatype "external-id" > ISO IDs > Organisations and languoids in one ID': 'remove',
    'IDs with datatype "external-id" > ISO IDs > Works': 'remove',
    'IDs with datatype "external-id" > ISO IDs > Other': 'remove',
    'IDs with datatype "external-id" > VIAF-related IDs': 'remove',
    'IDs with datatype "external-id" > VIAF-related IDs > VIAF source IDs ordered by VIAF code': 'remove',
    'IDs with datatype "external-id" > WorldCat IDs': 'remove',
    'IDs with datatype "external-id" > Library classification IDs': 'remove',
    'IDs with datatype "external-id" > Chemistry IDs': 'remove',
    'IDs with datatype "external-id" > Chemistry-related IDs': 'remove',
    'IDs with datatype "external-id" > Chess player IDs': 'remove',
    'IDs with datatype "external-id" > Other IDs with type "external-id" - alphabetical order': 'remove',
    'IDs with datatype "external-id" > Genealogy IDs': 'remove'
 }

#### Push the filtering choices and sorting to the database

In [None]:
# Validate before touching the database: every manually chosen group must
# exist among the scraped headings. Checking up front avoids a KeyError halfway
# through the run, after some groups have already been committed.
missing_groups = [key for key in property_types if key not in property_ids]
if missing_groups:
    raise KeyError(
        f"Groups listed in property_types but not found on the page: {missing_groups}"
    )

# Push Filtering Choice
# One bulk UPDATE per group, each in its own short-lived session/transaction.
for key, property_filter_value in tqdm(property_types.items(), desc="filter"):
    ids = property_ids[key]
    if not ids:
        # Nothing to update for an empty group; skip the empty IN () query.
        continue
    with Session() as session:
        session.query(WikidataProperty).filter(WikidataProperty.id.in_(ids)).update(
            {WikidataProperty.property_filter: property_filter_value},
            synchronize_session=False
        )
        session.commit()

# Push Sorted Index
# `count` numbers the properties 1..N in the order of property_types (groups)
# and, inside each group, in page order. A property listed in several groups
# ends up with the index of its last occurrence.
# This loop opens its own session instead of reusing the variable left over
# from the loop above, which refers to a session whose `with` block has already
# exited and which does not exist at all if property_types is empty.
count = 0
with Session() as session:
    for key in tqdm(property_types, desc="sort"):
        for pid in property_ids[key]:
            count += 1
            session.query(WikidataProperty).filter(WikidataProperty.id == pid).update(
                {WikidataProperty.property_sort: count},
                synchronize_session=False
            )
        # Commit once per group, as before, to keep transactions short.
        session.commit()