In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

from utils.structured_aligner import Aligner


from pymongo import MongoClient
import os
from dotenv import load_dotenv, find_dotenv

In [None]:
def get_head_triplets(entity_id):
    """Fetch the Wikidata statements in which ``entity_id`` is the subject.

    The SPARQL query is sent to the public Wikidata endpoint and keeps only
    direct-claim predicates whose object URI is itself a Wikidata entity.

    Args:
        entity_id: Wikidata id such as ``"Q19837"``; it is interpolated into
            the query as ``wd:<entity_id>``.

    Returns:
        A list of dicts with the keys ``subject``, ``predicate`` and
        ``object`` (labels returned by the label service), ``subj_id`` (the
        given ``entity_id``) and ``obj_id`` (last path segment of the object
        URI).
    """
    endpoint = SPARQLWrapper("https://query.wikidata.org/sparql")
    query = f"""
    SELECT ?subjectLabel ?propertyLabel ?objectLabel ?object   WHERE {{

      SERVICE wikibase:label {{ 
        bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .
      }}
      VALUES (?subject) {{(wd:{entity_id})}}  
      ?subject ?predicate ?object .
      ?property wikibase:directClaim ?predicate.

      FILTER(STRSTARTS(STR(?predicate), "http://www.wikidata.org/prop/direct/")) .
      FILTER(STRSTARTS(STR(?object), "http://www.wikidata.org/entity/")) .

    }}
    """

    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)
    bindings = endpoint.query().convert()["results"]["bindings"]

    def to_triplet(binding):
        # The object comes back as a full entity URI; keep only the trailing id.
        object_id = binding['object']['value'].split("/")[-1]
        return {
            "subject": binding["subjectLabel"]["value"],
            "predicate": binding["propertyLabel"]["value"],
            "object": binding["objectLabel"]["value"],
            "subj_id": entity_id,
            "obj_id": object_id,
        }

    return [to_triplet(binding) for binding in bindings]

# Smoke test: display the outgoing statements of Q19837 as the cell output.
get_head_triplets("Q19837")

In [None]:
def get_tail_triplets(entity_id):
    """Fetch the Wikidata statements in which ``entity_id`` is the object.

    The SPARQL query is sent to the public Wikidata endpoint and keeps only
    direct-claim predicates.

    Args:
        entity_id: Wikidata id such as ``"Q19837"``; it is interpolated into
            the query as ``wd:<entity_id>``.

    Returns:
        A list of dicts with the keys ``subject``, ``predicate`` and
        ``object`` (labels returned by the label service), ``subj_id`` (last
        path segment of the subject URI) and ``obj_id`` (the given
        ``entity_id``).
    """
    client = SPARQLWrapper("https://query.wikidata.org/sparql")
    query = f"""
    SELECT ?subjectLabel ?propertyLabel ?objectLabel ?subject WHERE {{

      SERVICE wikibase:label {{ 
        bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" .
      }}

      VALUES (?object) {{(wd:{entity_id})}}  
      ?subject ?predicate ?object .
      ?property wikibase:directClaim ?predicate.

      FILTER(STRSTARTS(STR(?predicate), "http://www.wikidata.org/prop/direct/")) .
      FILTER(STRSTARTS(STR(?object), "http://www.wikidata.org/entity/")) .

    }}
    """

    client.setQuery(query)
    client.setReturnFormat(JSON)
    rows = client.query().convert()["results"]["bindings"]

    # One dict per binding; the subject URI is reduced to its trailing id.
    return [
        {
            "subject": row["subjectLabel"]["value"],
            "predicate": row["propertyLabel"]["value"],
            "object": row["objectLabel"]["value"],
            "subj_id": row['subject']['value'].split("/")[-1],
            "obj_id": entity_id,
        }
        for row in rows
    ]

# Smoke test: display the incoming statements of Q19837 as the cell output.
get_tail_triplets("Q19837")

In [None]:
# Collect the statements for Q19837. Only the outgoing (head) statements are
# used; the head + tail variant is kept below as a commented-out alternative.
# jobs_triplets = get_head_triplets("Q19837") + get_tail_triplets("Q19837")
jobs_triplets = get_head_triplets("Q19837")
# Identical rows are removed before the frame is displayed.
jobs_df = pd.DataFrame(jobs_triplets).drop_duplicates()
jobs_df

In [None]:
# `db` is otherwise only created by the Mongo setup cell further down, so on a
# fresh kernel (Restart & Run All) this cell failed with a NameError. Open the
# connection here so the cleanup works regardless of execution order.
_ = load_dotenv(find_dotenv())
mongo_client = MongoClient(os.getenv("MONGO_URI"))
db = mongo_client.get_database("wikidata_ontology")

# Remove every document tagged with this sample id from the 'triplets' collection.
coll = db.get_collection('triplets')
coll.delete_many({"sample_id": 'wikidata-triplets'})

In [None]:
# --- Mongo Setup ---
_ = load_dotenv(find_dotenv())
mongo_client = MongoClient(os.getenv("MONGO_URI"))
db = mongo_client.get_database("wikidata_ontology")

# --- Extractor Setup ---
# extractor = LLMTripletExtractor(model='gpt-4.1-mini')
aligner = Aligner(db)

# Build the aligner payload under its own name. Rebinding `jobs_triplets`
# here replaced the 'predicate' key with 'relation', so running this cell a
# second time raised KeyError: 'predicate'. Keeping the source list untouched
# makes the cell safe to re-run.
jobs_aligner_triplets = [
    {
        'subject': triple['subject'],
        'object': triple['object'],
        'relation': triple['predicate'],
        "subject_type": None,
        "object_type": None,
        "sample_id": "wikidata-triplets",
    }
    for triple in jobs_triplets
]
aligner.add_triplets(jobs_aligner_triplets, sample_id='wikidata-triplets')

In [None]:
# Collect the statements for Q312. Only the outgoing (head) statements are
# used; the head + tail variant is kept below as a commented-out alternative.
# apple_triplets = get_head_triplets("Q312") + get_tail_triplets("Q312")
apple_triplets = get_head_triplets("Q312")
# Identical rows are removed before the frame is displayed.
apple_df = pd.DataFrame(apple_triplets).drop_duplicates()
apple_df

In [None]:
# Outgoing statements first, then incoming ones, for Q483382.
woznyak_triplets = [*get_head_triplets("Q483382"), *get_tail_triplets("Q483382")]
woznyak_df = pd.DataFrame(woznyak_triplets)
# Identical rows are removed before the frame is displayed.
woznyak_df = woznyak_df.drop_duplicates()
woznyak_df

In [None]:
# Outgoing statements first, then incoming ones, for Q127552.
pixar_triplets = [*get_head_triplets("Q127552"), *get_tail_triplets("Q127552")]
pixar_df = pd.DataFrame(pixar_triplets)
# Identical rows are removed before the frame is displayed.
pixar_df = pixar_df.drop_duplicates()
pixar_df

In [None]:
# Outgoing statements first, then incoming ones, for Q308993.
next_triplets = [*get_head_triplets("Q308993"), *get_tail_triplets("Q308993")]
next_df = pd.DataFrame(next_triplets)
# Identical rows are removed before the frame is displayed.
next_df = next_df.drop_duplicates()
next_df

In [None]:
# Stack the five per-entity frames into one; each frame keeps its own row
# labels because ignore_index is not requested.
all_df = pd.concat([jobs_df, apple_df, woznyak_df, pixar_df, next_df])
all_df

In [None]:
# Rows repeated across the concatenated frames are removed.
all_df = all_df.drop_duplicates()
all_df

In [None]:
import requests

def wikidata_id2wikipedia_name(ids, batch_size=50, timeout=30):
    """Map Wikidata ids to their English Wikipedia page titles.

    The ids are sent to the ``wbgetentities`` action of the Wikidata API in
    batches, asking only for the ``enwiki`` sitelink of each entity.

    Args:
        ids: iterable of Wikidata ids (e.g. ``["Q312", "Q19837"]``).
        batch_size: number of ids per request. Defaults to 50, the batch size
            this notebook has always used.
        timeout: seconds to wait for each HTTP response before giving up.

    Returns:
        Dict ``{wikidata_id: enwiki_title}``. Entities without an English
        Wikipedia sitelink are omitted.

    Raises:
        ValueError: if ``batch_size`` is not positive.
        requests.HTTPError: if the API answers with an HTTP error status.
        RuntimeError: if the API answers with an ``error`` payload.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    ids = list(ids)  # accept any iterable, not only sliceable sequences
    names = {}

    for start in range(0, len(ids), batch_size):
        id_batch = ids[start:start + batch_size]
        # Let requests build and encode the query string, and never wait forever.
        response = requests.get(
            "https://www.wikidata.org/w/api.php",
            params={
                "action": "wbgetentities",
                "format": "json",
                "props": "sitelinks",
                "ids": "|".join(id_batch),
                "sitefilter": "enwiki",
            },
            timeout=timeout,
        )
        response.raise_for_status()
        payload = response.json()

        # A failed API call carries an "error" object instead of "entities";
        # surface it rather than failing later with an opaque KeyError.
        if "error" in payload:
            raise RuntimeError("Wikidata API error: {}".format(payload["error"]))

        for entity_id, entity in payload.get("entities", {}).items():
            title = entity.get("sitelinks", {}).get("enwiki", {}).get("title")
            if title is not None:
                names[entity_id] = title
    return names

def get_alternative_labels(entity_id):
    """Return the English alternative labels (aliases) of a Wikidata entity.

    Args:
        entity_id: Wikidata id such as ``"Q483382"``; it is interpolated into
            the query as ``wd:<entity_id>``.

    Returns:
        List of the ``skos:altLabel`` values whose language tag is ``en``,
        in the order the endpoint returned them.
    """
    client = SPARQLWrapper("https://query.wikidata.org/sparql")
    query = f"""
    SELECT ?item ?itemAltLabel WHERE {{

    VALUES (?subject) {{(wd:{entity_id})}}  
    ?subject skos:altLabel ?itemAltLabel . FILTER (lang(?itemAltLabel) = "en")
 

    }}
    """

    client.setQuery(query)
    client.setReturnFormat(JSON)
    bindings = client.query().convert()["results"]["bindings"]

    return [binding['itemAltLabel']['value'] for binding in bindings]
# Smoke test: display the English aliases returned for Q483382.
get_alternative_labels("Q483382")

In [None]:
# Every distinct Wikidata id that appears on either side of a triplet.
ids = list({*all_df['subj_id'], *all_df['obj_id']})
# Resolve the ids to English Wikipedia titles and display the mapping.
wikidata_id2wiki_mapping = wikidata_id2wikipedia_name(ids)
wikidata_id2wiki_mapping

In [None]:
import time

# Fetch the English aliases of every id, one SPARQL request per id, pausing
# 0.3 s between requests (presumably to stay polite to the public endpoint).
wikidata_id2alternative_name = {}
for id_ in ids:
    wikidata_id2alternative_name[id_] = get_alternative_labels(id_)
    time.sleep(0.3)

In [None]:
# Number of ids for which an alias list was stored.
len(wikidata_id2alternative_name)

In [None]:
# Display the full id -> alias-list mapping.
wikidata_id2alternative_name

In [None]:
# Attach the English Wikipedia title of each side of the triplet; ids that are
# missing from the mapping yield None.
all_df['wiki_subj'] = all_df['subj_id'].apply(wikidata_id2wiki_mapping.get)
all_df['wiki_obj'] = all_df['obj_id'].apply(wikidata_id2wiki_mapping.get)

all_df

In [None]:
# Drop every row that has a missing value in any column.
all_df = all_df.dropna()
all_df

In [None]:
# Persist the Wikidata triplets (row labels included) to the working directory.
all_df.to_csv("wikidata_df.csv")

In [None]:
# Load the triplets from full_triplets.csv, using the first CSV column as the
# index, then replace that index with a fresh 0..n-1 range.
df = pd.read_csv('full_triplets.csv', index_col=0).reset_index(drop=True)
# df = df.drop_duplicates()
df

In [None]:
# Distinct subject and object names found in `df`.
wikipedia_subjects = list(df['subject'].unique())
wikipedia_objects = list(df['object'].unique())
# An entity is any name that occurs as a subject or as an object.
wikipedia_entities = list({*wikipedia_subjects, *wikipedia_objects})

wikipedia_relations = list(df['relation'].unique())
len(wikipedia_entities), len(wikipedia_relations)

In [None]:
# Rewrite each Wikidata triplet so that its subject / object use the spelling
# found in `wikipedia_entities` whenever one of the known names of the entity
# (aliases, Wikipedia title, Wikidata label - tried in that order) occurs there.
# Membership is tested against a set: the original list lookup scanned the
# whole entity list for every candidate name of every row.
known_wikipedia_entities = set(wikipedia_entities)

aligned_triplets = []
for _, row in all_df.iterrows():
    subject_alternatives = wikidata_id2alternative_name[row['subj_id']] + [row['wiki_subj'], row['subject']]
    object_alternatives = wikidata_id2alternative_name[row['obj_id']] + [row['wiki_obj'], row['object']]

    aligned_triplets.append({
        # First candidate known on the Wikipedia side, else the Wikidata label.
        "subject": next(
            (name for name in subject_alternatives if name in known_wikipedia_entities),
            row["subject"],
        ),
        "relation": row["predicate"],
        "object": next(
            (name for name in object_alternatives if name in known_wikipedia_entities),
            row["object"],
        ),
    })

aligned_df = pd.DataFrame(aligned_triplets)
aligned_df

In [None]:
# Distinct subject and object names found in `aligned_df`.
wikidata_subjects = list(aligned_df['subject'].unique())
wikidata_objects = list(aligned_df['object'].unique())
# An entity is any name that occurs as a subject or as an object.
wikidata_entities = list({*wikidata_subjects, *wikidata_objects})

wikidata_relations = list(aligned_df['relation'].unique())
len(wikidata_entities), len(wikidata_relations)

## Comparing with the composed KG

In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel

In [None]:
# Keep the originally configured GPU when the host actually has it, but fall
# back to the default GPU or the CPU so the notebook also runs on machines
# with fewer than six GPUs (a hard-coded "cuda:5" fails there).
if torch.cuda.is_available() and torch.cuda.device_count() > 5:
    device = "cuda:5"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained('facebook/contriever')
model = AutoModel.from_pretrained('facebook/contriever').to(device)

In [None]:
def mean_pooling(token_embeddings, mask):
    """Average token embeddings over the positions selected by ``mask``.

    Args:
        token_embeddings: tensor indexed as (batch, token, feature).
        mask: tensor indexed as (batch, token); non-zero marks a token that
            takes part in the average.

    Returns:
        Tensor indexed as (batch, feature) holding the masked mean.
    """
    # Zero out masked positions so they do not contribute to the sum.
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
    sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
    return sentence_embeddings


def embed_batch(names, batch_size=None):
    """Embed a list of strings with the notebook-level ``tokenizer``/``model``.

    Args:
        names: list of strings to embed.
        batch_size: optional number of strings per forward pass. ``None``
            (the default) embeds everything in a single pass, as before;
            pass a number to bound memory use on long lists.

    Returns:
        ``numpy.ndarray`` with one row per input string.
    """
    names = list(names)
    if batch_size is None:
        batch_size = max(len(names), 1)

    chunks = []
    # Inference only: without no_grad every forward pass also records an
    # autograd graph, which wastes memory for no benefit.
    with torch.no_grad():
        for start in range(0, max(len(names), 1), batch_size):
            inputs = tokenizer(
                names[start:start + batch_size],
                padding=True, truncation=True, return_tensors='pt',
            )
            outputs = model(**inputs.to(device))
            embeddings = mean_pooling(outputs[0], inputs['attention_mask'])
            chunks.append(embeddings.detach().cpu().numpy())
    return np.concatenate(chunks, axis=0)

In [None]:
# Smoke test: embed two names and show the shape of the returned array.
embed_batch(["Steve Jobs", "Stephen Woznyak"]).shape

In [None]:
# NOTE(review): the same frame was already written to this path by an earlier
# cell; this call overwrites the file again and looks redundant.
all_df.to_csv("wikidata_df.csv")

In [None]:
# Number of entity names that occur verbatim on both sides.
len(set(wikidata_entities) & set(wikipedia_entities))

In [None]:
# Embed every Wikipedia-side entity name (one row per name).
wikipedia_entities_embedded = embed_batch(wikipedia_entities)
wikipedia_entities_embedded.shape

In [None]:
# Embed every Wikipedia-side relation name (one row per name).
wikipedia_relations_embedded = embed_batch(wikipedia_relations)
wikipedia_relations_embedded.shape

In [None]:
# Embed every Wikidata-side entity name (one row per name).
wikidata_entities_embedded = embed_batch(wikidata_entities)
wikidata_entities_embedded.shape

In [None]:
# Embed every Wikidata-side relation name (one row per name).
wikidata_relations_embedded = embed_batch(wikidata_relations)
wikidata_relations_embedded.shape

In [None]:
from sklearn import metrics

# Pairwise cosine similarity between relation embeddings: rows follow
# `wikidata_relations`, columns follow `wikipedia_relations`.
relation_similarity_matrix = metrics.pairwise.cosine_similarity(wikidata_relations_embedded, wikipedia_relations_embedded)
relation_similarity_matrix.shape

In [None]:
# Pairwise cosine similarity between entity embeddings: rows follow
# `wikidata_entities`, columns follow `wikipedia_entities`.
entity_similarity_matrix = metrics.pairwise.cosine_similarity(wikidata_entities_embedded, wikipedia_entities_embedded)
entity_similarity_matrix.shape

In [None]:
# For every column (Wikipedia entity), the row index of the most similar
# Wikidata entity.
best_entity_pairs = np.argmax(entity_similarity_matrix, axis=0)
len(best_entity_pairs)

In [None]:
# For every column (Wikipedia entity), the row index of the most similar
# Wikidata entity.
best_entity_pairs = np.argmax(entity_similarity_matrix, axis=0)

# Keep a Wikipedia -> Wikidata entity link only when the best match scores
# above 0.5.
pedia2data_entity = {
    pedia_name: wikidata_entities[best_row]
    for col, (pedia_name, best_row) in enumerate(zip(wikipedia_entities, best_entity_pairs))
    if entity_similarity_matrix[best_row][col] > 0.5
}
pedia2data_entity

In [None]:
# Number of Wikipedia entities that received a Wikidata match.
len(pedia2data_entity)

In [None]:
# For every column (Wikipedia relation), the row index of the most similar
# Wikidata relation.
best_relation_pairs = np.argmax(relation_similarity_matrix, axis=0)

# Keep a Wikipedia -> Wikidata relation link only when the best match scores
# above 0.5.
pedia2data_relation = {
    pedia_name: wikidata_relations[best_row]
    for col, (pedia_name, best_row) in enumerate(zip(wikipedia_relations, best_relation_pairs))
    if relation_similarity_matrix[best_row][col] > 0.5
}
pedia2data_relation

In [None]:
# Number of Wikipedia relations that received a Wikidata match.
len(pedia2data_relation)

In [None]:
# Pair each Wikipedia triplet with a Wikidata triplet whose subject, relation
# and object are exactly the mapped counterparts of its three elements.
triplet_pairs = []

for i, row in df.iterrows():
    data_subj = pedia2data_entity.get(row['subject'])
    data_obj = pedia2data_entity.get(row['object'])
    data_rel = pedia2data_relation.get(row['relation'])

    # All three elements need a (truthy) Wikidata counterpart.
    if not (data_subj and data_obj and data_rel):
        continue

    same_subject = aligned_df['subject'] == data_subj
    same_object = aligned_df['object'] == data_obj
    same_relation = aligned_df['relation'] == data_rel
    triplet = aligned_df[same_subject & same_object & same_relation]
    if len(triplet) > 0:
        # Only the first matching Wikidata row is recorded.
        wikipedia_side = (row['subject'], row['relation'], row['object'])
        wikidata_side = (triplet.iloc[0, 0], triplet.iloc[0, 1], triplet.iloc[0, 2])
        triplet_pairs.append((wikipedia_side, wikidata_side))

In [None]:
# Show how many pairs were found, followed by the pairs themselves.
len(triplet_pairs), triplet_pairs

In [None]:
# Render every triplet as a single "subject relation object" string so that
# whole triplets can be compared (and embedded) as text.
wikidata_triplets = list(aligned_df['subject'] +  " " + aligned_df['relation'] + " " + aligned_df['object'])
wikipedia_triplets =  list(df['subject'] +  " " + df['relation'] + " " + df['object'])

In [None]:
# Preview the first ten Wikidata triplet strings.
wikidata_triplets[:10]

In [None]:
# Preview the first ten Wikipedia triplet strings.
wikipedia_triplets[:10]

In [None]:
# Triplet strings that are identical on both sides, and how many there are.
set(wikipedia_triplets) & set(wikidata_triplets), len(set(wikipedia_triplets) & set(wikidata_triplets))

In [None]:
# Embed every Wikidata triplet string (one row per triplet).
wikidata_triplets_embedded = embed_batch(wikidata_triplets)
wikidata_triplets_embedded.shape

In [None]:
# Embed every Wikipedia triplet string (one row per triplet).
wikipedia_triplets_embedded = embed_batch(wikipedia_triplets)
wikipedia_triplets_embedded.shape

In [None]:
# Pairwise cosine similarity between triplet embeddings: rows follow
# `wikidata_triplets`, columns follow `wikipedia_triplets`.
triplet_similarity_matrix = metrics.pairwise.cosine_similarity(wikidata_triplets_embedded, wikipedia_triplets_embedded)
triplet_similarity_matrix.shape

In [None]:
# For every column (Wikipedia triplet), the row index of the most similar
# Wikidata triplet.
best_triplet_pairs = np.argmax(triplet_similarity_matrix, axis=0)
best_triplet_pairs.shape

In [None]:
# Pair each Wikipedia triplet string with its most similar Wikidata triplet
# string, keeping the pair only when the similarity is above 0.5.
triplet_pairs = [
    (pedia_triplet, wikidata_triplets[best_row])
    for col, (pedia_triplet, best_row) in enumerate(zip(wikipedia_triplets, best_triplet_pairs))
    if triplet_similarity_matrix[best_row][col] > 0.5
]

In [None]:
# Number of triplet pairs currently stored in `triplet_pairs`.
len(triplet_pairs)

In [None]:
# Display the aligned Wikidata triplets.
aligned_df

In [None]:
# NOTE(review): same display as the previous cell; looks like a leftover.
aligned_df

## Comparing with linked names from Wikipedia

In [None]:
# Count how many distinct (unordered) entity pairs connected in the Wikidata
# graph are also connected, in either direction, in the Wikipedia graph `df`.
pairs = []
pair_count = 0      # entity pairs that also have at least one edge in `df`
edge_present = 0    # distinct entity pairs whose two names are both known in `df`

counted_pairs = set()

wikipedia_common_triplets = []
wikidata_common_triplets = []

# Set membership instead of scanning the entity list twice per row.
known_wikipedia_entities = set(wikipedia_entities)

for _, row in aligned_df.iterrows():

    subj_name = row['subject']
    obj_name = row['object']

    # Only triplets whose two endpoints both exist on the Wikipedia side count.
    if subj_name not in known_wikipedia_entities or obj_name not in known_wikipedia_entities:
        continue

    wikidata_common_triplets.append((row['subject'], row['relation'], row['object']))

    # Each unordered pair is evaluated once, whatever the edge direction.
    if (subj_name, obj_name) in counted_pairs or (obj_name, subj_name) in counted_pairs:
        continue

    edge_present += 1
    counted_pairs.add((subj_name, obj_name))

    forward = (df['subject'] == subj_name) & (df['object'] == obj_name)
    backward = (df['subject'] == obj_name) & (df['object'] == subj_name)
    intersected_triplets = df[forward | backward]

    if len(intersected_triplets) > 0:
        pair_count += 1
        wikipedia_common_triplets.extend(
            intersected_triplets[['subject', 'relation', 'object']].itertuples(index=False, name=None)
        )

# Guard the ratio: with no overlapping pair at all the division used to raise
# ZeroDivisionError.
pair_count, (pair_count / edge_present if edge_present else 0.0), edge_present

In [None]:
# Spot check: is the name "Up" among the Wikipedia-side entities?
"Up" in wikipedia_entities

In [None]:
# Spot check: rows of `df` whose subject is exactly 'Up'.
df[df['subject'] == 'Up']

In [None]:
# Print the two lists side by side.
# NOTE(review): zip pairs items purely by position, but the two lists are
# filled independently in the cell above (one Wikidata triplet per matching
# row vs. one Wikipedia triplet per intersecting `df` row). Position i of one
# list therefore need not describe the same entity pair as position i of the
# other, and zip stops at the shorter list - confirm this is intended.
for pedia_triplet, data_triplet in zip(wikipedia_common_triplets, wikidata_common_triplets):
    print(pedia_triplet, " | ", data_triplet)

## Neo4j

In [None]:
from neo4j import GraphDatabase

# Connection settings can be overridden through environment variables so that
# real credentials never have to be written into the notebook; the defaults
# keep the previous local-development values.
uri = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
username = os.getenv("NEO4J_USERNAME", "neo4j")
password = os.getenv("NEO4J_PASSWORD", "12345678")
driver = GraphDatabase.driver(uri, auth=(username, password))

In [None]:
# def add_node(tx, node_name):
#     tx.run("CREATE (n:WikidataNode {name: $node_name})", node_name=node_name)

# def add_relation(tx, head, tail, relation):
#     query = f"""
#         MATCH (a {{name: $head}}), (b {{name: $tail}})
#         CREATE (a)-[r:{relation}]->(b)
#         RETURN type(r)
#         """
#     result = tx.run(query, head=head, tail=tail, database_='abc')

# def get_node(tx, name):
#     result = tx.run("MATCH (n:WikidataNode {name: $name}) RETURN n.name AS name", name=name)
#     return [record["name"] for record in result]


# for i, row in all_df.iterrows():
#     head = row['subject']
#     tail = row['object']
#     relation = "_".join(row['predicate'].replace("-", "").replace("/", "").replace("'", "").replace(",", "").replace(".", "").split())
#     # print(head, tail, relation)
#     with driver.session() as session:
#         if not session.read_transaction(get_node, head):
#             session.write_transaction(add_node, head)

#         if not session.read_transaction(get_node, tail):
#             session.write_transaction(add_node, tail)
            
#         session.write_transaction(add_relation, head, tail, relation)

In [None]:
def add_node(tx, node_name):
    """Create a ``WikidataNode`` whose ``name`` property is ``node_name``.

    Args:
        tx: transaction-like object exposing ``run(query, **parameters)``.
        node_name: value bound to the ``$node_name`` query parameter.
    """
    cypher = "CREATE (n:WikidataNode {name: $node_name})"
    tx.run(cypher, node_name=node_name)

def add_relation(tx, head, tail, relation):
    """Create a ``relation`` edge from the ``head`` node to the ``tail`` node.

    Args:
        tx: transaction-like object exposing ``run(query, **parameters)``.
        head: ``name`` of the source ``WikidataNode``.
        tail: ``name`` of the target ``WikidataNode``.
        relation: relationship type. Cypher cannot take it as a parameter,
            so it is embedded in the query text, backtick-quoted.

    Raises:
        ValueError: if ``relation`` is empty.
    """
    if not relation:
        raise ValueError("relation must be a non-empty string")

    # Backtick-quote the type (doubling embedded backticks) so labels that
    # contain characters such as parentheses or a leading digit still form a
    # valid query.
    quoted_relation = "`{}`".format(relation.replace("`", "``"))

    # Restrict both endpoints to WikidataNode: an unlabelled MATCH also picks
    # up same-named nodes carrying any other label.
    query = f"""
        MATCH (a:WikidataNode {{name: $head}}), (b:WikidataNode {{name: $tail}})
        CREATE (a)-[r:{quoted_relation}]->(b)
        RETURN type(r)
        """
    # Keyword arguments of tx.run are query parameters, so the former
    # `database_='abc'` only added an unused parameter and was dropped.
    tx.run(query, head=head, tail=tail)

def get_node(tx, name):
    """Return the ``name`` of every ``WikidataNode`` whose name equals ``name``.

    Args:
        tx: transaction-like object exposing ``run(query, **parameters)``.
        name: value bound to the ``$name`` query parameter.

    Returns:
        List with one entry per matching record; empty when nothing matches.
    """
    records = tx.run(
        "MATCH (n:WikidataNode {name: $name}) RETURN n.name AS name",
        name=name,
    )
    found = []
    for record in records:
        found.append(record["name"])
    return found


# Characters removed from a predicate label before it becomes a relationship
# type; the remaining words are then joined with underscores.
_RELATION_STRIP = str.maketrans("", "", "-/',.")

# One session for the whole load instead of a new session per triplet, and
# execute_read / execute_write instead of the deprecated
# read_transaction / write_transaction (the cleanup cell already uses
# execute_write).
with driver.session() as session:
    for head, predicate, tail in wikidata_common_triplets:
        relation = "_".join(predicate.translate(_RELATION_STRIP).split())

        # Create each endpoint only if no node with that name exists yet.
        if not session.execute_read(get_node, head):
            session.execute_write(add_node, head)

        if not session.execute_read(get_node, tail):
            session.execute_write(add_node, tail)

        session.execute_write(add_relation, head, tail, relation)

In [None]:
def add_node(tx, node_name):
    """Create a ``WikipediaNode`` whose ``name`` property is ``node_name``.

    Args:
        tx: transaction-like object exposing ``run(query, **parameters)``.
        node_name: value bound to the ``$node_name`` query parameter.
    """
    cypher = "CREATE (n:WikipediaNode {name: $node_name})"
    tx.run(cypher, node_name=node_name)

def add_relation(tx, head, tail, relation):
    """Create a ``relation`` edge from the ``head`` node to the ``tail`` node.

    Args:
        tx: transaction-like object exposing ``run(query, **parameters)``.
        head: ``name`` of the source ``WikipediaNode``.
        tail: ``name`` of the target ``WikipediaNode``.
        relation: relationship type. Cypher cannot take it as a parameter,
            so it is embedded in the query text, backtick-quoted.

    Raises:
        ValueError: if ``relation`` is empty.
    """
    if not relation:
        raise ValueError("relation must be a non-empty string")

    # Backtick-quote the type (doubling embedded backticks) so labels that
    # contain characters such as parentheses or a leading digit still form a
    # valid query.
    quoted_relation = "`{}`".format(relation.replace("`", "``"))

    query = f"""
        MATCH (a:WikipediaNode {{name: $head}}), (b:WikipediaNode {{name: $tail}})
        CREATE (a)-[r:{quoted_relation}]->(b)
        RETURN type(r)
        """
    # Keyword arguments of tx.run are query parameters, so the former
    # `database_='abc'` only added an unused parameter and was dropped.
    tx.run(query, head=head, tail=tail)

def get_node(tx, name):
    """Return the ``name`` of every ``WikipediaNode`` whose name equals ``name``.

    Args:
        tx: transaction-like object exposing ``run(query, **parameters)``.
        name: value bound to the ``$name`` query parameter.

    Returns:
        List with one entry per matching record; empty when nothing matches.
    """
    records = tx.run(
        "MATCH (n:WikipediaNode {name: $name}) RETURN n.name AS name",
        name=name,
    )
    found = []
    for record in records:
        found.append(record["name"])
    return found


# Characters removed from a relation name before it becomes a relationship
# type; the remaining words are then joined with underscores.
_RELATION_STRIP = str.maketrans("", "", "-/',.")

# One session for the whole load instead of a new session per triplet, and
# execute_read / execute_write instead of the deprecated
# read_transaction / write_transaction (the cleanup cell already uses
# execute_write).
with driver.session() as session:
    for head, predicate, tail in wikipedia_common_triplets:
        relation = "_".join(predicate.translate(_RELATION_STRIP).split())

        # Create each endpoint only if no node with that name exists yet.
        if not session.execute_read(get_node, head):
            session.execute_write(add_node, head)

        if not session.execute_read(get_node, tail):
            session.execute_write(add_node, tail)

        session.execute_write(add_relation, head, tail, relation)

In [None]:
def delete_all(tx):
    """Delete every node matched by ``MATCH (n)`` together with its relationships.

    Args:
        tx: transaction object whose ``run`` method executes the query.
    """
    tx.run("MATCH (n) DETACH DELETE n")

# WARNING: the query is not restricted to the WikidataNode / WikipediaNode
# labels, so running this cell removes all nodes the MATCH can see.
with driver.session() as session:
    session.execute_write(delete_all)