Executed query:
```sparql
SELECT ?item ?itemLabel ?itemDescription ?subject ?subjectLabel WHERE {
  VALUES ?field { wd:Q2267705 wd:Q11862829 }
  
  ?subject (wdt:P31|wdt:P279) ?field.
  ?item (wdt:P361|wdt:P2579) ?subject.
  ?subject rdfs:label ?subjectLabel.
  FILTER (LANG(?subjectLabel) = "en" && !REGEX(?subjectLabel, "^Q[0-9]+$"))
  
  OPTIONAL { ?item schema:description ?itemDescription. FILTER(LANG(?itemDescription) = "en") }
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
```

In [43]:
import requests

url = 'https://query.wikidata.org/sparql'
query = '''
SELECT ?item ?itemLabel ?itemDescription (GROUP_CONCAT(DISTINCT ?subjectLabel; SEPARATOR=", ") AS ?subjectLabels) (GROUP_CONCAT(DISTINCT ?subitemLabel; SEPARATOR=", ") AS ?subitemLabels) WHERE {
  VALUES ?subject { wd:Q131476 wd:Q21198 wd:Q11205 wd:Q4809258}  # Graph theory, Computer science, Arithmetics
  
  #?subject (wdt:P31|wdt:P279) ?field.
  ?item (wdt:P361|wdt:P2579|wdt:P279) ?subject.
  
  ?item schema:description ?itemDescription.
  FILTER(LANG(?itemDescription) = "en")
  
  ?subject rdfs:label ?subjectLabel.
  FILTER (LANG(?subjectLabel) = "en" && !REGEX(?subjectLabel, "^Q[0-9]+$"))
  
  OPTIONAL { ?subitem wdt:P279 ?item.
             ?subitem rdfs:label ?subitemLabel.
             FILTER(LANG(?subitemLabel) = "en") }
  
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
} GROUP BY ?item ?itemLabel ?itemDescription
'''
r = requests.get(url, params = {'format': 'json', 'query': query})
print(r)
data = r.json()

<Response [200]>


Obtaining a list of all academic disciplines with their parent disciplines.

In [28]:
import requests
import pandas as pd

url = 'https://query.wikidata.org/sparql'

disciplines_query = '''SELECT ?discipline ?disciplineLabel ?disciplineDescription 
       (GROUP_CONCAT(DISTINCT ?parentLabel; separator=";") AS ?parentDisciplines) 
WHERE {
  ?discipline wdt:P31 wd:Q11862829.
  ?discipline rdfs:label ?disciplineLabel.
  
  # Ensure the discipline has at least one "studied by" (P2579) incoming relationship
  FILTER EXISTS { ?concept (wdt:P366|wdt:P31|wdt:P2579) ?discipline. }
  
  # Get discipline description
  OPTIONAL {
    ?discipline schema:description ?disciplineDescription.
    FILTER(LANG(?disciplineDescription) = "en")
  }
  
  # Find parent disciplines (P361 - "part of", P1269 - "facet of")
  OPTIONAL {
    ?discipline (wdt:P361|wdt:P1269) ?parentDiscipline.
    ?parentDiscipline wdt:P31 wd:Q11862829.  # Ensure parent is also an academic discipline
    ?parentDiscipline rdfs:label ?parentLabel.
    FILTER(LANG(?parentLabel) = "en")
  }
  
  # Ensure the discipline label is in English
  FILTER(LANG(?disciplineLabel) = "en")
  FILTER(!REGEX(STR(?disciplineLabel), "^Q[0-9]+$")) # Exclude labels that look like QIDs
} 
GROUP BY ?discipline ?disciplineLabel ?disciplineDescription
ORDER BY ?disciplineLabel
'''
r = requests.get(url, params = {'format': 'json', 'query': disciplines_query})
data = r.json()
df = pd.DataFrame(data['results']['bindings'])
for column_name in df.columns:
	df[column_name] = df[column_name].apply(lambda x: x['value'] if isinstance(x, dict) else '')
df.to_csv("wikidata_disciplines.csv")

In [None]:
import pandas as pd
import requests
import time

df = pd.read_csv("wikidata_disciplines.csv")

WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql"

def fetch_related_concepts(discipline_uri):
    query = f"""
    SELECT (GROUP_CONCAT(DISTINCT ?conceptLabel; separator=";") AS ?relatedConcepts) WHERE {{
      ?concept (wdt:P361|wdt:P2579|wdt:P279|wdt:P31) <{discipline_uri}>.
      ?concept rdfs:label ?conceptLabel.
      FILTER(LANG(?conceptLabel) = "en")
    }}
    """
    response = requests.get(WIKIDATA_SPARQL_URL, params={"query": query, "format": "json"},)
    
    if response.status_code == 200:
        data = response.json()
        if "results" in data and "bindings" in data["results"]:
            bindings = data["results"]["bindings"]
            if bindings:
                return bindings[0].get("relatedConcepts", {}).get("value", "")
    return ""

df["relatedConcepts"] = ""

for idx, row in df.iterrows():
    discipline_uri = row["discipline"]
    related_concepts = fetch_related_concepts(discipline_uri)
    df.at[idx, "relatedConcepts"] = related_concepts
    print(f"Processed {row['disciplineLabel']} - Related Concepts: {related_concepts}")
    
    # Respect Wikidata's rate limits
    time.sleep(1)  # 1-second delay to avoid excessive requests

# Save results to a new CSV file
# df.to_csv("wikidata_disciplines_with_related_concepts.csv", index=False)
print("Saved related concepts to 'wikidata_disciplines_with_related_concepts.csv'.")


In [9]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# Load the CSV file
df = pd.read_csv("wikidata_disciplines_with_related_concepts.csv")

# Load the embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Function to create a meaningful text representation
def create_text_representation(row):
    if pd.notna(row["disciplineDescription"]):  # Check if description exists
        return f"{row['disciplineLabel']}: {row['disciplineDescription']}"
    return row["disciplineLabel"]

# Generate text representations
df["text_representation"] = df.apply(create_text_representation, axis=1)

# Compute embeddings using both name and description
df["embedding"] = df["text_representation"].apply(lambda x: model.encode(x))

# Convert list of embeddings to a NumPy array
embeddings_matrix = np.vstack(df["embedding"].values)

# Save embeddings and corresponding discipline data
np.save("discipline_embeddings.npy", embeddings_matrix)  # Save embeddings
# df[["discipline", "disciplineLabel", "disciplineDescription"]].to_csv("disciplines_metadata.csv", index=False)  # Save metadata



In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

df_meta = pd.read_csv("wikidata_disciplines.csv")  # Discipline metadata
embeddings_matrix = np.load("discipline_embeddings.npy")  # Precomputed embeddings

def find_relevant_disciplines(course_name, top_n=10):
    course_embedding = model.encode(course_name).reshape(1, -1)  # Compute course embedding
    similarities = cosine_similarity(course_embedding, embeddings_matrix)[0]
    df_meta["similarity"] = similarities
    top_disciplines = df_meta.sort_values(by="similarity", ascending=False).head(top_n)
    
    return top_disciplines

# Example:
relevant_disciplines = find_relevant_disciplines("Logic in Computer Science: This is a lecture on foundations of Logic for Computer Science. Introduction to Logic and symbolic Knowledge Representation and Reasoning. Propositional Calculus: syntax, semantics, logical implication, inference, theorem proving. Design and analysis of logical models. Normal forms (CNF, DNF). The SAT problem. First Order Predicate Calculus: syntax, semantics, logical implication, inference, theorem proving. Design and analysis of logical models in FOPC. Normal forms (CNF, DNF). Resolution and Dual Resolution theorem proving. Introduction to Logic Programming and Constraint Programming.")
print(relevant_disciplines)


      Unnamed: 0                                discipline  \
332          332   http://www.wikidata.org/entity/Q2555318   
696          696   http://www.wikidata.org/entity/Q1003009   
540          540    http://www.wikidata.org/entity/Q387196   
475          475     http://www.wikidata.org/entity/Q21198   
695          695   http://www.wikidata.org/entity/Q3984091   
476          476  http://www.wikidata.org/entity/Q11492827   
1438        1438   http://www.wikidata.org/entity/Q2878974   
471          471    http://www.wikidata.org/entity/Q428691   
1440        1440    http://www.wikidata.org/entity/Q844718   
474          474     http://www.wikidata.org/entity/Q80006   

                       disciplineLabel  \
332                automated reasoning   
696                       formal logic   
540                  description logic   
475                   computer science   
695             formal language theory   
476   computer science and engineering   
1438      theoretical c

In [73]:
import os
import json
import time
import requests
from typing import List, Optional
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


class WikidataEntity:
    def __init__(self, qid: str, label: str, description: str = ""):
        self.qid = qid
        self.label = label
        self.description = description
        self.url = f"https://www.wikidata.org/wiki/{qid}"

    def __str__(self):
        return f"Qid: {self.qid}, label: {self.label}, description: {self.description}"

    def __repr__(self):
        return f"WikidataEntity('{self.qid}', '{self.label}', '{self.description}')"

    def to_dict(self):
        return {"qid": self.qid, "label": self.label, "description": self.description}

    @staticmethod
    def from_dict(data):
        return WikidataEntity(data["qid"], data["label"], data.get("description", ""))


class WikidataMatcher:
    def __init__(
        self, similarity_threshold=0.3, cache_file: str = "wikidata_cache.json"
    ):
        self.cache_file = cache_file
        self.cache = self.load_cache()
        self.max_nr_of_trials = 3
        self.similarity_threshold = similarity_threshold
        self.embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    def load_cache(self):
        if os.path.exists(self.cache_file):
            with open(self.cache_file, "r", encoding="utf-8") as f:
                data = json.load(f)
            return {k: WikidataEntity.from_dict(v) for k, v in data.items()}
        else:
            return {}

    def save_cache(self):
        with open(self.cache_file, "w", encoding="utf-8") as f:
            json.dump(
                {k: v.to_dict() for k, v in self.cache.items()},
                f,
                indent=2,
                ensure_ascii=False,
            )

    def search_entity(self, query: str, course_name: str) -> Optional[WikidataEntity]:
        # Check local cache first
        if query in self.cache:
            return self.cache[query]

        # If not found, query Wikidata API
        entities = self._search_wikidata(query, course_name)

        if entities:
            entity = entities[0]  # Take the best matching entity
            if entity:
                self.cache[query] = entity
            self.save_cache()
            return entity
        else:
            return None

    def _search_wikidata(self, query: str, course_name:str) -> List[WikidataEntity]:
        search_url = "https://www.wikidata.org/w/api.php"
        params = {
            "action": "query",
            "list": "search",
            "srsearch": query,
            "format": "json",
            "srlimit": 5,
            "srnamespace": "0",
        }
        trial_nr = 1
        while trial_nr <= self.max_nr_of_trials:
            response = requests.get(search_url, params=params)
            if response.status_code != 200:
                trial_nr += 1
                print("sleeping...")
                time.sleep(5)
                continue
                # response.raise_for_status()

            data = response.json()
            if "query" not in data or "search" not in data["query"]:
                return []

            titles = [item["title"] for item in data["query"]["search"]]
            entities = self._get_entities_details(titles)

            if not entities:
                return []
            
            filtered_entities = self._filter_entities(entities)

            if not filtered_entities:
                print(f"No valid Wikidata entities after filtering for query '{query}'.")
                return []

            return [self._get_best_match(query, course_name, filtered_entities)]

    def _get_best_match(
        self, query: str, course_name:str, candidates: List[WikidataEntity]
    ) -> WikidataEntity | None:
        query_embedding = self.embedding_model.encode(f"{query}")
        course_embedding = self.embedding_model.encode(f"{course_name}")

        scored = []
        for entity in candidates:
            text = (
                f"{entity.label}: {entity.description}"
                if entity.description
                else entity.label
            )

            candidate_embedding = self.embedding_model.encode(text)
            query_similarity = cosine_similarity([query_embedding], [candidate_embedding])[0][
                0
            ]
            course_similarity = cosine_similarity([course_embedding], [candidate_embedding])[0][
                0
            ]
            if query_similarity >= self.similarity_threshold or course_similarity >= self.similarity_threshold:
                scored.append((entity, query_similarity, course_similarity))
        print(scored)
        # Sort descending by similarity
        scored.sort(key=lambda x: x[1], reverse=True)
        if len(scored) == 0: return None

        best_match, best_query_score, best_course_score = scored[0]
        print(
            f"Best match for '{query}' = '{best_match.label}' (score: {best_query_score:.2f})"
        )

        return best_match

    def _get_entities_details(self, qids: List[str]) -> List[WikidataEntity]:
        if not qids:
            return []

        sparql_url = "https://query.wikidata.org/sparql"
        ids_formatted = " ".join(f"wd:{qid}" for qid in qids)
        query = f"""
        SELECT DISTINCT ?item ?itemLabel ?itemDescription
        WHERE {{
            VALUES ?item {{ {ids_formatted} }} 
            FILTER NOT EXISTS {{
                ?item wdt:P31 ?type .
                FILTER (?type IN (wd:Q7725634, wd:Q18918145, wd:Q13442814, wd:Q1368848))
            }}
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
        }}
        """
        trial_nr = 1
        while trial_nr <= self.max_nr_of_trials:
            r = requests.get(sparql_url, params={"format": "json", "query": query})
            if r.status_code != 200:
                trial_nr += 1
                print("sleeping sparql ...")
                time.sleep(5)
                continue

            data = r.json()
            entities = {}

            for item in data["results"]["bindings"]:
                label = item.get("itemLabel", {}).get("value", "")
                description = item.get("itemDescription", {}).get("value", "")
                qid = (
                    item.get("item", {})
                    .get("value")
                    .replace("http://www.wikidata.org/entity/", "")
                )
                entities[qid] = WikidataEntity(qid, label, description)

            return [entities[qid] for qid in qids if qid in entities]
    
    def _filter_entities(self, entities: List[WikidataEntity]) -> List[WikidataEntity]:
        filtered = []
        for e in entities:
            label = e.label or ""
            description = (e.description or "").lower()
            if label == "" or description == "": continue

            if "category:" in label.lower():
                continue
            if len(label.split()) > 4:
                continue
            if "scientific article" in description or "scholarly article" in description:
                continue

            filtered.append(e)
        return filtered


In [74]:
wikidata_matcher = WikidataMatcher()
wikidata_matcher.search_entity('function (mathematics)', 'Mathematical analysis')

[(WikidataEntity('Q1967302', 'Membership function', 'Membership function (mathematics)'), np.float32(0.49406463), np.float32(0.25334942)), (WikidataEntity('Q11348', 'function', 'association of a single output to each input'), np.float32(0.3674376), np.float32(0.17672753)), (WikidataEntity('Q7754', 'mathematical analysis', 'branch of mathematics'), np.float32(0.43703473), np.float32(0.7466788))]
Best match for 'function (mathematics)' = 'Membership function' (score: 0.49)


WikidataEntity('Q1967302', 'Membership function', 'Membership function (mathematics)')

In [None]:
"scholarly article", "scientific article"