In [1]:
from collections import defaultdict
import networkx as nx
import utils
from tqdm import tqdm

This notebook used for generating one file:
* topWikiTaxon.txt

In [2]:
valclasses = set()
with open('InstORclass.tsv', 'r') as f:
    for line in f:
        tuple = line.strip().split('\t')
        if len(tuple) > 1 and tuple[1] == '[CLS]':
            valclasses.add(tuple[0])
print("Number of valid classes", len(valclasses))

Number of valid classes 1678186


In [3]:
ori_wikitaxonDown = defaultdict(set)
with open('../wikidata_src/wiki_taxonomy.tsv', 'r') as wikireader:
    for line in wikireader:
        triple = line.strip().split('\t')
        if len(triple) > 3:
            ori_wikitaxonDown[triple[2]].add(triple[0])
# check graph attributes
graph = nx.DiGraph(ori_wikitaxonDown)
print("DAG: ", nx.is_directed_acyclic_graph(graph))
print("Connected: ", nx.is_weakly_connected(graph))
print("MaxDepth: ", max(nx.shortest_path_length(graph, source='wd:Q35120').values()))

DAG:  True
Connected:  True
MaxDepth:  19


In [4]:
# Initialize Root and Top1 classes
root = 'wd:Q35120' # entity
top_wikitaxonDown, top_wikitaxonUp = defaultdict(set), defaultdict(set)
topClasses = ori_wikitaxonDown.get(root, [])
top_wikitaxonDown[root] = topClasses.copy()
top_wikitaxonUp[root] = set()
for c in topClasses:
    top_wikitaxonUp[c].add(root)

In [5]:
global top_wikitaxonDown, top_wikitaxonUp, ori_wikitaxonDown
def addSubClass(superClass, subClass):
    """Adds the Wikidata classes to the wiki clean taxonomy"""
    if subClass not in valclasses:
        return
    top_wikitaxonUp[subClass].add(superClass)
    top_wikitaxonDown[superClass].add(subClass)
    # Avoid adding the subclasses again in case of double inheritance -> save time
    if subClass in top_wikitaxonDown:
        return
    for subClass2 in ori_wikitaxonDown.get(subClass,[]):    
        addSubClass(subClass, subClass2)

In [6]:
for topClass in topClasses: # DFS traversal
    for subclass in ori_wikitaxonDown.get(topClass, []):
        addSubClass(topClass, subclass)

In [7]:
topgraph = nx.DiGraph(top_wikitaxonDown)
print("DAG: ", nx.is_directed_acyclic_graph(topgraph))
print("Connected: ", nx.is_weakly_connected(topgraph))
print("MaxDepth: ", max(nx.shortest_path_length(topgraph, source='wd:Q35120').values()))
print("Top Number of classes: ", topgraph.number_of_nodes()) # only 4% classes kept
print("Ori Number of classes: ", graph.number_of_nodes())

DAG:  True
Connected:  True
MaxDepth:  17
Top Number of classes:  296550
Ori Number of classes:  3962959


In [8]:
# Post-processing of topgraph
cum_cls_inst_stats = defaultdict(int)
with open('cum_cls_inst_stats.txt', 'r') as file:
    for line in file:
        cls, number = line.strip().split('\t')
        cum_cls_inst_stats[cls] = int(number)
print("Number of classes with cumulative instances: ", len(cum_cls_inst_stats))

Number of classes with cumulative instances:  82742


In [9]:
ScholarlyArticle = "wd:Q13442814" # Discard this class
cls_discard = utils.getDescendants(ScholarlyArticle, top_wikitaxonDown)
print("Number of classes to discard under ScholarlyArticle: ", len(cls_discard))

Number of classes to discard under ScholarlyArticle:  60


In [10]:
# Remove classes without cumulative instances
all_classes = list(top_wikitaxonUp.keys())
filtered_top_wikitaxonUp = defaultdict(set)
for c in tqdm(all_classes, desc="Creating Sub-Taxonomy"):
    if c in cum_cls_inst_stats and c not in cls_discard:
        filtered_top_wikitaxonUp[c] = top_wikitaxonUp[c].copy()
print("Number of classes in the filtered taxonomy: ", len(filtered_top_wikitaxonUp))

Creating Sub-Taxonomy: 100%|██████████| 296550/296550 [00:00<00:00, 2033357.18it/s]

Number of classes in the filtered taxonomy:  44903





In [11]:
# Check if important classes are present
filtered_top_wikitaxonDown = defaultdict(set)
for c in filtered_top_wikitaxonUp:
    for parent in filtered_top_wikitaxonUp[c]:
        filtered_top_wikitaxonDown[parent].add(c)
topgraph = nx.DiGraph(filtered_top_wikitaxonDown)

In [12]:
# Graph properties
print("DAG: ", nx.is_directed_acyclic_graph(topgraph))
print("Connected: ", nx.is_weakly_connected(topgraph))
print("MaxDepth: ", max(nx.shortest_path_length(topgraph, source='wd:Q35120').values()))
print("Top Number of classes: ", topgraph.number_of_nodes())

DAG: 

 True
Connected:  True
MaxDepth:  16
Top Number of classes:  44903


* Post-processing
    - Remove transitive links (no need, as no changes made)
    - Remove top1 classes with no subclasses
    - Skip connection for concepts without descriptions
    - Filter non noun-phrases

In [13]:
# Remove top1 classes with no subclasses
root = 'wd:Q35120'
topclss = list(topgraph.successors(root))
for topcls in topclss:
    if topgraph.out_degree(topcls) == 0:
        print(topcls)
        topgraph.remove_node(topcls)
topgraph.number_of_edges(), topgraph.number_of_nodes()

wd:Q122754124
wd:Q3885844
wd:Q115471117
wd:Q25047676
wd:Q203872
wd:Q120725535


(60621, 44897)

In [16]:
cls2desc = {}
with open('../wikidata_src/wiki_taxonomy_description.tsv', 'r') as wikireader:
    for line in wikireader:
        triple = line.strip().split('\t')
        if len(triple) > 3:
            cls2desc[triple[0]] = triple[2][1:-1]

cls2label = {}
with open('../wikidata_src/wiki_taxonomy_labels.tsv', 'r') as wikireader:
    for line in wikireader:
        triple = line.strip().split('\t')
        if len(triple) > 3:
            cls2label[triple[0]] = triple[2][1:-1]

In [17]:
cls_nodesc = set(topgraph.nodes) - set(cls2desc.keys())
print("Number of classes without description: ", len(cls_nodesc))

Number of classes without description:  6636


In [18]:
cls_nodesc

{'wd:Q1751831',
 'wd:Q42349081',
 'wd:Q22807264',
 'wd:Q7569184',
 'wd:Q108710753',
 'wd:Q5784028',
 'wd:Q57980038',
 'wd:Q11598626',
 'wd:Q2194371',
 'wd:Q30921461',
 'wd:Q7783077',
 'wd:Q12302249',
 'wd:Q116986936',
 'wd:Q124434470',
 'wd:Q7972079',
 'wd:Q124693942',
 'wd:Q114794403',
 'wd:Q11489006',
 'wd:Q24567296',
 'wd:Q121114732',
 'wd:Q110279334',
 'wd:Q53001699',
 'wd:Q16002704',
 'wd:Q28130009',
 'wd:Q115668795',
 'wd:Q2817758',
 'wd:Q113469341',
 'wd:Q7257916',
 'wd:Q63864177',
 'wd:Q52558144',
 'wd:Q10593476',
 'wd:Q12858816',
 'wd:Q17773756',
 'wd:Q3060597',
 'wd:Q21438156',
 'wd:Q30504307',
 'wd:Q23795647',
 'wd:Q20180780',
 'wd:Q2463167',
 'wd:Q59662559',
 'wd:Q3535487',
 'wd:Q67711489',
 'wd:Q11606860',
 'wd:Q18901004',
 'wd:Q9292190',
 'wd:Q115561019',
 'wd:Q65048785',
 'wd:Q11426227',
 'wd:Q14292916',
 'wd:Q17218855',
 'wd:Q4375507',
 'wd:Q4500821',
 'wd:Q58718353',
 'wd:Q11420447',
 'wd:Q62401112',
 'wd:Q5258439',
 'wd:Q104176997',
 'wd:Q17205774',
 'wd:Q11625076',
 

In [30]:
# Skip connection for classes without description
for cls in cls_nodesc:
    if not topgraph.has_node(cls):
        continue
    # leaf nodes
    if topgraph.out_degree(cls) == 0:
        topgraph.remove_node(cls)
        continue
    # inner nodes
    children = list(topgraph.successors(cls))
    parents = list(topgraph.predecessors(cls))
    for pc in parents:
        for cc in children:
            topgraph.add_edge(pc, cc)
    topgraph.remove_node(cls)

In [96]:
# Graph properties
print("DAG: ", nx.is_directed_acyclic_graph(topgraph))
print("Connected: ", nx.is_weakly_connected(topgraph))
print("MaxDepth: ", max(nx.shortest_path_length(topgraph, source='wd:Q35120').values()))
# stats
topgraph.number_of_nodes(), topgraph.number_of_edges()

DAG:  True
Connected:  True
MaxDepth:  15


(38261, 52826)

In [42]:
# # Transitive Reduction: Nothing changed
# graph = nx.transitive_reduction(topgraph)
# graph.number_of_edges(), graph.number_of_nodes()

In [95]:
# Check if important classes are still in the graph
# Refer to YAGO4.5 design file: https://yago-knowledge.org/data/yago4.5/
# Creative work; Q17537576; Q386724
print("-Creative work-", "IncludeCls:", topgraph.has_node('wd:Q2424752'), 
      " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q17537576'))
print("-Creative work-", "IncludeCls:", topgraph.has_node('wd:Q386724'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q386724'))
# Book; Q571
print("-Book-", "IncludeCls:", topgraph.has_node('wd:Q571'), 
      " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q571'))
# Newspaper; Q11032
print("-Newspaper-", "IncludeCls:", topgraph.has_node('wd:Q11032'), 
      " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q11032'))
# TVSeries; Q5398426
print("-TVSeries-", "IncludeCls:", topgraph.has_node('wd:Q5398426'), 
      " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q5398426'))
# MusicComposition; Q207628; Q2188189
print("-MusicComposition-", "IncludeCls:", topgraph.has_node('wd:Q207628'))
print("-MusicComposition-", "IncludeCls:", topgraph.has_node('wd:Q2188189'))
print("-Music-", "IncludeCls:", topgraph.has_node('wd:Q638'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q638'))
# Movie, Q11424
print("-Movie-", "IncludeCls:", topgraph.has_node('wd:Q11424'), 
      " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q11424'))
# Event, Q1190554; Q1656682
print("-Event-", "IncludeCls:", topgraph.has_node('wd:Q1190554'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q1190554'))
print("-Event-", "IncludeCls:", topgraph.has_node('wd:Q1656682'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q1656682'))
# Election, Q40231
print("-Election-", "IncludeCls:", topgraph.has_node('wd:Q40231'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q40231'))
# Organization, Q43229
print("-Organization-", "IncludeCls:", topgraph.has_node('wd:Q43229'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q43229'))
# Corporation, Q783794; Q4830453
print("-Corporation-", "IncludeCls:", topgraph.has_node('wd:Q783794'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q783794'))
print("-Corporation-", "IncludeCls:", topgraph.has_node('wd:Q4830453'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q4830453'))
# Airline, Q46970
print("-Airline-", "IncludeCls:", topgraph.has_node('wd:Q46970'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q46970'))
# EducationalOrganization, Q5341295
print("-EducationalOrganization-", "IncludeCls:", topgraph.has_node('wd:Q5341295'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q5341295'))
# PerformingGroup, Q105815710
print("-PerformingGroup-", "IncludeCls:", topgraph.has_node('wd:Q105815710'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q105815710'))
# MusicGroup, Q2088357
print("-MusicGroup-", "IncludeCls:", topgraph.has_node('wd:Q2088357'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q2088357'))
# Person, Q5; Q215627
# print("-Person-", "IncludeCls:", topgraph.has_node('wd:Q5'),
#         " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q5'))
print("-Person-", "IncludeCls:", topgraph.has_node('wd:Q215627'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q215627'))
# Worker, Q702269, Q327055
print("-Worker-", "IncludeCls:", topgraph.has_node('wd:Q702269'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q702269'))
print("-Worker-", "IncludeCls:", topgraph.has_node('wd:Q327055'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q327055'))
# Creator, Q483501
print("-Creator-", "IncludeCls:", topgraph.has_node('wd:Q483501'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q483501'))
# Academic, Q66666685
print("-Academic-", "IncludeCls:", topgraph.has_node('wd:Q66666685'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q66666685'))
# SportsPerson, Q50995749
print("-SportsPerson-", "IncludeCls:", topgraph.has_node('wd:Q50995749'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q50995749'))
# Politician, Q82955
print("-Politician-", "IncludeCls:", topgraph.has_node('wd:Q82955'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q82955'))
# AdministrativeArea, Q56061
print("-AdministrativeArea-", "IncludeCls:", topgraph.has_node('wd:Q56061'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q56061'))
# # City, Q515, Q7930989
# print("-City-", "IncludeCls:", topgraph.has_node('wd:Q515'),
#         " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q515'))
# Country, Q6256
print("-Country-", "IncludeCls:", topgraph.has_node('wd:Q6256'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q6256'))
# Landform, Q14524493, Q205895, Q2221906, Q35145263, Q271669, Q3622002, Q82794
print("-Landform-", "IncludeCls:", topgraph.has_node('wd:Q14524493'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q14524493'))
print("-Landform-", "IncludeCls:", topgraph.has_node('wd:Q205895'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q205895'))
print("-Landform-", "IncludeCls:", topgraph.has_node('wd:Q2221906')) # geographic location -> P31:type of property
print("-Landform-", "IncludeCls:", topgraph.has_node('wd:Q35145263'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q35145263'))
print("-Landform-", "IncludeCls:", topgraph.has_node('wd:Q271669'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q271669'))
print("-Landform-", "IncludeCls:", topgraph.has_node('wd:Q3622002'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q3622002'))
print("-Landform-", "IncludeCls:", topgraph.has_node('wd:Q82794'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q82794'))
# BodyOfWater, Q15324, Q116126039
print("-BodyOfWater-", "IncludeCls:", topgraph.has_node('wd:Q15324'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q15324'))
print("-BodyOfWater-", "IncludeCls:", topgraph.has_node('wd:Q116126039'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q116126039'))
# Continent, Q5107
print("-Continent-", "IncludeCls:", topgraph.has_node('wd:Q5107'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q5107'))
# HumanMadeGeographicalEntity, Q811979, Q811430, Q811463, Q35145743
print("-HumanMadeGeographicalEntity-", "IncludeCls:", topgraph.has_node('wd:Q811979'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q811979'))
print("-HumanMadeGeographicalEntity-", "IncludeCls:", topgraph.has_node('wd:Q811430'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q811430'))
print("-HumanMadeGeographicalEntity-", "IncludeCls:", topgraph.has_node('wd:Q811463'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q811463'))
print("-HumanMadeGeographicalEntity-", "IncludeCls:", topgraph.has_node('wd:Q35145743'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q35145743'))
# Way, Q83620
print("-Way-", "IncludeCls:", topgraph.has_node('wd:Q83620'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q83620'))
# Airport, Q1248784
print("-Airport-", "IncludeCls:", topgraph.has_node('wd:Q1248784'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q1248784'))
# AstronomicalObject, Q6999
print("-AstronomicalObject-", "IncludeCls:", topgraph.has_node('wd:Q6999'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q6999'))
# Product, Q2424752
print("-Product-", "IncludeCls:", topgraph.has_node('wd:Q2424752'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q2424752'))
# # Taxon, Q16521
# print("-Taxon-", "IncludeCls:", topgraph.has_node('wd:Q16521'),
#         " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q16521'))
# Award, Q618779
print("-Award-", "IncludeCls:", topgraph.has_node('wd:Q618779'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q618779'))
# belief system (Q5390013)
print("-belief system-", "IncludeCls:", topgraph.has_node('wd:Q5390013'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q5390013'))
# Language, Q34770
print("-Language-", "IncludeCls:", topgraph.has_node('wd:Q34770'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q34770'))
# Gender, Q48264
print('-Gender-', "IncludeCls:", topgraph.has_node('wd:Q48264'))
# FictionalEntity, Q14897293, Q115537581, Q64728693, Q18706315, Q21070598
print("-FictionalEntity-", "IncludeCls:", topgraph.has_node('wd:Q14897293'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q14897293'))
print("-FictionalEntity-", "IncludeCls:", topgraph.has_node('wd:Q115537581')) # imaginary character P279 agent(removed)
print("-FictionalEntity-", "IncludeCls:", topgraph.has_node('wd:Q64728693'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q64728693'))
print("-FictionalEntity-", "IncludeCls:", topgraph.has_node('wd:Q18706315'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q18706315'))
print("-FictionalEntity-", "IncludeCls:", topgraph.has_node('wd:Q21070598'),
        " **Depth: ", nx.shortest_path_length(topgraph, source='wd:Q35120', target='wd:Q21070598'))

-Creative work- IncludeCls: True  **Depth:  5
-Creative work- IncludeCls: True  **Depth:  3
-Book- IncludeCls: True  **Depth:  3
-Newspaper- IncludeCls: True  **Depth:  5
-TVSeries- IncludeCls: True  **Depth:  6
-MusicComposition- IncludeCls: True
-MusicComposition- IncludeCls: True
-Music- IncludeCls: True  **Depth:  5
-Movie- IncludeCls: True  **Depth:  6
-Event- IncludeCls: True  **Depth:  4
-Event- IncludeCls: True  **Depth:  5
-Election- IncludeCls: True  **Depth:  5
-Organization- IncludeCls: True  **Depth:  4
-Corporation- IncludeCls: True  **Depth:  6
-Corporation- IncludeCls: True  **Depth:  5
-Airline- IncludeCls: True  **Depth:  8
-EducationalOrganization- IncludeCls: True  **Depth:  5
-PerformingGroup- IncludeCls: True  **Depth:  5
-MusicGroup- IncludeCls: True  **Depth:  6
-Person- IncludeCls: True  **Depth:  4
-Worker- IncludeCls: True  **Depth:  6
-Worker- IncludeCls: True  **Depth:  5
-Creator- IncludeCls: True  **Depth:  6
-Academic- IncludeCls: True  **Depth:  7
-Spor

In [97]:
# Write the topWikiTaxon to file
topwikiTaxonDown = nx.to_dict_of_lists(topgraph)
with open('topWikiTaxonUp.tsv', 'w') as file:
    for parent in topwikiTaxonDown:
        for child in topwikiTaxonDown[parent]:
            file.write(f"{child}\t{parent}\n")

* Visualization

In [102]:
cls2label = {}
with open('../wikidata_src/wiki_taxonomy_labels.tsv', 'r') as wikireader:
    for line in wikireader:
        triple = line.strip().split('\t')
        if len(triple) > 3:
            cls2label[triple[0]] = triple[2][1:-1]

In [32]:
def generate_html(node, taxonomy):
    html = '<ul>'
    for child in taxonomy.get(node, []):
        html += f'<li><span class="toggle" onclick="toggleChildren(this)">&#9660;</span>{cls2label[child]}({child[3:]})<ul class="children">'
        html += generate_html(child, taxonomy)
        html += '</ul></li>'
    html += '</ul>'
    return html

In [33]:
# Create the HTML content
html_content = generate_html('wd:Q35120', topwikiTaxonDown)

# Generate the complete HTML file
html_template = f'''
<!DOCTYPE html>
<html>
<head>
    <style>
        ul {{
            list-style-type: none;
        }}
        li {{
            padding-left: 10px;
        }}
        .toggle {{
            cursor: pointer;
            color: black;
        }}
        .children {{
            display: none;
        }}
    </style>
    <script>
        function toggleChildren(element) {{
            var ul = element.nextElementSibling;
            if (ul.style.display === 'none' || ul.style.display === '') {{
                ul.style.display = 'block';
                element.textContent = '▶';
                element.style.color = 'blue';
            }} else {{
                ul.style.display = 'none';
                element.textContent = '▼';
                element.style.color = 'black';
            }}
        }}
    </script>
</head>
<body>
    <h1>Wikidata Taxonomy</h1>
    {html_content}
</body>
</html>
'''

# Write the HTML content to a file
with open("topWikiTaxon.html", "w") as html_file:
    html_file.write(html_template)

print("HTML file generated: class_hierarchy.html")

HTML file generated: class_hierarchy.html


* Code: 2024-06-03: check for 4M classes (oriwikiup), not the extracted wiki_taxonomy.tsv

In [6]:
from collections import defaultdict
oriwikiup = defaultdict(set)
path = '/home/infres/ypeng-21/work/Taxon_clean/build_dataset/test/build_data_from_nt/24_03_22/raw_data/oriwikiup.txt'
with open(path, 'r') as file:
    for line in file:
        tupl = line.strip().split('\t')
        if len(tupl) < 2:
            continue
        else:
            child, parent = line.strip().split('\t')
            oriwikiup[child].add(parent)
oriwikiup['Q35120'] = set()

In [8]:
oriclasses = set()
for cls in oriwikiup.keys():
    oriclasses.add('wd:'+cls)

In [3]:
valclasses = set()
with open('InstORclass.tsv', 'r') as f:
    for line in f:
        tuple = line.strip().split('\t')
        if len(tuple) > 1 and tuple[1] == '[CLS]':
            valclasses.add(tuple[0])
print("Number of valid classes", len(valclasses))

Number of valid classes 1678186


In [13]:
cls_after_inst_or_cls = oriclasses.intersection(valclasses)

In [10]:
def load_label(path):
    cls2label = {}
    with open(path, 'r') as f:
        for line in f:
            # wd:Q96196524 rdfs:label "current entity" .
            triple = line.strip().split('\t')
            if len(triple) > 3:
                cls2label[triple[0]] = triple[2][1:-1]
    return cls2label

def load_desc(path):
    cls2desc = {}
    with open(path, 'r') as f:
        for line in f:
            # wd:Q96196524 schema:description "current bodies" .
            triple = line.strip().split('\t')
            if len(triple) > 3:
                cls2desc[triple[0]] = triple[2][1:-1]
    return cls2desc

In [15]:
label_path = '/home/infres/ypeng-21/work/Taxon_clean/build_dataset/test/build_data_from_nt/raw_data/raw_wikiTaxonomy_labels.tsv' # raw data
# desc_path = '/home/infres/ypeng-21/work/Taxon_clean/build_dataset/test/build_data_from_nt/24_03_22/raw_data/wikidata_src/wiki_taxonomy_description.tsv'
cls2label = load_label(label_path)
# cls2desc = load_desc(desc_path)

In [17]:
len(set(cls2label.keys()).intersection(cls_after_inst_or_cls))

1641749