In [25]:
import nltk
import numpy as np
from gensim.models.wrappers import FastText
import re
import os, sys
import networkx as nx
from gensim.models import KeyedVectors
from tqdm import tqdm

%load_ext autoreload
%autoreload 2

# np.set_printoptions(suppress=True, formatter={'float_kind':'{:0.4f}'.format})


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
class Taxon(object):
    """A single concept (node) of a taxonomy.

    ``rank``, ``level``, ``p_count`` and ``c_count`` are passed through
    ``int``, so they may be given either as ints or as numeric strings
    (e.g. fields read straight from a text file).

    No ``__eq__``/``__hash__`` is defined, so taxons compare and hash by
    identity; two Taxon objects with the same ``tx_id`` are distinct graph nodes.
    """
    def __init__(self, tx_id, rank=-1, norm_name="none", display_name="None", main_type="", level="-100", p_count=0, c_count=0, create_date="None"):
        """
        tx_id: identifier of the taxon (stored as given)
        rank: integer-like rank, used as tie-breaker when ordering taxons
        norm_name: normalized name, shown by __str__
        display_name: display name
        main_type: stored as given
        level: integer-like depth level, primary key when ordering taxons
        p_count: integer-like count (presumably paper count -- TODO confirm)
        c_count: integer-like count (presumably citation count -- TODO confirm)
        create_date: stored as given
        """
        self.tx_id = tx_id
        self.rank = int(rank)
        self.norm_name = norm_name
        self.display_name = display_name
        self.main_type = main_type
        self.level = int(level)
        self.p_count = int(p_count)
        self.c_count = int(c_count)
        self.create_date = create_date
        
    def __str__(self):
        return "Taxon {} (name: {}, level: {})".format(self.tx_id, self.norm_name, self.level)
        
    def __lt__(self, another_taxon):
        """
        Order taxons by level first and by rank within the same level.

        The comparison is lexicographic on (level, rank). Falling back to the
        rank whenever ``self.level`` is not smaller would let a deeper taxon
        sort before a shallower one and make sorting inconsistent.
        """
        return (self.level, self.rank) < (another_taxon.level, another_taxon.rank)

        
class Taxonomy(object):
    """Directed graph of taxon objects with edges pointing parent -> child.

    A thin wrapper around ``networkx.DiGraph`` that additionally maintains
    ``tx_id2taxon``, a lookup from ``tx_id`` to the taxon object for every
    node added through ``add_node`` / ``add_nodes_from_list``.
    """
    def __init__(self, name="", node_list=None, edge_list=None):
        """
        name: display name of the taxonomy
        node_list: optional iterable of taxon objects added on construction
        edge_list: optional iterable of (parent_taxon, child_taxon) pairs added on construction
        """
        self.name = name
        self.graph = nx.DiGraph()
        self.tx_id2taxon = {}
        self.root = None
        # Honour the constructor arguments instead of silently dropping them.
        if node_list is not None:
            self.add_nodes_from_list(node_list)
        if edge_list is not None:
            self.add_edges_from_list(edge_list)
        
    def __str__(self):
        return f"=== Taxonomy {self.name} ===\nNumber of nodes: {self.graph.number_of_nodes()}\nNumber of edges: {self.graph.number_of_edges()}"
    
    def get_number_of_nodes(self):
        return self.graph.number_of_nodes()

    def get_number_of_edges(self):
        return self.graph.number_of_edges()
    
    def get_nodes(self):
        """
        return: the nodes of the underlying graph, exactly as returned by graph.nodes()
        """
        return self.graph.nodes()
    
    def get_edges(self):
        """
        return: the (parent, child) edges of the underlying graph, exactly as returned by graph.edges()
        """
        return self.graph.edges()
    
    def get_root_node(self):
        """
        return: a taxon object, the first node of a topological sort of the graph

        The result is cached in self.root until the taxonomy is modified through
        one of the add_* methods. If the graph has several nodes without
        parents, only one of them is returned.
        """
        if self.root is None:
            self.root = list(nx.topological_sort(self.graph))[0]
        return self.root
    
    def get_leaf_nodes(self):
        """
        return: a list of taxon objects that have no children
        """
        return [node for node in self.graph.nodes() if self.graph.out_degree(node) == 0]
    
    def get_children(self, parent_node):
        """
        parent_node: a taxon object
        return: a list of taxon object representing the children taxons
        """
        assert parent_node in self.graph, "parent node not in taxonomy"
        return [edge[1] for edge in self.graph.out_edges(parent_node)]
    
    def get_parent(self, child_node):
        """
        child_node: a taxon object
        return: a list of taxon object representing the parent taxons
        """
        assert child_node in self.graph, "child node not in taxonomy"
        return [edge[0] for edge in self.graph.in_edges(child_node)]
    
    def get_descendants(self, parent_node):
        """
        parent_node: a taxon object
        return: a list of taxon object representing the descendant taxons
        """
        assert parent_node in self.graph, "parent node not in taxonomy"
        return list(nx.descendants(self.graph, parent_node))
    
    def get_ancestors(self, child_node):
        """
        child_node: a taxon object
        return: a list of taxon object representing the ancestor taxons
        """
        assert child_node in self.graph, "child node not in taxonomy"
        return list(nx.ancestors(self.graph, child_node))
    
    def is_valid_DAG(self):
        """
        return: True if the graph is a directed acyclic graph
        """
        return nx.is_directed_acyclic_graph(self.graph)
    
    def is_weakly_connected(self):
        """
        return: True if the graph consists of exactly one weakly connected component
        """
        return nx.number_weakly_connected_components(self.graph) == 1
    
    def get_max_depth(self):
        """
        return: number of edges on the longest path of the DAG
        """
        return nx.dag_longest_path_length(self.graph)
    
    def add_node(self, node):
        """
        node: a taxon object; it is also registered in tx_id2taxon under node.tx_id
        """
        self.graph.add_node(node)
        self.tx_id2taxon[node.tx_id] = node
        self.root = None  # cached root may no longer be valid
        
    def add_edge(self, start, end):
        """
        start: a taxon object
        end: a taxon object

        Only the graph is updated here; endpoints that were never passed to
        add_node / add_nodes_from_list are not registered in tx_id2taxon.
        """
        self.graph.add_edge(start, end)
        self.root = None  # cached root may no longer be valid
    
    def add_nodes_from_list(self, node_list):
        """
        node_list: an iterable of taxon objects
        """
        # Materialise first: the nodes are traversed twice, so a one-shot
        # iterator would otherwise fill the graph but leave tx_id2taxon empty.
        nodes = list(node_list)
        self.graph.add_nodes_from(nodes)
        for node in nodes:
            self.tx_id2taxon[node.tx_id] = node
        self.root = None  # cached root may no longer be valid

    def add_edges_from_list(self, edge_list):
        """
        edge_list: a list of taxon object pairs from parent_taxon -> child_taxon
        """
        self.graph.add_edges_from(edge_list)
        self.root = None  # cached root may no longer be valid
    

In [18]:
# Build the taxonomy nodes: one Taxon per non-blank, tab-separated row.
taxonomy = Taxonomy(name="mag-cs-fos")
tx_id2taxon = {}
with open("/home/t-jishen/mag-taxonomy/FieldsOfStudy.txt", "r") as fin:
    for raw_line in fin:
        record = raw_line.strip()
        if not record:
            continue
        fields = record.split("\t")
        assert len(fields) == 9, "Wrong number of segmentations"
        # Only the id, normalized name, display name and level columns are used.
        taxon = Taxon(
            tx_id=fields[0],
            display_name=fields[3],
            norm_name=fields[2],
            level=fields[5],
        )
        tx_id2taxon[fields[0]] = taxon
        taxonomy.add_node(taxon)

In [19]:
# Add one parent -> child edge per non-blank row of "<parent id> <child id>".
with open("/home/t-jishen/mag-taxonomy/FieldOfStudyChildren.txt", "r") as fin:
    for raw_line in fin:
        fields = raw_line.strip().split()
        if not fields:
            continue
        assert len(fields) == 2, "Wrong number of segmentations"
        parent_id, child_id = fields
        # Both ids must already be known from the fields-of-study file,
        # otherwise the lookup raises KeyError.
        taxonomy.add_edge(tx_id2taxon[parent_id], tx_id2taxon[child_id])

In [20]:
# Keep every node that takes part in at least one edge (as parent or child).
non_oprhan_nodes = [
    node
    for node in taxonomy.graph.nodes()
    if not (taxonomy.graph.in_degree(node) == 0 and taxonomy.graph.out_degree(node) == 0)
]

In [32]:
# Restrict the graph to the nodes that have at least one edge; .copy() turns
# the result into a standalone graph object.
fos_taxonomy = taxonomy.graph.subgraph(non_oprhan_nodes).copy()

In [39]:
# Keep the level-0 fields together with everything reachable from them.
L0_nodes = [node for node in fos_taxonomy.nodes() if node.level == 0]

# Collect into a set straight away: the descendant sets of different level-0
# fields overlap, so appending them all to a list first would hold many
# duplicates before they are removed.
selected_node_set = set()
for node in L0_nodes:
    print(node)
    selected_node_set.add(node)
    selected_node_set.update(nx.descendants(fos_taxonomy, node))

selected_nodes = list(selected_node_set)
final_fos_taxonomy = fos_taxonomy.subgraph(selected_nodes).copy()

# list(nx.descendants(self.graph, parent_node))
# def sample_dag(taxonomy, source_node, depth_limit=7):
#     subgraph_nodes = taxonomy.get_descendants(source_node)
#     subgraph_nodes.append(source_node)
#     return taxonomy.graph.subgraph(subgraph_nodes)

# for node in fos_taxonomy.nodes():
#     if fos_taxonomy.in_degree(node) == 0:
#         print(node)
# # [len(c) for c in sorted(nx.connected_components(fos_taxonomy), key=len, reverse=True)]

Taxon 95457728 (name: history, level: 0)
Taxon 127313418 (name: geology, level: 0)
Taxon 162324750 (name: economics, level: 0)
Taxon 205649164 (name: geography, level: 0)
Taxon 185592680 (name: chemistry, level: 0)
Taxon 138885662 (name: philosophy, level: 0)
Taxon 144024400 (name: sociology, level: 0)
Taxon 192562407 (name: materials science, level: 0)
Taxon 33923547 (name: mathematics, level: 0)
Taxon 86803240 (name: biology, level: 0)
Taxon 41008148 (name: computer science, level: 0)
Taxon 17744445 (name: political science, level: 0)
Taxon 127413603 (name: engineering, level: 0)
Taxon 15744967 (name: psychology, level: 0)
Taxon 39432304 (name: environmental science, level: 0)
Taxon 144133560 (name: business, level: 0)
Taxon 121332964 (name: physics, level: 0)
Taxon 71924100 (name: medicine, level: 0)
Taxon 142362112 (name: art, level: 0)


In [96]:
# Number of taxons kept in the final taxonomy.
final_fos_taxonomy.number_of_nodes()

431416

In [98]:
# Number of parent -> child links kept in the final taxonomy.
final_fos_taxonomy.number_of_edges()

698743

In [97]:
# Leaves are the nodes without any outgoing (parent -> child) edge.
leaf_node_cnt = sum(
    1 for node in final_fos_taxonomy if final_fos_taxonomy.out_degree(node) == 0
)
leaf_node_cnt

378044

In [99]:
# Dump the final taxonomy as two tab-separated files:
#   .terms - one "<taxon id> <normalized name>" row per node
#   .taxo  - one "<parent id> <child id>" row per edge
with open("/datadrive/structure_expan/data/MAG_FoS/mag_field_of_studies.terms", "w") as fout:
    fout.writelines(
        f"{node.tx_id}\t{node.norm_name}\n" for node in final_fos_taxonomy
    )

with open("/datadrive/structure_expan/data/MAG_FoS/mag_field_of_studies.taxo", "w") as fout:
    fout.writelines(
        f"{edge[0].tx_id}\t{edge[1].tx_id}\n" for edge in final_fos_taxonomy.edges()
    )

# word2vec definition embeddings

In [100]:
# Map the first column (kept as a string) to the second column (as an int)
# for every non-blank, tab-separated row.
wiki2seqId = {}
with open("/datadrive/large_data/FosWikiIdToSeqId.txt", "r") as fin:
    for raw_line in fin:
        record = raw_line.strip()
        if not record:
            continue
        fields = record.split("\t")
        wiki2seqId[fields[0]] = int(fields[1])

In [101]:
# Read back the .terms file: integer taxon id -> normalized name.
tx_id2name = {}
with open("/datadrive/structure_expan/data/MAG_FoS/mag_field_of_studies.terms", "r") as fin:
    for raw_line in fin:
        record = raw_line.strip()
        if not record:
            continue
        fields = record.split("\t")
        tx_id2name[int(fields[0])] = fields[1]

In [102]:
# Collect the raw embedding string of every row whose first column has a
# known mapping in wiki2seqId, keyed by the mapped integer id.
seqId2embed = {}
with open("/datadrive/large_data/MAG_pretrained_definition_word2vec.tsv", "r") as fin:
    for line_no, raw_line in enumerate(fin):
        # Progress indicator, printed once every 100000 lines.
        if line_no % 100000 == 0:
            print(line_no)
        record = raw_line.strip()
        if not record:
            continue
        fields = record.split("\t")
        mapped_id = wiki2seqId.get(fields[0])
        # The mapped ids are ints, so None can only mean "no mapping".
        if mapped_id is not None:
            seqId2embed[mapped_id] = fields[1]

0
100000
200000
300000
400000
500000
600000
700000


In [103]:
# Embedding strings restricted to the taxons of the final taxonomy.
tx_id2embed = {
    tx_id: seqId2embed[tx_id]
    for tx_id in tx_id2name
    if tx_id in seqId2embed
}

In [113]:
# Number of terms read back from the .terms file.
len(tx_id2name)

431416

In [114]:
# Number of those terms for which an embedding string was found.
len(tx_id2embed)

431409

In [115]:
# Write the embeddings as text: a "<row count> <dimension>" header followed
# by one "<taxon id> <values...>" row per term.
EMBED_DIM = 250          # dimension announced in the header
MISSING_VALUE = "-1000"  # component value marking "no embedding available"

# Terms without an embedding still get a row, so that the number of rows
# matches the header. The placeholder is the same for all of them, so it is
# built once instead of once per missing term.
missing_embed_string = " ".join([MISSING_VALUE] * EMBED_DIM)

with open("/datadrive/structure_expan/data/MAG_FoS/mag_field_of_studies.terms.word2vec.def.embed", "w") as fout:
    fout.write(f"{len(tx_id2name)} {EMBED_DIM}\n")
    for seqId in tx_id2name:
        if seqId in tx_id2embed:
            fout.write(f"{seqId} {tx_id2embed[seqId]}\n")
        else:
            fout.write(f"{seqId} {missing_embed_string}\n")

In [116]:
# Load the embedding file written above to check that it parses as word2vec text format.
wv = KeyedVectors.load_word2vec_format("/datadrive/structure_expan/data/MAG_FoS/mag_field_of_studies.terms.word2vec.def.embed")

In [118]:
# Count the placeholder rows, i.e. vectors whose every component is -1000.
# Testing the components directly is exact; comparing the row sum with
# -1000 * 250 would also match any other vector that happens to add up to
# that value and hard-codes the dimension.
(wv.vectors == -1000).all(axis=1).sum()

227

In [117]:
# (number of terms, embedding dimension) of the loaded vectors.
wv.vectors.shape

(431416, 250)

In [120]:
# List the terms whose embedding string, as read from the source file,
# already sums to the placeholder total (-1000 in each of 250 components).
placeholder_total = -1000 * 250
for tx_id, embed_string in tx_id2embed.items():
    components = np.array(list(map(float, embed_string.split(" "))))
    if components.sum() == placeholder_total:
        print(tx_id, tx_id2name[tx_id])

2910279918 biogeneses
2909612880 glasiosite
2909504036 probothriocephalus
2909024353 spathosterninae
2910428536 acylaminopenicillin
2910123521 hemithiridoidea
2910554443 balanitides
2911073012 schedotrioza
2909499090 ossiannilssonola
2909845446 laminaridextrins
2911101801 cytocuprein
2908863618 corneregel
2910440600 pheochromocyte
2911137917 dermamoeba
2909216835 dasybranchus
2908804317 pyelostomies
2910965902 lycopodioideae
2909525876 diakineses
2910310286 paxyodon syrmatophorus
2909612950 disconectes
2909088580 antiantibodies
2908883531 aspidoptera phyllostomatis
2909182676 paladent 20
2909893743 pseudopodisma
2910564829 chromatonema
2911184412 chromoplasm
2909165327 nematoblast
2909340679 cassidulinoides parkerianus
2908778711 hemicycliostyla
2910899768 procapritermes
2909255693 leprotintinnus
2909570606 dependoviruses
2910623702 paranchistus
2908846738 chondroplasties
2909338513 allodrilus
2910692645 hydraclear
2910868477 paenarthrobacter
2908626016 cyclochoroidectomy
2909111635 su

In [124]:
# A result of 1 means the final taxonomy forms a single weakly connected component.
nx.number_weakly_connected_components(final_fos_taxonomy)

1