In [1]:
import nltk
import numpy as np
from gensim.models.wrappers import FastText
import re
import os, sys
import networkx as nx

%load_ext autoreload
%autoreload 2

# np.set_printoptions(suppress=True, formatter={'float_kind':'{:0.4f}'.format})


In [2]:
# Load the fastText binary model stored at the path below. Later cells use
# `model` for token membership tests (`token in model`), vector lookup
# (`model[token]`) and the embedding dimensionality (`model.vector_size`).
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running elsewhere.
# NOTE(review): `gensim.models.wrappers` / `load_fasttext_format` belong to the
# gensim 3.x API and are, as far as I know, gone in gensim 4.x
# (`gensim.models.fasttext.load_facebook_model` replaces them) — confirm
# against the installed gensim version.
model = FastText.load_fasttext_format('/datadrive/fastText-pretrained-embedding/fastText.wiki.en.bin')


In [3]:
class Taxon(object):
    """A single node (concept) of a taxonomy.

    tx_id: identifier of the taxon (stored as given)
    rank: numeric rank, cast to int
    norm_name: normalized name of the taxon
    display_name: human-readable name of the taxon
    main_type: stored as given
    level: numeric level of the taxon, cast to int
    p_count, c_count: counters, cast to int
        (presumably paper / citation counts -- TODO confirm with the data source)
    create_date: stored as given

    Instances do not override __eq__ / __hash__, so they compare and hash by
    identity; two Taxon objects with the same tx_id are distinct graph nodes.
    """

    def __init__(self, tx_id, rank=-1, norm_name="none", display_name="None", main_type="", level="-100", p_count=0, c_count=0, create_date="None"):
        self.tx_id = tx_id
        self.rank = int(rank)
        self.norm_name = norm_name
        self.display_name = display_name
        self.main_type = main_type
        self.level = int(level)
        self.p_count = int(p_count)
        self.c_count = int(c_count)
        self.create_date = create_date

    def __str__(self):
        return "Taxon {} (name: {}, level: {})".format(self.tx_id, self.norm_name, self.level)

    def __lt__(self, another_taxon):
        """Order taxons by level first and by rank among equal levels.

        A lexicographic tuple comparison keeps the ordering consistent: a
        taxon with a larger level is never "less than" one with a smaller
        level, whatever their ranks are, so sorting is well defined.
        """
        return (self.level, self.rank) < (another_taxon.level, another_taxon.rank)

        
class Taxonomy(object):
    """A taxonomy stored as a networkx DiGraph whose nodes are taxon objects.

    Edges point from a parent taxon to a child taxon. ``tx_id2taxon`` maps
    each taxon's ``tx_id`` to the taxon object for every node added through
    the ``add_*`` methods of this class.
    """

    def __init__(self, name="", node_list=None, edge_list=None):
        """
        name: a label used when printing the taxonomy
        node_list: optional iterable of taxon objects to add as nodes
        edge_list: optional iterable of (parent_taxon, child_taxon) pairs
        """
        self.name = name
        self.graph = nx.DiGraph()
        self.tx_id2taxon = {}
        self.root = None  # cache for get_root_node(); reset on every mutation
        if node_list is not None:
            self.add_nodes_from_list(node_list)
        if edge_list is not None:
            self.add_edges_from_list(edge_list)

    def __str__(self):
        return f"=== Taxonomy {self.name} ===\nNumber of nodes: {self.graph.number_of_nodes()}\nNumber of edges: {self.graph.number_of_edges()}"

    def _index_taxon(self, node):
        """Record ``node`` in the tx_id -> taxon lookup table."""
        self.tx_id2taxon[node.tx_id] = node

    def get_number_of_nodes(self):
        return self.graph.number_of_nodes()

    def get_number_of_edges(self):
        return self.graph.number_of_edges()

    def get_nodes(self):
        """
        return: the iterable of nodes as returned by networkx ``graph.nodes()``
        """
        return self.graph.nodes()

    def get_edges(self):
        """
        return: the iterable of (parent, child) edges as returned by networkx ``graph.edges()``
        """
        return self.graph.edges()

    def get_root_node(self):
        """
        return: a taxon object, the first node of a topological ordering of
                the graph (the result is cached until the taxonomy changes)
        raises: IndexError if the taxonomy has no nodes
        """
        if self.root is None:
            if self.graph.number_of_nodes() == 0:
                raise IndexError("taxonomy is empty, it has no root node")
            self.root = list(nx.topological_sort(self.graph))[0]
        return self.root

    def get_leaf_nodes(self):
        """
        return: a list of taxon objects that have no outgoing edge
        """
        return [node for node in self.graph.nodes() if self.graph.out_degree(node) == 0]

    def get_children(self, parent_node):
        """
        parent_node: a taxon object
        return: a list of taxon object representing the children taxons
        """
        assert parent_node in self.graph, "parent node not in taxonomy"
        return list(self.graph.successors(parent_node))

    def get_parent(self, child_node):
        """
        child_node: a taxon object
        return: a list of taxon object representing the parent taxons
        """
        assert child_node in self.graph, "child node not in taxonomy"
        return list(self.graph.predecessors(child_node))

    def get_descendants(self, parent_node):
        """
        parent_node: a taxon object
        return: a list of taxon object representing the descendant taxons
                (parent_node itself is not included)
        """
        assert parent_node in self.graph, "parent node not in taxonomy"
        return list(nx.descendants(self.graph, parent_node))

    def get_ancestors(self, child_node):
        """
        child_node: a taxon object
        return: a list of taxon object representing the ancestor taxons
                (child_node itself is not included)
        """
        assert child_node in self.graph, "child node not in taxonomy"
        return list(nx.ancestors(self.graph, child_node))

    def is_valid_DAG(self):
        return nx.is_directed_acyclic_graph(self.graph)

    def is_weakly_connected(self):
        return nx.number_weakly_connected_components(self.graph) == 1

    def get_max_depth(self):
        """
        return: the number of edges on the longest path of the graph
        """
        return nx.dag_longest_path_length(self.graph)

    def add_node(self, node):
        """
        node: a taxon object
        """
        self.graph.add_node(node)
        self._index_taxon(node)
        self.root = None

    def add_edge(self, start, end):
        """
        start: a taxon object
        end: a taxon object
        """
        self.graph.add_edge(start, end)
        # networkx creates missing endpoints implicitly, so index them too
        self._index_taxon(start)
        self._index_taxon(end)
        self.root = None

    def add_nodes_from_list(self, node_list):
        """
        node_list: an iterable of taxon objects
        """
        node_list = list(node_list)  # allow one-shot iterables: it is traversed twice
        self.graph.add_nodes_from(node_list)
        for node in node_list:
            self._index_taxon(node)
        self.root = None

    def add_edges_from_list(self, edge_list):
        """
        edge_list: a list of taxon object pairs from parent_taxon -> child_taxon
        """
        edge_list = list(edge_list)  # allow one-shot iterables: it is traversed twice
        self.graph.add_edges_from(edge_list)
        for edge in edge_list:
            # index by position so (parent, child, attr_dict) tuples also work
            self._index_taxon(edge[0])
            self._index_taxon(edge[1])
        self.root = None
    

# Sample CS Field-of-study Taxonomy

In [199]:
# Build the node set of the taxonomy from a tab-separated file: one taxon
# per non-blank line, nine columns per line.
taxonomy = Taxonomy(name="mag-cs-fos")
tx_id2taxon = {}  # tx_id (string, first column) -> Taxon
with open("/home/t-jishen/mag-taxonomy/FieldsOfStudy.txt", "r") as fos_file:
    for raw_line in fos_file:
        record = raw_line.strip()
        if not record:
            continue  # ignore blank lines
        fields = record.split("\t")
        assert len(fields) == 9, "Wrong number of segmentations"
        # only the id (col 0), normalized name (col 2) and display name (col 3) are kept
        fos_id, normalized, displayed = fields[0], fields[2], fields[3]
        taxon = Taxon(tx_id=fos_id, display_name=displayed, norm_name=normalized)
        tx_id2taxon[fos_id] = taxon
        taxonomy.add_node(taxon)

In [200]:
# Add the parent -> child edges: every non-blank line holds two
# whitespace-separated taxon ids, parent first.
with open("/home/t-jishen/mag-taxonomy/FieldOfStudyChildren.txt", "r") as children_file:
    for raw_line in children_file:
        record = raw_line.strip()
        if not record:
            continue  # ignore blank lines
        id_pair = record.split()
        assert len(id_pair) == 2, "Wrong number of segmentations"
        parent_id, child_id = id_pair
        # both ids must already be known from the node-loading cell (KeyError otherwise)
        taxonomy.add_edge(tx_id2taxon[parent_id], tx_id2taxon[child_id])

In [232]:
# Sub-DAG of the taxonomy below the taxon with id '41008148'.
# NOTE(review): `sample_dag` is defined in a cell that appears further down
# in this notebook; on a fresh kernel that cell has to be executed before
# this one, otherwise this line raises NameError.
dag = sample_dag(taxonomy, tx_id2taxon['41008148'], depth_limit=7)

In [239]:
# Number of leaves in the sampled sub-DAG, i.e. nodes without outgoing edges.
leaf_node_cnt = sum(1 for node in dag if dag.out_degree(node) == 0)
leaf_node_cnt

24508

In [228]:
def sample_dag(taxonomy, source_node, depth_limit=7):
    """Return the part of the taxonomy graph reachable from ``source_node``.

    taxonomy: a Taxonomy object
    source_node: a taxon object contained in ``taxonomy``
    depth_limit: keep only nodes whose shortest path from ``source_node``
                 has at most this many edges; None keeps every descendant
    return: the networkx subgraph of ``taxonomy.graph`` induced by
            ``source_node`` and the retained descendants
    """
    assert source_node in taxonomy.graph, "parent node not in taxonomy"
    # The result maps each reachable node (the source included, at distance 0)
    # to its hop distance; dict() also copes with networkx versions that
    # return an iterator of (node, distance) pairs.
    depth_by_node = dict(
        nx.single_source_shortest_path_length(taxonomy.graph, source_node, cutoff=depth_limit)
    )
    return taxonomy.graph.subgraph(list(depth_by_node))


In [241]:
# For every field of interest, export the sub-DAG rooted at that field:
#   <field>.terms -- one "tx_id<TAB>norm_name" line per node
#   <field>.taxo  -- one "parent_tx_id<TAB>child_tx_id" line per edge
interested_fields = ["computer science", "mathematics"]
os.makedirs("../data/MAG_FoS", exist_ok=True)  # open(..., "w") does not create missing directories
for interested_field in interested_fields:
    # find the source taxon: the first taxon whose normalized name matches
    source_taxon = next(
        (taxon for taxon in tx_id2taxon.values() if taxon.norm_name == interested_field),
        None,
    )
    if source_taxon is None:
        # Fail loudly instead of reusing the taxon of the previous field,
        # which would write the wrong data under this field's file name.
        raise ValueError(f"no taxon with norm_name {interested_field!r} in tx_id2taxon")
    print(source_taxon)

    dag = sample_dag(taxonomy, source_taxon, depth_limit=7)
    print(f"Number of nodes: {dag.number_of_nodes()}")
    print(f"Number of edges: {dag.number_of_edges()}")

    # file names use underscores instead of whitespace, e.g. "computer_science"
    field_name = "_".join(interested_field.split())
    with open(f"../data/MAG_FoS/{field_name}.terms", "w") as fout:
        for node in dag:
            fout.write(f"{node.tx_id}\t{node.norm_name}\n")

    with open(f"../data/MAG_FoS/{field_name}.taxo", "w") as fout:
        for edge in dag.edges():
            fout.write(f"{edge[0].tx_id}\t{edge[1].tx_id}\n")

Taxon 41008148 (name: computer science, level: -100)
Number of nodes: 29654
Number of edges: 46248
Taxon 33923547 (name: mathematics, level: -100)
Number of nodes: 21365
Number of edges: 34270


## Save term embeddings to files

In [10]:
# Fallback descriptions for terms whose averaged token embedding comes out
# as an all-zero vector. The embedding cell looks a term up here (by its
# lower-cased name) and embeds the description text instead, so every such
# term must have an entry in this table.
# NOTE(review): a few descriptions contain typos ("subcompoent", "of of",
# "protocal"); they are runtime data that feed the embeddings, so they are
# left untouched here — fix deliberately if the embeddings are regenerated.
word_w_no_embed2description = {
    'x3d': "3d computer graphics file format", 
    '576i': "a standard-definition video mode" , 
    'n100': "a large, negative-going evoked potential", 
    's5': "s5 model logic", 
    'p600': "p600 neuroscience erp",
    'p3b': "a subcompoent of the p300",
    '4b3t': "4 binary 3 ternary", 
    'p3a': "novelty p3, component of of time-locked signals", 
    'z39 50': "a client-server standard and an application layer communications protocol", 
    'i2o': "intelligent input/output",
    'x3j13': "ansi common lisp subcommittee",
    '2b1q': "four-level pulse amplitude modulation scheme", 
    '5g': "fifth generation cellular network technology", 
    '3g 324m': "3gpp umbrella protocal for video telephony", 
    '6to4': "an internet transition mechanism for migrating from ipv4 to ipv6", 
    'x86': "a family of instruction set architectures", 
    '480p': "480 display resolutions", 
    '576p': "576 display resolutions", 
    'n400': "a component of event-related potentials", 
    '6in4': "an internet transition mechanism for migrating from ipv4 to ipv6", 
    '4320p': "4320 display resolutions", 
    '2 1 2d': "pseudo 3d", 
    'f16c': "cvt16 instruction set", 
    'x87': "a floating point related subset of x86 instruction set", 
    'c37 94': "ieee c37.94 optical interface", 
    '4in6': "tunneling of IPv4 in IPv6", 
    'c11': "a past standard for the c programming language",
    "e8": "an exceptional simple lie groups in mathematics"
}

In [11]:
def _mean_token_embedding(text, embedding_model):
    """Average the embeddings of the tokens of ``text``.

    text: a string; it is split on whitespace, on ", " and on "-"
    embedding_model: object supporting ``token in m``, ``m[token]`` and ``m.vector_size``
    return: a numpy vector of length ``vector_size``. Tokens the model cannot
            embed contribute nothing to the sum but still count in the
            divisor, so the result is all zeros when no token is embeddable.
    """
    token_list = re.split(r"\s|, |-", text)
    embed = np.zeros(embedding_model.vector_size)
    for token in token_list:
        if token in embedding_model:  # membership test decides whether the model can embed the token
            embed += embedding_model[token]
    return embed / len(token_list)  # re.split always returns at least one token


# For every "<field>.terms" file, compute one embedding per term and write
# them to "<field>.terms.embed": a "<count> <dim>" header line followed by
# one "<tx_id> <v1> <v2> ..." line per term.
data_dir = "../data/MAG_FoS"
for filename in sorted(os.listdir(data_dir)):  # sorted: os.listdir order is arbitrary
    if not filename.endswith(".terms"):
        continue
    file_path = os.path.join(data_dir, filename)

    # generated node embedding feature
    idx2term = {}
    term2idx = {}
    idx2embed = {}
    with open(file_path, "r") as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            segs = line.split("\t")
            idx = segs[0]
            term = segs[1].lower()
            idx2term[idx] = term
            term2idx[term] = idx
            embed = _mean_token_embedding(term, model)
            if np.sum(embed ** 2) == 0:
                # No token of the term could be embedded: embed the
                # hand-written description of the term instead.
                if term not in word_w_no_embed2description:
                    raise KeyError(
                        f"term {term!r} (id {idx}) has no embedding and no entry "
                        "in word_w_no_embed2description"
                    )
                embed = _mean_token_embedding(word_w_no_embed2description[term], model)
            idx2embed[idx] = embed

    print(f"{file_path}, number of terms: {len(idx2embed)}")

    # write to file, ordered by id (ids are strings, so the order is lexicographic)
    with open(f"{file_path}.embed", "w") as fout:
        fout.write(f"{len(idx2embed)} {model.vector_size}\n")
        for idx in sorted(idx2embed):
            embed_string = " ".join([str(a) for a in idx2embed[idx]])
            fout.write(f"{idx} {embed_string}\n")

../data/MAG_FoS/mathematics.terms, number of terms: 21365
../data/MAG_FoS/computer_science.terms, number of terms: 29654
