In [69]:
import networkx as nx
from collections import defaultdict

In [70]:
def readMeSH(fin):
    """
    Given a file-like object, generates MeSH objects, i.e.
    dictionaries with a list of values for each qualifier.
    Example: {"MH": ["Acetylcysteine"]}
    """
    currentEntry = None
    for line in fin:
        line = line.strip()
        if not line:
            continue
        # Handle new record. MeSH explicitly marks this
        if line == "*NEWRECORD":
            # Yiel old entry, initialize new one
            if currentEntry:
                yield currentEntry
            currentEntry = defaultdict(list)
            continue
        # Line example: "MH = Acetylcysteine"
        key, _, value = line.partition(" = ")
        # Append to value list
        currentEntry[key].append(value)
    # If there is a non-empty entry left, yield it
    if currentEntry:
        yield currentEntry

In [71]:
def add_nodepath_and_label_to_endnode_to_networkx(temp_nx,temp_mesh_entry):
    '''
    We receive a networkx label and a single mesh entry
    we split the MN into multiple labels
    we split each label into a list of perpetually growing strings (A01, A01.032, A01.032,047)
    we add the "word label" at the end from teh MH
    '''

    #MN and MH are 'attributes' in the ascii text file
    #MN is all paths
    #MH is the end node
    nodepath_string_path_list=temp_mesh_entry['MN']
    
    #confirm that we are adding the right label always because there is only one
    if (len(temp_mesh_entry['MH']))>1:
        print(temp_mesh_entry['MH'])
        hold=input('found an entry with multiple labels')
    end_node_label=temp_mesh_entry['MH'][0]


    for temp_string_path in nodepath_string_path_list:
        node_path_elements=temp_string_path.split('.')
        node_paths=list()

        for i in range(0,len(node_path_elements)):
            node_paths.append('.'.join(node_path_elements[0:i+1]))

        #print(node_paths)
        #hold=input('node_paths')

        #if 'A11' in node_paths:
            nx.add_path(temp_nx,node_paths)
        temp_nx.nodes[node_paths[-1]]['mesh_label']=end_node_label

In [72]:
mesh_file_address='../../resources/mesh_ascii_2021.txt'

In [73]:
mesh_networkx=nx.DiGraph()

In [74]:
with open(mesh_file_address, "r") as infile:
    # readMeSH() yields MeSH objects, i.e. dictionaries
    for entry in readMeSH(infile):
         add_nodepath_and_label_to_endnode_to_networkx(mesh_networkx,entry)

In [75]:
#get a list of all of the current "headnodes". which are Letter+two digits
number_string_list=['01','02','03','04','05','06','07','08','09']+[str(i) for i in range(10,51)]
possible_headnodes_list=list()
for i in ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']:
    for j in [str(i) for i in number_string_list]:
        possible_headnodes_list.append(i+j)
headnodes_list=[element for element in possible_headnodes_list if (element in mesh_networkx)]

In [76]:
#create a new set of headnodes
category_headnodes={
    'A':'Anatomy',
    'B':'Organisms',
    'C':'Diseases',
    'D':'Chemicals and Drugs',
    'E':'Analytical, Diagnostic and Therapeutic Techniques, and Equipment',
    'F':'Psychiatry and Psychology',
    'G':'Phenomena and Processes',
    'H':'Disciplines and Occupations',
    'I':'Anthropology, Education, Sociology, and Social Phenomena',
    'J':'Technology, Industry, and Agriculture',
    'K':'Humanities',
    'L':'Information Science',
    'M':'Named Groups',
    'N':'Health Care',
    'V':'Publication Characteristics',
    'Z':'Geographicals'
}

In [77]:
#add the category headnodes
for element in category_headnodes.keys():
    mesh_networkx.add_node(element,mesh_label=category_headnodes[element])

In [78]:
#connect the original headnodes to the category headnodes
temp_edges=list()
for element in headnodes_list:
    temp_edges.append(
        (element[0],element)
    )
print(temp_edges)
mesh_networkx.add_edges_from(temp_edges)

[('A', 'A01'), ('A', 'A02'), ('A', 'A03'), ('A', 'A04'), ('A', 'A05'), ('A', 'A06'), ('A', 'A07'), ('A', 'A08'), ('A', 'A09'), ('A', 'A10'), ('A', 'A11'), ('A', 'A12'), ('A', 'A13'), ('A', 'A14'), ('A', 'A15'), ('A', 'A16'), ('A', 'A17'), ('A', 'A18'), ('A', 'A19'), ('A', 'A20'), ('A', 'A21'), ('B', 'B01'), ('B', 'B02'), ('B', 'B03'), ('B', 'B04'), ('B', 'B05'), ('C', 'C01'), ('C', 'C04'), ('C', 'C05'), ('C', 'C06'), ('C', 'C07'), ('C', 'C08'), ('C', 'C09'), ('C', 'C10'), ('C', 'C11'), ('C', 'C12'), ('C', 'C13'), ('C', 'C14'), ('C', 'C15'), ('C', 'C16'), ('C', 'C17'), ('C', 'C18'), ('C', 'C19'), ('C', 'C20'), ('C', 'C21'), ('C', 'C22'), ('C', 'C23'), ('C', 'C24'), ('C', 'C25'), ('C', 'C26'), ('D', 'D01'), ('D', 'D02'), ('D', 'D03'), ('D', 'D04'), ('D', 'D05'), ('D', 'D06'), ('D', 'D08'), ('D', 'D09'), ('D', 'D10'), ('D', 'D12'), ('D', 'D13'), ('D', 'D20'), ('D', 'D23'), ('D', 'D25'), ('D', 'D26'), ('D', 'D27'), ('E', 'E01'), ('E', 'E02'), ('E', 'E03'), ('E', 'E04'), ('E', 'E05'), ('E',

In [79]:
#add the true headnode
mesh_networkx.add_node('root',mesh_label='root')

In [80]:
#connect the category headnoes to the true headnode
temp_edges=[('root',element) for element in headnodes_list]
mesh_networkx.add_edges_from(temp_edges)


In [83]:
set_that_encapulates_node_id={
    'mesh_label'
}

In [85]:
def flatten(xs):
    '''
    given a list of elements (can contain arbitrarily nested lists)
    creates a generator? of flattned elements
    warning: strings will become lists of char
    '''
    for x in xs:
        if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
            yield from flatten(x)
        else:
            yield x

In [95]:
def create_one_values_to_node_id_dict(temp_nx,temp_node,relevant_node_set):
    '''
    This takes a single node and returns a dict
    where the keys (probably many) are the nested values of the node
    and the value for each key is the node ID
    '''
    one_node_id_dict=dict()
    #we make scientific name the endpoint so that it works like the mesh hierarchies
    #scientific_name=temp_nx.nodes[temp_node][temp_attribute]
    for temp_attribute in relevant_node_set:
        #print(temp_attribute)
        if temp_attribute not in temp_nx.nodes[temp_node].keys():
            continue
        elif isinstance(temp_nx.nodes[temp_node][temp_attribute],str):
            #print(total_ncbi_networkx.nodes[temp_node][temp_attribute])
            one_node_id_dict[temp_nx.nodes[temp_node][temp_attribute]]=temp_node
        else:
            #print(set(flatten(total_ncbi_networkx.nodes[temp_node][temp_attribute])))
            temp_dict={
                element:temp_node for element in set(flatten(temp_nx.nodes[temp_node][temp_attribute]))
            }
            one_node_id_dict.update(temp_dict)
    return one_node_id_dict

In [96]:
def create_all_attribute_to_node_id_dict(temp_nx,relevant_node_set):
    '''
    takes an entire networkx and features that help to differentiate nodes
    returns a dict of {attribute:node_id}
    '''
    total_feature_node_id_dict=dict()
    for i,temp_node in enumerate(temp_nx.nodes):
        total_feature_node_id_dict.update(
            create_one_values_to_node_id_dict(temp_nx,temp_node,relevant_node_set)
        )
        if i >100:
            break
    print(total_feature_node_id_dict)

In [97]:
create_all_attribute_to_node_id_dict(
    mesh_networkx,
    set_that_encapulates_node_id
)

{'Heterocyclic Compounds': 'D03', 'Heterocyclic Compounds, Fused-Ring': 'D03.633', 'Heterocyclic Compounds, 2-Ring': 'D03.633.100', 'Benzoxazoles': 'D03.633.100.221', 'Calcimycin': 'D03.633.100.221.173', 'Organic Chemicals': 'D02', 'Organophosphorus Compounds': 'D02.705', 'Organophosphates': 'D02.705.400', 'Organothiophosphates': 'D02.886.300.692', 'Temefos': 'D02.886.300.692.800', 'Organothiophosphorus Compounds': 'D02.886.300', 'Sulfur Compounds': 'D02.886', 'Technology, Industry, and Agriculture': 'J01', 'Industry': 'J01.576', 'Food Industry': 'J01.576.423', 'Food Handling': 'J01.576.423.200', 'Meat-Packing Industry': 'J01.576.423.200.700', 'Abattoirs': 'J03.540.020', 'Non-Medical Public and Private Facilities': 'J03', 'Manufacturing and Industrial Facilities': 'J03.540', 'Information Science': 'L01', 'Language': 'L01.559', 'Linguistics': 'L01.559.598', 'Terminology as Topic': 'L01.559.598.400', 'Names': 'L01.559.598.400.556', 'Abbreviations as Topic': 'L01.559.598.400.556.131', 'Bo

In [81]:
# irrelevant_categories=['B']
# for element in irrelevant_categories:
#     mesh_networkx.remove_nodes_from(nx.algorithms.dag.descendants(mesh_networkx,element))
#     mesh_networkx.remove_nodes_from(element)

In [82]:
# number_string_list=['01','02','03','04','05','06','07','08','09']+[str(i) for i in range(10,51)]
# total_organ_headnodes_list=[]

# for i in ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']:
#     for j in number_string_list:
#         total_organ_headnodes_list.append(i+j)
# organ_headnodes_list=[i for i in total_organ_headnodes_list if (i in organ_networkx.nodes)]