In [33]:
import pandas as pd
import warnings



def tree_recurse(tree, out, parent_node_id = ''):
    """Flatten an Allen taxonomy JSON (sub)tree into a node list and an edge set.

    Nodes are shallow copies of the attribute dicts found in the JSON, so the
    input tree is never modified. Edges are duples ``(child, parent)`` whose
    members are 'cell_set_accession' values.

    Args:
        tree: Allen taxonomy in JSON form (nested dicts), or some subtree of it.
            A non-leaf is expected to carry 'node_attributes' and 'children';
            a leaf is expected to carry 'leaf_attributes'.
        out: dict populated in place. Pass an empty dict on the first call;
            the keys 'nodes' (list) and 'edges' (set) are created if missing.
        parent_node_id: 'cell_set_accession' of the parent. The default (empty
            string) means "no parent", i.e. `tree` is the root.

    Returns:
        None. Results are accumulated in `out`.

    Warns:
        UserWarning (via ``warnings.warn``) if an attribute list has more than
        one entry (only the first is used), if a non-leaf has no 'children',
        if a leaf has 'children', or if neither attribute key is present.
    """
    # setdefault (rather than `if not out`) also copes with a partially
    # populated `out`, and leaves an already initialised one untouched.
    out.setdefault('nodes', [])
    out.setdefault('edges', set())

    if 'node_attributes' in tree:
        attributes = tree['node_attributes']
        if len(attributes) > 1:
            warnings.warn("Don't know how to deal with multiple nodes per recurse")
        # Copy so that `out` does not alias the dicts inside the input tree.
        node = dict(attributes[0])
        ID = node['cell_set_accession']

        out['nodes'].append(node)
        if parent_node_id:
            out['edges'].add((ID, parent_node_id))
        if 'children' in tree:
            for c in tree['children']:
                tree_recurse(c, out, parent_node_id = ID)
        else:
            warnings.warn("non leaf node %s has no children" % ID)
    elif 'leaf_attributes' in tree:
        attributes = tree['leaf_attributes']
        if len(attributes) > 1:
            warnings.warn("Don't know how to deal with multiple nodes per recurse")
        # Tag leaves on the copy only; the input JSON stays as it was loaded.
        node = dict(attributes[0])
        node['is_leaf'] = True
        ID = node['cell_set_accession']

        out['nodes'].append(node)
        # Same guard as the non-leaf branch: a parentless leaf (the tree is a
        # single leaf) must not produce an edge pointing at the empty string.
        if parent_node_id:
            out['edges'].add((ID, parent_node_id))
        if 'children' in tree:
            warnings.warn('leaf node %s has children!' % ID)
    else:
        warnings.warn("No recognized nodes")

In [34]:
# Load the nomenclature table for taxonomy CCN202002013 (comma-separated file).
CCN202002013_tab = pd.read_csv(
    '../src/dendrograms/nomenclature_table_CCN202002013.csv',
    sep=',',
)

In [35]:
# Display the column labels of the nomenclature table.
CCN202002013_tab.columns

Index(['cell_set_preferred_alias', 'original_label', 'cell_set_label',
       'cell_set_accession', 'cell_set_aligned_alias',
       'cell_set_additional_alias', 'cell_set_alias_assignee',
       'cell_set_alias_citation', 'cell_set_structure',
       'cell_set_ontology_tag', 'taxonomy_id', 'species', 'modality',
       'child_cell_set_accessions'],
      dtype='object')

In [36]:
import json

# Parse the CCN202002013 dendrogram. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open in the kernel.
with open('../src/dendrograms/CCN202002013.json', 'r') as f:
    CCN202002013_dend = json.load(f)

In [37]:
# Flatten the dendrogram: tree_recurse fills the (initially empty) dict in
# place with a 'nodes' list and an 'edges' set.
CCN202002013_nodes_edges = dict()
tree_recurse(tree=CCN202002013_dend, out=CCN202002013_nodes_edges)

In [38]:
# Accessions listed in the nomenclature table.
tab_nodes = CCN202002013_tab['cell_set_accession']
# Accessions of every node collected from the dendrogram.
dendrogram_nodes = [
    node_attrs['cell_set_accession']
    for node_attrs in CCN202002013_nodes_edges['nodes']
]
# Table accessions that have no counterpart in the dendrogram.
nodes_not_in_dend = set(tab_nodes).difference(dendrogram_nodes)

In [39]:
# Display the accessions found in the nomenclature table but not in the dendrogram.
nodes_not_in_dend

{'CS202002013_232',
 'CS202002013_233',
 'CS202002013_234',
 'CS202002013_235',
 'CS202002013_236',
 'CS202002013_237',
 'CS202002013_238',
 'CS202002013_239',
 'CS202002013_240',
 'CS202002013_241',
 'CS202002013_242',
 'CS202002013_243',
 'CS202002013_244',
 'CS202002013_245',
 'CS202002013_246',
 'CS202002013_247',
 'CS202002013_248',
 'CS202002013_249',
 'CS202002013_250',
 'CS202002013_251',
 'CS202002013_252',
 'CS202002013_253',
 'CS202002013_254',
 'CS202002013_255',
 'CS202002013_256',
 'CS202002013_257',
 'CS202002013_258',
 'CS202002013_259',
 'CS202002013_260',
 'CS202002013_261'}

TypeError: 'Series' objects are mutable, thus they cannot be hashed

In [40]:
# Index the table by accession so rows can be selected with .loc.
# Guarded and non-in-place: re-running this cell is a no-op rather than a
# KeyError, because the column is gone once it has become the index.
if 'cell_set_accession' in CCN202002013_tab.columns:
    CCN202002013_tab = CCN202002013_tab.set_index('cell_set_accession')

In [47]:
# Preferred aliases of the accessions missing from the dendrogram, ordered by
# accession. A sorted list is used because pandas does not accept a set as a
# .loc indexer (deprecated in 1.5, TypeError in 2.x); selecting rows and the
# column in one .loc call also avoids chained indexing.
CCN202002013_tab.loc[sorted(nodes_not_in_dend), 'cell_set_preferred_alias']


cell_set_accession
CS202002013_232                        Sst
CS202002013_233                      Pvalb
CS202002013_234          Non-IT projecting
CS202002013_235               Non-Neuronal
CS202002013_236                  Oligo-OPC
CS202002013_237                        NaN
CS202002013_238                   Other NN
CS202002013_239                    Lamp5_2
CS202002013_240                     Sncg_3
CS202002013_241                     Sncg_1
CS202002013_242                      Vip_3
CS202002013_243                      Vip_2
CS202002013_244                      Vip_1
CS202002013_245                      Sst_2
CS202002013_246                      Sst_3
CS202002013_247                      Sst_6
CS202002013_248                    Pvalb_1
CS202002013_249                    Pvalb_2
CS202002013_250                    L5 IT_1
CS202002013_251                    L6 IT_2
CS202002013_252                    L5 ET_1
CS202002013_253                    L6 CT_1
CS202002013_254                    