In [76]:
import json
f = open('dend.json', 'r')
j = json.loads(f.read())
j.keys()



dict_keys(['node_attributes', 'children'])

In [77]:
# Bifurcation of tree at root

len(j['children'])

2

In [3]:
# Inspect  root node:

j['node_attributes']

[{'members': 75,
  'midpoint': 57.5288,
  'height': 0.9146,
  'label': '',
  'cell_set_accession': 'CS201908210_76',
  'original_label': 'n1',
  'cell_set_label': 'All cells',
  'cell_set_preferred_alias': '',
  'cell_set_aligned_alias': '',
  'cell_set_additional_aliases': '',
  'cell_set_structure': 'neocortex',
  'cell_set_alias_assignee': 'Trygve Bakken||',
  'cell_set_alias_citation': '10.1038/s41586-019-1506-7||',
  'cell_set_ontology_tag': 'UBERON:0001950',
  'taxonomy_id': 'CCN201908210',
  '_row': 'n1'}]

In [73]:
import warnings

out = {}

def tree_recurse(tree, out, parent_node_id = ''):
    """Convert Allen Taxonomy JSON to a list of nodes and edges, where nodes are
    Copies of nodes in Allen JSON & edges are duples - (subject(child), object(parent))
    identified by 'cell_set_accession'.
    
    Args:
        - Tree: Allen taxonomy in JSON, or some subtree of it
        - Output structure to populate (starting point must be an empty dict)
        - parent_node_id: 'cell_set_accession' of parent.  Default (no parent) = root.
        
    """
    if not out:
        out['nodes']= []
        out['edges'] = set()
    if 'node_attributes' in tree.keys():
        if len(tree['node_attributes']) > 1:
            warnings.warn("Don't know how to deal with multiple nodes per recurse")
        ID = tree['node_attributes'][0]['cell_set_accession']
            
        out['nodes'].append(tree['node_attributes'][0])
        if parent_node_id:
            out['edges'].add((ID, parent_node_id))
        if 'children' in tree.keys():
            for c in tree['children']:
                tree_recurse(c, out, parent_node_id = ID)
        else:
            warnings.warn("non leaf node %s has no children" % ID)
    elif 'leaf_attributes' in tree.keys():
        if len(tree['leaf_attributes']) > 1:
            warnings.warn("Don't know how to deal with multiple nodes per recurse")       
        ID = tree['leaf_attributes'][0]['cell_set_accession']
        tree['leaf_attributes'][0]['is_leaf'] = True
        out['nodes'].append(tree['leaf_attributes'][0])
        out['edges'].add((ID, parent_node_id))
        if 'children' in tree.keys():
            warnings.warn('leaf node %s has children!' % ID)
    else:
        warnings.warn("No recognized nodes")
        
tree_recurse(j, out)

In [74]:
len(out['nodes'])

149

In [39]:
## Check for unique attribute values

def unique_attribute_test(nodes):
    # Assuming root node has all keys
    for attr in nodes[0].keys():
        a = {n[attr] for n in out['nodes'] if attr in n.keys()}
        if len(nodes) == len(a):
            print("%s, e.g. '%s' is a unique attribute" % (attr, nodes[1][attr]))
            
unique_attribute_test(out['nodes'])

cell_set_accession, e.g. 'CS201908210_77' is a unique attribute
original_label, e.g. 'n2' is a unique attribute
cell_set_label, e.g. 'MTG 001-069' is a unique attribute
_row, e.g. 'n2' is a unique attribute


In [87]:
lookup = { n['cell_set_accession']: n['cell_set_preferred_alias'] for n in out['nodes']}

# Quick look at parents
for k,v in lookup.items():
    parent = [e[1] for e in out['edges'] if e[0] == k]
    if parent and lookup[parent[0]] and v:
        print((v, lookup[parent[0]]))
    

('Inhibitory', 'Neuronal')
('ADARB2 (CGE)', 'Inhibitory')
('LAMP5/PAX6', 'ADARB2 (CGE)')
('VIP', 'ADARB2 (CGE)')
('LHX6 (MGE)', 'Inhibitory')
('Inh L5-6 SST TH', 'SST')
('Inh L5-6 LHX6 GLP1R', 'PVALB')
('Inh L2-5 PVALB SCUBE3', 'LHX6 (MGE)')
('Excitatory', 'Neuronal')
('Exc L2-3 LINC00507 FREM3', 'L2/3')
('L4', 'RORB')
('Exc L3-4 RORB CARM1P1', 'L4')
('Exc L5-6 THEMIS DCSTAMP', 'L6')
('FEZF2', 'Excitatory')
('Exc L4-6 FEZF2 IL26', 'FEZF2')
('L6', 'FEZF2')
('Exc L5-6 FEZF2 ABO', 'L6')
('Micro L1-6 TYROBP', 'Non-neuronal')


In [None]:
# Lets go with cell_set_label as rdfs_label for now.  It may be worth inventing some APs for the others...

# Alternative: 
   # Everything with a cell_set_preferred_alias gets the name "$cell_set_preferred_alias - $taxonomy_id. 
   # We make also classes for these
   # The rest we name "$cell_set_label - $taxonomy_id"
   # other identifiers go in synonyms (at least for now)

In [56]:
robot_template_seed = { 'ID': 'ID', 
                       'Label': 'LABEL',
                       'PrefLabel': 'skos:prefLabel',
                       'Entity Type': 'TYPE', 
                       'Property Assertions': 'I subcluster_of SPLIT=|',
                       'Synonyms': 'A oio:has_exact_synonym',
                      }



In [33]:
import pandas as pd

robot_template = pd.DataFrame.from_records([robot_template_seed])
robot_template

Unnamed: 0,ID,Label,Entity Type,Property Assertions,Synonyms
0,ID,A rdfs:label,cluster,I subcluster_of,A oio:has_exact_synonym


In [67]:
dl = [robot_template_seed]

synonym_properties = ['original_label', 
                      'cell_set_aligned_alias',
                      'cell_set_additional_aliases']
for o in out['nodes']:
    d = {}
    d['ID'] = o['cell_set_accession']
    d['Label'] = o['cell_set_label']
    d['PrefLabel'] = o['cell_set_preferred_alias']
    d['Entity Type'] = 'Cluster'
    d['Synonyms'] = '|'.join([o[prop] for prop in synonym_properties if o[prop]])
    d['Property Assertions'] = '|'.join([e[1] for e in out['edges'] if e[0] == d['ID']]) # There should only be one!
    dl.append(d)

robot_template = pd.DataFrame.from_records(dl)
print(robot_template[['ID','PrefLabel']][0:50])
print(robot_template[['ID','PrefLabel']][51:100])
print(robot_template[['ID','PrefLabel']][100:-1])

    
    

                 ID                PrefLabel
0                ID           skos:prefLabel
1    CS201908210_76                         
2    CS201908210_77                 Neuronal
3    CS201908210_78               Inhibitory
4    CS201908210_79             ADARB2 (CGE)
5    CS201908210_80               LAMP5/PAX6
6    CS201908210_81                         
7     CS201908210_1      Inh L1-2 PAX6 CDH12
8     CS201908210_2  Inh L1-2 PAX6 TNFAIP8L3
9    CS201908210_82                         
10    CS201908210_3        Inh L1 LAMP5 NMBR
11   CS201908210_83                         
12   CS201908210_84                         
13    CS201908210_4      Inh L1-4 LAMP5 LCP2
14    CS201908210_5       Inh L1-2 LAMP5 DBP
15    CS201908210_6       Inh L2-6 LAMP5 CA1
16   CS201908210_85                      VIP
17   CS201908210_86                         
18   CS201908210_87                         
19   CS201908210_88                         
20   CS201908210_89                         
21    CS20

In [75]:
robot_template.to_csv("CCN201908210.tsv", sep="\t")

In [None]:
# Constuct helper files

## helper ontology  - just needs declarations and labels

# Class: cell cluster (we have one in VFB but should be generalised - maybe cell set?)
# OP subcluster_of
# capable_of

## JSON LD Curie map - see default here: https://github.com/ontodev/robot/blob/master/robot-core/src/main/resources/obo_context.jsonld

