### Background

* AIM: Convert dend.json to:
   * A set of OWL individuals linked by subClusterOf relationships.
   * Links from OWL Individuals to functions (and morphologies?)
   * A (more limited) set of OWL Classes whose classification is driven by subclustering & links from Individuals to functions/morphologies - see
   
* Test data in this notebook comes from Allen consensus clustering of MTG cells provided by Jeremy Miller.  Here is a visualisation of the dendrogram:

![image](https://user-images.githubusercontent.com/112839/95180282-abf16400-07b9-11eb-9b8d-9f84a7abf757.png)


In [None]:
# Presumably dendrogram label -> cell-type term; empty strings are placeholders
# that still need to be filled in (TODO confirm intended direction of the mapping).
# NOTE(review): the three '' keys collide, so only the last one ('Endothelial cell')
# survives in the resulting dict. Replace them with real labels before using this.
type_mappings = { 'Neuronal' : '',
                   '': 'Astrocyte',
                   '': 'Oligodendrocyte',
                   '': 'Endothelial cell'}

# Values are not yet assigned; the trailing comments record the intended targets.
function_mappings = { 'Inhibitory': '', # Mapped to GABA-ergic
                      'Excitatory': '', # Mapped to Glutamatergic,
                    }

In [76]:
import json
# Load the dendrogram JSON. The context manager guarantees the file handle is
# closed even if parsing fails (the previous bare open() was never closed).
with open('dend.json', 'r', encoding='utf-8') as f:
    j = json.load(f)
# Show the top-level keys of the parsed tree.
j.keys()



dict_keys(['node_attributes', 'children'])

In [77]:
# Bifurcation of tree at root: count the direct children of the top-level node.

len(j['children'])

2

In [3]:
# Inspect root node: its attributes are stored as a list under 'node_attributes'.

j['node_attributes']

[{'members': 75,
  'midpoint': 57.5288,
  'height': 0.9146,
  'label': '',
  'cell_set_accession': 'CS201908210_76',
  'original_label': 'n1',
  'cell_set_label': 'All cells',
  'cell_set_preferred_alias': '',
  'cell_set_aligned_alias': '',
  'cell_set_additional_aliases': '',
  'cell_set_structure': 'neocortex',
  'cell_set_alias_assignee': 'Trygve Bakken||',
  'cell_set_alias_citation': '10.1038/s41586-019-1506-7||',
  'cell_set_ontology_tag': 'UBERON:0001950',
  'taxonomy_id': 'CCN201908210',
  '_row': 'n1'}]

In [115]:
# A simple transformation to a list of nodes and edges for easier processing (Ultimately this should be JSON-LD)

import warnings

# Accumulator passed to tree_recurse; it must start out as an empty dict.
out = {}

def tree_recurse(tree, out, parent_node_id = ''):
    """Flatten Allen Taxonomy JSON into a list of nodes and a set of edges.

    Nodes are shallow copies of the attribute dicts in the Allen JSON (the input
    tree is not modified). Edges are duples - (subject(child), object(parent)) -
    identified by 'cell_set_accession'. Leaf nodes are tagged with 'is_leaf': True.

    Args:
        - tree: Allen taxonomy in JSON, or some subtree of it
        - out: Output structure to populate (starting point must be an empty dict).
          It gains the keys 'nodes' (list of dicts) and 'edges' (set of duples).
        - parent_node_id: 'cell_set_accession' of parent.  Default (no parent) = root.

    Returns:
        None. Results are accumulated in `out`.

    Structural surprises (several attribute dicts on one node, an internal node
    without children, a leaf with children, or an unrecognised node) are reported
    through warnings.warn rather than raised.
    """
    # Create the containers on first use; later recursive calls reuse them.
    out.setdefault('nodes', [])
    out.setdefault('edges', set())

    if 'node_attributes' in tree:
        attributes = tree['node_attributes']
        if len(attributes) > 1:
            warnings.warn("Don't know how to deal with multiple nodes per recurse")
        node = dict(attributes[0])
        ID = node['cell_set_accession']

        out['nodes'].append(node)
        # The root has no parent, so it contributes no edge.
        if parent_node_id:
            out['edges'].add((ID, parent_node_id))
        if 'children' in tree:
            for c in tree['children']:
                tree_recurse(c, out, parent_node_id = ID)
        else:
            warnings.warn("non leaf node %s has no children" % ID)
    elif 'leaf_attributes' in tree:
        attributes = tree['leaf_attributes']
        if len(attributes) > 1:
            warnings.warn("Don't know how to deal with multiple nodes per recurse")
        # Tag leaves on the copy so the caller's tree is left untouched.
        node = dict(attributes[0])
        node['is_leaf'] = True
        ID = node['cell_set_accession']

        out['nodes'].append(node)
        # Skip the edge when there is no parent (a tree consisting of a single leaf),
        # otherwise an edge to the empty accession '' would be recorded.
        if parent_node_id:
            out['edges'].add((ID, parent_node_id))
        if 'children' in tree:
            warnings.warn('leaf node %s has children!' % ID)
    else:
        warnings.warn("No recognized nodes")
        
# Flatten the whole taxonomy, starting from the root (no parent id supplied).
tree_recurse(j, out)

In [116]:
# Number of node records collected (internal nodes and leaves together).
len(out['nodes'])

149

In [123]:
# Peek at ten (child, parent) edges; a set has no defined order, so the sample is arbitrary.
list(out['edges'])[0:10]

[('CS201908210_104', 'CS201908210_102'),
 ('CS201908210_89', 'CS201908210_88'),
 ('CS201908210_34', 'CS201908210_111'),
 ('CS201908210_111', 'CS201908210_110'),
 ('CS201908210_143', 'CS201908210_142'),
 ('CS201908210_78', 'CS201908210_77'),
 ('CS201908210_47', 'CS201908210_128'),
 ('CS201908210_108', 'CS201908210_107'),
 ('CS201908210_12', 'CS201908210_91'),
 ('CS201908210_136', 'CS201908210_135')]

In [126]:
 # Pipe-join the parent accession(s) recorded for one example child node.
 '|'.join([e[1] for e in out['edges'] if e[0] == 'CS201908210_104'])

'CS201908210_102'

In [155]:
# view paths to root (single inheritance assumed):

def ascend(node_id, edges, path):
    """Walk child -> parent edges upwards from node_id, recording the route in `path`.

    Args:
        node_id: identifier of the node to start from.
        edges: iterable of (child, parent) duples.
        path: list extended in place. If it is empty, node_id is added first;
            each ancestor found is then appended in order.

    Only the first matching parent is followed at each step (single
    inheritance assumed). Returns None.
    """
    parents = [edge[1] for edge in edges if edge[0] == node_id]
    if not path:
        # First call: seed the path with the starting node itself.
        path.append(node_id)
    if not parents:
        # No parent recorded, so this is the top of the chain.
        return
    next_up = parents[0]
    path.append(next_up)
    ascend(next_up, edges, path)
        
def id_conv(path, nodes):
    """Translate a path of accessions into readable labels where available.

    Args:
        path: list of 'cell_set_accession' values.
        nodes: list of node dicts carrying 'cell_set_accession' and
            'cell_set_preferred_alias'.

    Returns:
        A new list in which each accession is replaced by the non-empty
        preferred alias(es) of the matching node(s), or kept unchanged when
        no matching node has one.
    """
    labels = []
    for accession in path:
        aliases = []
        for node in nodes:
            if node['cell_set_accession'] != accession:
                continue
            alias = node['cell_set_preferred_alias']
            if alias:
                aliases.append(alias)
        # Fall back to the raw accession when no alias was found.
        labels += aliases if aliases else [accession]
    return labels

# For every node tagged as a leaf, print its chain of ancestors up to the root,
# using preferred aliases where they exist.
for leaf in (node for node in out['nodes'] if node.get('is_leaf')):
    path_to_root = []
    ascend(node_id=leaf['cell_set_accession'], edges=out['edges'], path=path_to_root)
    print(id_conv(path_to_root, out['nodes']))
        



['Inh L1-2 PAX6 CDH12', 'CS201908210_81', 'LAMP5/PAX6', 'ADARB2 (CGE)', 'Inhibitory', 'Neuronal', 'CS201908210_76']
['Inh L1-2 PAX6 TNFAIP8L3', 'CS201908210_81', 'LAMP5/PAX6', 'ADARB2 (CGE)', 'Inhibitory', 'Neuronal', 'CS201908210_76']
['Inh L1 LAMP5 NMBR', 'CS201908210_82', 'LAMP5/PAX6', 'ADARB2 (CGE)', 'Inhibitory', 'Neuronal', 'CS201908210_76']
['Inh L1-4 LAMP5 LCP2', 'CS201908210_84', 'CS201908210_83', 'CS201908210_82', 'LAMP5/PAX6', 'ADARB2 (CGE)', 'Inhibitory', 'Neuronal', 'CS201908210_76']
['Inh L1-2 LAMP5 DBP', 'CS201908210_84', 'CS201908210_83', 'CS201908210_82', 'LAMP5/PAX6', 'ADARB2 (CGE)', 'Inhibitory', 'Neuronal', 'CS201908210_76']
['Inh L2-6 LAMP5 CA1', 'CS201908210_83', 'CS201908210_82', 'LAMP5/PAX6', 'ADARB2 (CGE)', 'Inhibitory', 'Neuronal', 'CS201908210_76']
['Inh L1 SST CHRNA4', 'CS201908210_89', 'CS201908210_88', 'CS201908210_87', 'CS201908210_86', 'VIP', 'ADARB2 (CGE)', 'Inhibitory', 'Neuronal', 'CS201908210_76']
['Inh L1-2 ADARB2 MC4R', 'CS201908210_89', 'CS2019082

In [39]:
## Check for unique attribute values

def unique_attribute_test(nodes):
    """Print each attribute whose value is different for every node in `nodes`.

    Candidate attributes are the keys of the first node (assuming root node has
    all keys). An attribute is reported only when every node carries it and no
    two nodes share a value.

    Args:
        nodes: list of node attribute dicts. An empty list prints nothing.

    Returns:
        None; one line is printed per unique attribute, with an example value.
    """
    if not nodes:
        return
    # The example value comes from the second node when there is one, so a
    # one-element list no longer raises IndexError.
    example_node = nodes[1] if len(nodes) > 1 else nodes[0]
    for attr in nodes[0].keys():
        # Inspect the `nodes` argument itself rather than the notebook-level `out`.
        distinct_values = {n[attr] for n in nodes if attr in n}
        # Equal sizes imply that every node has the attribute and all values differ.
        if len(distinct_values) == len(nodes):
            print("%s, e.g. '%s' is a unique attribute" % (attr, example_node[attr]))
            
# Report which attributes could serve as unique identifiers for the collected nodes.
unique_attribute_test(out['nodes'])

cell_set_accession, e.g. 'CS201908210_77' is a unique attribute
original_label, e.g. 'n2' is a unique attribute
cell_set_label, e.g. 'MTG 001-069' is a unique attribute
_row, e.g. 'n2' is a unique attribute


In [None]:
# Let's go with cell_set_label as rdfs_label for now.  It may be worth inventing some APs for the others...

# Alternative: 
   # Everything with a cell_set_preferred_alias gets the name "$cell_set_preferred_alias - $taxonomy_id".
   # We also make classes for these.
   # The rest we name "$cell_set_label - $taxonomy_id"
   # other identifiers go in synonyms (at least for now)

In [130]:
# Header row for the ROBOT template: column name -> ROBOT template string.
# The split separator is written unquoted (SPLIT=|). The previous nested single
# quotes ended the string literal early and made Python evaluate `str | str`,
# which raises TypeError.
robot_template_seed = { 'ID': 'ID', 
                       'Label': 'LABEL',
                       'PrefLabel': 'A skos:prefLabel',
                       'Entity Type': 'TI %',
                       'TYPE': 'TYPE',
                       'Property Assertions': 'I BDSHELP:subcluster_of SPLIT=|',
                       'Synonyms': 'A oboInOwl:has_exact_synonym',
                      }



In [112]:
import pandas as pd

# Preview: a one-row DataFrame built from the seed dict alone.
robot_template = pd.DataFrame.from_records([robot_template_seed])
robot_template

Unnamed: 0,ID,Label,PrefLabel,Entity Type,TYPE,Property Assertions,Synonyms
0,ID,LABEL,A skos:prefLabel,TI %,TYPE,I BDSHELP:subcluster_of,A oio:has_exact_synonym


In [131]:
# The seed (template header) is the first record; one record per node follows.
dl = [robot_template_seed]

# Node attributes whose non-empty values are emitted as pipe-separated synonyms.
synonym_properties = ['original_label', 
                      'cell_set_aligned_alias',
                      'cell_set_additional_aliases']

# Index parents by child accession once, instead of rescanning every edge for
# every node (which was O(nodes x edges)).
parents_by_child = {}
for child, parent in out['edges']:
    parents_by_child.setdefault(child, []).append('AllenDend:' + parent)

for node in out['nodes']:
    accession = node['cell_set_accession']
    dl.append({
        'ID': 'AllenDend:' + accession,
        'TYPE': 'owl:NamedIndividual',
        'Label': node['cell_set_label'],
        'PrefLabel': node['cell_set_preferred_alias'],
        'Entity Type': 'BDSHELP:Cluster',
        # .get() tolerates nodes that lack one of the optional synonym attributes.
        'Synonyms': '|'.join(node[prop] for prop in synonym_properties if node.get(prop)),
        # There should only be one parent per node; the root has none (empty string).
        'Property Assertions': '|'.join(parents_by_child.get(accession, [])),
    })

robot_template = pd.DataFrame.from_records(dl)

robot_template[['ID','PrefLabel']]
    

Unnamed: 0,ID,PrefLabel
0,ID,A skos:prefLabel
1,AllenDend:CS201908210_76,
2,AllenDend:CS201908210_77,Neuronal
3,AllenDend:CS201908210_78,Inhibitory
4,AllenDend:CS201908210_79,ADARB2 (CGE)
...,...,...
145,AllenDend:CS201908210_71,Astro L1-6 FGFR3 SLC14A1
146,AllenDend:CS201908210_72,Astro L1-2 FGFR3 GFAP
147,AllenDend:CS201908210_73,Oligo L1-6 OPALIN
148,AllenDend:CS201908210_74,Endo L2-6 NOSTRIN


In [132]:
# Write the template as tab-separated text, omitting the DataFrame index column.
robot_template.to_csv("CCN201908210.tsv", sep="\t", index=False)

 - [x] Construct helper files
 - [x] Class: cell cluster (we have one in VFB but should be generalised - maybe cell set?)
 - [x] OP subcluster_of
 - [x] use add-prefix to add curies


In [None]:
# Header row for the class-level ROBOT template: column name -> template string.
# Commas were missing after three entries (a SyntaxError).
# NOTE(review): 'located in' and 'has soma location' previously repeated the
# 'part of' relation, which looks like a copy-paste slip; each column now uses
# the relation it is named after - confirm these are the intended properties.
# The split separator is written unquoted (SPLIT=|).
robot_class_template_seed = {
                       'ID': 'ID', 
                       'Label': 'LABEL',
                       'PrefLabel': 'A skos:prefLabel',
                       'Classification': 'SC %',
                       'Synonyms': 'A oboInOwl:has_exact_synonym',
                       'part of': "SC 'part of' some %",
                       'located in': "SC 'located in' some %",
                       'has soma location': "SC 'has soma location' some %",
                       'functions': "SC 'capable of' some % SPLIT=|",
                      }
