In [113]:
import pandas as pd

# Header row for the ROBOT-style template tables: each key is an output column
# name and each value is the template string placed directly under that column
# name.  Every column that later rows may populate must be declared here,
# otherwise it ends up in the output with an empty template string.
#
# Fix: the original literal listed 'endothelial_region1' twice, so the second
# entry silently overwrote the first and 'endothelial_region2' (which
# proc_po() fills in) was never declared.
table_seed = [{
    'ID': 'ID',
    'CLASS_TYPE': 'CLASS_TYPE',
    'label': 'A rdfs:label',
    'Genus': 'C %',
    'neuron_region1': "C 'has soma location' some %",
    'neuron_region2': "C 'has soma location' some %",
    'glia_region1': "C 'part of' some %",
    'glia_region2': "C 'part of' some %",
    'endothelial_region1': "C 'located in' some %",
    'endothelial_region2': "C 'located in' some %",
    'Marker1': "C expresses some %",
    'Marker2': "C expresses some %",
    'Marker3': "C expresses some %",
    'Marker4': "C expresses some %",
}]  # add in some part of tags

# Header-only frames for the equivalent-class and subclass tables (same
# columns), and a narrower header for the marker-gene classes.
eq_tab = pd.DataFrame.from_records(table_seed)
sub_tab = pd.DataFrame.from_records(table_seed)
marker_tab = pd.DataFrame.from_records([{
    'ID': 'ID',
    'CLASS_TYPE': 'CLASS_TYPE',
    'label': 'A rdfs:label',
    'Genus': 'C %',
}])

In [114]:
import numpy as np
# Read the tab-separated MTG cluster table and replace missing cells with ''
# so that later string handling (e.g. .strip()) does not run into NaN.
mtg = pd.read_csv("./MTG.tsv", sep='\t').replace(np.nan, '', regex=True)
mtg.columns


Index(['pCL_id (or CL_id)', 'pCL_name (or CL_name)',
       'transcriptome data cluster', 'TDC_id', 'part_of (uberon_id)',
       'Species_source', 'Species_ID', 'part_of (uberon_name)',
       'has_predicted_soma_location_in', 'is_a (CL or pCL_id)',
       'is_a (CL or pCL_name)', 'cluster_size (number of cells)',
       'marker_gene_evidence', 'f-measure_evidence', 'selectively_expresses.1',
       'selectively_expresses.2', 'selectively_expresses.3',
       'selectively_expresses.4', 'neuron_type'],
      dtype='object')

In [5]:
# Peek at the first nine parent-class names.
mtg['is_a (CL or pCL_name)'].head(9)


0    FBXL7-expressing human cerebral cortex MTG GAB...
1    FBXL7-expressing human cerebral cortex MTG GAB...
2    FBXL7-expressing human cerebral cortex MTG GAB...
3    FBXL7-expressing human cerebral cortex MTG GAB...
4    FBXL7-expressing human cerebral cortex MTG GAB...
5    FBXL7-expressing human cerebral cortex MTG GAB...
6    FBXL7-expressing human cerebral cortex MTG GAB...
7    FBXL7-expressing human cerebral cortex MTG GAB...
8    FBXL7-expressing human cerebral cortex MTG GAB...
Name: is_a (CL or pCL_name), dtype: object

### Strategy

#### is_a 
If 'is_a (CL or pCL_id)' contains a CL ID, use that.
Otherwise (equivalent-class rows only) use the 'neuron_type' column:
GABAergic = CL:0010011
Glutamatergic = CL:0000679
(Don't bother with denormalising NT.)

#### tissue 
Use 'part_of (uberon_id)' + a generic cortex term.
Use 'has soma location' -> middle temporal gyrus for all neurons.
Use 'part of' for all glial cells.
Use 'located in' for the one endothelial cell.

#### Expression
Values in the selectively_expresses columns look like:
TGFBR2|HGNC_11773
Parse out the ID and name -> new class.  Note: in some cases there is no HGNC ID.


In [115]:
# Scratch check: does the first row's is_a value start with a CL identifier?
import re  # imported here because this cell runs before the cell that imports `re`

row1 = mtg.iloc[0]['is_a (CL or pCL_id)']
# Raw string so that `\d` reaches the regex engine instead of being treated as
# an (invalid) string escape; reuse row1 rather than repeating the lookup.
fu = re.match(r'CL_(\d+)', row1)

# NOTE(review): `df` is not defined anywhere in the visible notebook, so this
# line only works with leftover kernel state — confirm which frame was meant.
df.columns


Index(['CLASS_TYPE', 'Genus', 'ID', 'Marker1', 'Marker2', 'Marker3', 'Marker4',
       'endothelial_region_1', 'glia_region_1', 'glia_region_2', 'label',
       'neuron_region1', 'neuron_region2'],
      dtype='object')

In [142]:
import re
from numpy import nan

def preproc(x):
    """Strip leading/trailing whitespace; turn a missing value into ''.

    Parameters
    ----------
    x : str, float NaN or None
        A raw cell value.

    Returns
    -------
    str
        '' when ``x`` is None or NaN, otherwise ``x.strip()``.
    """
    # NaN never compares equal to anything (not even itself), so the original
    # `x == nan` test could never be true and NaN went on to `.strip()`, which
    # raised AttributeError.  `x != x` is true only for NaN.
    if x is None or (isinstance(x, float) and x != x):
        return ''
    return x.strip()

def proc_isa(r, typ):
    """Return the parent-class (Genus) identifier for one MTG row.

    Parameters
    ----------
    r : mapping (e.g. a pandas row)
        Must provide 'is_a (CL or pCL_id)'; 'neuron_type' is read only when
        ``typ == 'e'`` and the is_a value is not a CL identifier.
    typ : str
        'e' when building an equivalent-class row; any other value (the
        notebook passes 's' for subclass rows) disables the neuron_type
        fallback.

    Returns
    -------
    str
        * 'CL:<digits>' when the is_a value starts with 'CL_<digits>';
        * for typ 'e', 'CL:0010011' when neuron_type is 'GABAergic' and
          'CL:0000679' when it is 'Glutamatergic';
        * otherwise the mtg_cluster IRI built from the is_a value.
    """
    isa_id = r['is_a (CL or pCL_id)'].strip()
    # Raw string: `\d` in a plain string literal is an invalid escape sequence.
    ia = re.match(r'CL_(\d+)', isa_id)
    if ia:
        return "CL:" + ia.group(1)
    ### only defaulting to general class for eq if neuron
    if typ == 'e':
        # Tolerate stray whitespace in the cell, as is done for the is_a value.
        neuron_type = str(r['neuron_type']).strip()
        if neuron_type == 'GABAergic':
            return "CL:0010011"
        if neuron_type == 'Glutamatergic':
            return "CL:0000679"
    return 'http://www.jcvi.org/cl_ext/mtg_cluster/' + isa_id
    
    
def proc_po(isa, r):
    """Pick the pair of region columns to fill for a given Genus.

    ``isa`` selects the column family: the two neuron Genus IDs map to
    'neuron_region*', 'CL:1001602' maps to 'endothelial_region*', and anything
    else is treated as glia.  Both columns always receive the same two UBERON
    IDs.  ``r`` is accepted for interface compatibility but is not used.

    Returns a dict with exactly two entries: '<family>1' and '<family>2'.
    """
    if isa in ('CL:0010011', 'CL:0000679'):
        family = 'neuron_region'
    elif isa == 'CL:1001602':
        family = 'endothelial_region'
    else:
        family = 'glia_region'
    return {
        family + '1': 'UBERON:0002771',
        family + '2': 'UBERON:0000956',  # Cortex
    }

def proc_marker(m):
    """Turn one 'selectively_expresses' cell into ``{'iri': ..., 'name': ...}``.

    Handled shapes:

    * 'SYMBOL|HGNC_<id>' -> HGNC IRI, name = SYMBOL
    * 'SYMBOL' (no '|')   -> genecards IRI for SYMBOL, name = ''
    * 'SYMBOL|<other>'    -> genecards IRI for SYMBOL, name = ''
    * '' / non-string     -> both fields ''

    The function always returns a dict.  The original returned None for the
    'SYMBOL|<other>' shape, which made callers fail on ``m['iri']``.
    Surrounding whitespace is trimmed so it cannot leak into an IRI.
    """
    blank = {'iri': '', 'name': ''}
    if not isinstance(m, str):
        return blank
    m = m.strip()
    if not m:
        return blank

    s = [part.strip() for part in m.split('|')]
    symbol = m
    # Assumes that if split by a '|', the second item will be an HGNC ID !
    if len(s) == 2:
        match = re.match(r"HGNC_(.+)", s[1])
        if match:
            return {'iri': "http://identifiers.org/hgnc/HGNC:" + match.group(1), 'name': s[0]}
        # Second part is not an HGNC ID: fall back to the bare symbol.
        symbol = s[0]
    if not symbol:
        return blank
    return {'iri': "http://identifiers.org/genecards/" + symbol, 'name': ''}
     
def proc_markers(r, n_markers=4):
    """Parse the marker-gene cells of one MTG row.

    Reads 'selectively_expresses.1' .. 'selectively_expresses.<n_markers>'
    from ``r`` and returns the proc_marker() result for each, in column order.

    The original read only the first three columns, although the input table
    has a 'selectively_expresses.4' column and the output template declares a
    'Marker4' column; the default now covers all four.

    Parameters
    ----------
    r : mapping (e.g. a pandas row)
        Must contain every requested 'selectively_expresses.<k>' key.
    n_markers : int, default 4
        Number of marker columns to read.

    Returns
    -------
    list
        One proc_marker() result per column.
    """
    columns = ['selectively_expresses.' + str(k) for k in range(1, n_markers + 1)]
    return [proc_marker(r[column]) for column in columns]
    

In [144]:
## Tests
# Smoke check: show the available columns, the first row's ID, and what
# proc_isa yields for that row in equivalent ('e') and subclass ('s') mode.
print(mtg.columns)
r = mtg.iloc[0]
print(r['pCL_id (or CL_id)'])
for mode in ('e', 's'):
    print(proc_isa(r, typ=mode))


Index(['pCL_id (or CL_id)', 'pCL_name (or CL_name)',
       'transcriptome data cluster', 'TDC_id', 'part_of (uberon_id)',
       'Species_source', 'Species_ID', 'part_of (uberon_name)',
       'has_predicted_soma_location_in', 'is_a (CL or pCL_id)',
       'is_a (CL or pCL_name)', 'cluster_size (number of cells)',
       'marker_gene_evidence', 'f-measure_evidence', 'selectively_expresses.1',
       'selectively_expresses.2', 'selectively_expresses.3',
       'selectively_expresses.4', 'neuron_type'],
      dtype='object')
pCL1
CL:0010011
http://www.jcvi.org/cl_ext/mtg_cluster/pCL78


In [147]:
# Rows are collected as plain dicts and converted in one go with
# pd.DataFrame.from_records(list).  The downside: a mistyped key silently
# becomes a brand-new column.  Better to switch to adding directly?

# Accumulators for equivalent-class, subclass and marker rows respectively.
el, sl, ml = [], [], []

def make_tsv(dl, filename, template=None):
    """Write a header template plus data rows as a tab-separated file.

    Parameters
    ----------
    dl : list of dict
        Data rows; keys are column names.
    filename : str
        Path of the TSV file to write.
    template : pandas.DataFrame, optional
        Header frame placed above the data rows.  Defaults to the notebook's
        ``eq_tab``, which is what the original always used.

    Missing cells are written as empty strings, exact duplicate rows are
    dropped, and the index is not written.
    """
    if template is None:
        template = eq_tab
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    robot_template = pd.concat([template, pd.DataFrame.from_records(dl)], sort=False)
    robot_template = robot_template.replace(np.nan, '', regex=True).drop_duplicates()
    robot_template.to_csv(filename, sep='\t', index=False)

# For every MTG row build an equivalent-class row and a subclass row (they
# differ only in CLASS_TYPE and Genus), and collect one marker-class row for
# every marker that has both an IRI and a name.
for _, r in mtg.iterrows():
    trow = {}  # columns shared by the equivalent and subclass rows
    erow = {}
    srow = {}
    erow['CLASS_TYPE'] = 'equivalent'
    srow['CLASS_TYPE'] = 'subclass'
    trow['ID'] = 'http://www.jcvi.org/cl_ext/mtg_cluster/' + r['pCL_id (or CL_id)']
    trow['label'] = r['pCL_name (or CL_name)']
    erow['Genus'] = proc_isa(r, typ='e')
    srow['Genus'] = proc_isa(r, typ='s')

    # Number the Marker<n> columns with a dedicated counter; the original
    # reused `i`, clobbering the iterrows() index.
    marker_no = 1
    for m in proc_markers(r):
        # Skip blank markers (and a None result, should the parser give up).
        if not m or not m['iri']:
            continue
        trow['Marker' + str(marker_no)] = m['iri']
        marker_no += 1
        if m['name']:
            # One row per named marker.  The original kept a single dict per
            # cluster and overwrote it, so only the last named marker of each
            # cluster was ever written out.
            ml.append({'ID': m['iri'], 'label': m['name']})

    # Region columns are chosen from the equivalent-class Genus.
    trow.update(proc_po(isa=erow['Genus'], r=r))
    erow.update(trow)
    srow.update(trow)
    el.append(erow)
    sl.append(srow)


make_tsv(el, 'mtg_equivalent.tsv')
make_tsv(sl, 'mtg_subclass.tsv')
make_tsv(ml, 'markers.tsv')

    


    

    
    

    

In [136]:
# Print the Genus cell of template row 73 wrapped in double quotes
# (presumably to make stray whitespace visible).
# NOTE(review): in the visible notebook `robot_template` is only assigned
# inside make_tsv(); confirm where this module-level name comes from.
genus_73 = robot_template.iloc[73]['Genus']
print('"' + genus_73 + '"')


"http://www.jcvi.org/cl_ext/mtg_cluster/pCL85"
