In [1]:
import pandas as pd
import re
import os

input_file= "./MTG_pCLv4.0_full_logical_def.xlsx"
dataset_name = "m1"

table_seed = [ {'ID': 'ID', 
                'CLASS_TYPE': 'CLASS_TYPE', 
                'label': 'A rdfs:label', 
                'Genus': 'C %', 
                'neuron_region1': "C 'has soma location' some %",
                'neuron_region2': "C 'has soma location' some %",
                'glia_region1': "C 'part of' some %",
                'glia_region2': "C 'part of' some %",
                'endothelial_region1': "C 'located in' some %",
                'endothelial_region1': "C 'located in' some %",               
                'Marker1': "C expresses some %",
                'Marker2': "C expresses some %",
                'Marker3': "C expresses some %",
                'Marker4': "C expresses some %",
                'Marker5': "C expresses some %",
                'Marker6': "C expresses some %",
                'Marker7': "C expresses some %",
                'Marker8': "C expresses some %"
               }]  # add in some part of tags

#eq_tab = pd.DataFrame.from_records(table_seed)
#sub_tab = pd.DataFrame.from_records(table_seed)
out_tab = pd.DataFrame.from_records(table_seed)
if os.path.isfile("./markers.tsv"):
    marker_tab = pd.read_csv("./markers.tsv", sep="\t")
else:
    marker_tab = pd.DataFrame.from_records([{'ID': 'ID',
                                         'CLASS_TYPE': 'CLASS_TYPE',
                                         'label': 'A rdfs:label', 
                                         'Genus': 'C %'}])

import numpy as np
#mtg = pd.read_excel("./nsf2_full_mtg_ver3_6_Nature_table.xlsx")
in_tab = pd.read_excel(input_file)
in_tab.replace(np.nan, '', regex=True, inplace=True)
in_tab.columns

Index(['pCL_id (or CL_id)', 'pCL_name (or CL_name)',
       'transcriptome data cluster', 'TDC_id', 'Species_source', 'Species_ID',
       'part_of (uberon_id)', 'part_of (uberon_name)', 'enriched_in',
       'has_soma_location_in', 'is_a (CL or pCL_id)', 'is_a (CL or pCL_name)',
       'cluster_size (number of cells)', 'marker_gene_evidence',
       'f-measure_evidence', 'selectively_expresses',
       'selectively_expresses.1', 'selectively_expresses.2',
       'selectively_expresses.3', 'selectively_expresses.4',
       'selectively_expresses.5', 'selectively_expresses.6',
       'selectively_expresses.7', 'selectively_expresses.8',
       'selectively_expresses.9', 'neuron_type'],
      dtype='object')

### Strategy

#### is_a 
If 'is_a (CL or pCL_name)' contains a CL ID use that.
Otherwise use 'neuron type' column:
GABAergic = CL_
Glutamatergic = CL_
(don't bother with denormalising NT)

#### tissue 
Use 'part_of (uberon_id)' + generic cortex term.
use has_soma_location -> middle temporal gyrus for all neurons
use part_of for all glial cells
use located in for the one endothelial cell

#### Expression
selectively_expresses
TGFBR2|HGNC_11773
parse out id & name -> new class.  Note - in some cases we will have no HGNC.


In [2]:
import re
from numpy import nan

def preproc(x):
    ### Strip leading and trailing spaces + convert nan to empty string 
    if x == nan:
        x = ''
    return(x.strip())

def proc_isa(r, typ):
    isa_id = r['is_a (CL or pCL_id)'].strip()
    ia = re.match('CL_(\d+)', isa_id)
    if ia:
        return "CL:" + ia.group(1)
    ### only defaulting to general class for eq if neuron
    elif (typ == 'e') and (r['neuron_type'] == 'GABAergic'):
        return "CL:0011005"
    elif (typ == 'e') and (r['neuron_type'] =='Glutamatergic'):
        return "CL:0000679"
    else:
        return 'http://www.jcvi.org/cl_ext/mtg_cluster/' + isa_id
    
    
def proc_po(isa, r):
    out = {}
    # if neuron
    if isa in ['CL:0011005', 'CL:0000679']:
        out['neuron_region1'] = re.sub('_', ':', r['part_of (uberon_id)'])
        out['neuron_region2'] = 'UBERON:0000956' # Cortex
    # elif endothelial        
    elif isa == 'CL:1001602':
        out['endothelial_region1'] = re.sub('_', ':', r['part_of (uberon_id)'])
        out['endothelial_region2'] = 'UBERON:0000956'
    else:
        out['glia_region1'] = re.sub('_', ':', r['part_of (uberon_id)'])
        out['glia_region2'] = 'UBERON:0000956'
    return out

def proc_marker(m):
    s = m.split('|')
    # Assumes that if split by a '|', the second item will be an HGNC ID !
    if len(s) == 2:
        match = re.match("HGNC_(.+)", s[1])
        if match:
            return { 'iri': "http://identifiers.org/hgnc/HGNC:" + match.group(1), 'name' : s[0] }
    elif m:
        return { 'iri': "http://identifiers.org/genecards/" + m, 'name' : '' }
    else:
        return { 'iri': '', 'name': '' }
     
def proc_markers(r):
    out = []
    marker_rows = [mr for mr in r.keys() if re.match('selectively_expresses.\d+', mr)]
    markers = [r[mr] for mr in marker_rows]
    for m in markers:
        out.append(proc_marker(m))
    return out
    

In [3]:
## Tests
print(in_tab.columns)
r = in_tab.iloc[0]
print(r['pCL_id (or CL_id)'])
print(proc_isa(r, typ='e'))
print(proc_isa(r, typ='s'))


Index(['pCL_id (or CL_id)', 'pCL_name (or CL_name)',
       'transcriptome data cluster', 'TDC_id', 'Species_source', 'Species_ID',
       'part_of (uberon_id)', 'part_of (uberon_name)', 'enriched_in',
       'has_soma_location_in', 'is_a (CL or pCL_id)', 'is_a (CL or pCL_name)',
       'cluster_size (number of cells)', 'marker_gene_evidence',
       'f-measure_evidence', 'selectively_expresses',
       'selectively_expresses.1', 'selectively_expresses.2',
       'selectively_expresses.3', 'selectively_expresses.4',
       'selectively_expresses.5', 'selectively_expresses.6',
       'selectively_expresses.7', 'selectively_expresses.8',
       'selectively_expresses.9', 'neuron_type'],
      dtype='object')
pCL1
CL:0011005
http://www.jcvi.org/cl_ext/mtg_cluster/pCL78


In [4]:
# Currently uses pd.DataFrame.from_records(list) to populate, 
# but this has the danger that new columns can easily be added
# by accident/typo.  Better to switch to adding directly?

el = []
sl = []
ml = []

def make_tsv(dl, seed, filename):
    robot_template = seed.append(pd.DataFrame.from_records(dl))
    robot_template.replace(np.nan, '', regex=True, inplace=True)
    robot_template.drop_duplicates(inplace=True)
    robot_template.to_csv(filename, sep='\t', index=False)   

### New version - terms are specified in two rows: one as subclassof + annotations, a second as eq. 
### Just spec merge of el and srow?


for i, r in in_tab.iterrows():
    trow = {}
    erow = {}
    srow = {}
    erow['CLASS_TYPE'] = 'equivalent'
    srow['CLASS_TYPE'] = 'subclass'
    srow['ID'] = 'http://www.jcvi.org/cl_ext/mtg_cluster/' + r['pCL_id (or CL_id)']
    erow['ID'] = 'http://www.jcvi.org/cl_ext/mtg_cluster/' + r['pCL_id (or CL_id)']
    erow['label'] = r['pCL_name (or CL_name)']
    srow['label'] = r['pCL_name (or CL_name)']
    erow['Genus'] = proc_isa(r, typ = 'e')
    srow['Genus'] = proc_isa(r, typ = 's')
    markers = proc_markers(r)
    i = 1
    for m in markers:
        mrow = {}
        if m['iri']:
            erow['Marker' + str(i)] = m['iri']
            mrow['Genus'] = 'SO:0000704'
            mrow['ID'] = m['iri']
            i += 1
            if m['name']:
                mrow['label'] = m['name']
        if mrow: ml.append(mrow)
            
    erow.update(proc_po(isa=erow['Genus'], r=r))
    el.append(erow)
    sl.append(srow)

    

make_tsv(el, out_tab, dataset_name + '.tsv')
make_tsv(sl, out_tab, dataset_name + 's.tsv')
make_tsv(ml, marker_tab, 'markers.tsv')

    


    

    
    

    

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
