Cell types were added to Wikidata in a previous effort. 

Game plan:

* Extract the LOC (Entrez) and HGNC gene identifiers from the marker spreadsheet
* Look up the matching gene items on Wikidata
* Map cell types to their marker genes
* Add the marker statements with QuickStatements (there are not that many markers)

In [1]:
import pandas as pd

# Load the marker-gene spreadsheet into a DataFrame.
markers = pd.read_excel("markers_from_paper.xlsx")

In [2]:
# NOTE(review): `cell_types` is not referenced again in the visible cells; the
# same CSV is re-read as `cell_df` further down. Confirm this load is still needed.
cell_types = pd.read_csv("cell_types_reconciled.csv")

In [3]:
# Inspect the available columns to pick the cell-type name and marker columns.
markers.columns

Index(['pCL_id (or CL_id)', 'pCL_name (or CL_name)',
       'transcriptome_data_cluster', 'TDC_id', 'Species_source', 'Species_ID',
       'part_of (uberon_id)', 'part_of (uberon_name)',
       'has_predicted_soma_location_in', 'is_a (CL or pCL_id)',
       'is_a (CL or pCL_name)', 'cluster_size (number of nuclei)',
       'marker_gene_evidence', 'f-measure_evidence', 'selectively_expresses',
       'selectively_expresses.1', 'selectively_expresses.2',
       'selectively_expresses.3', 'neuron_type'],
      dtype='object')

In [4]:
# Reshape to long format: keep the cell-type name plus the four marker columns,
# stack the marker columns into a single "value" column, discard the column
# that only records which marker slot a value came from, and drop empty slots.
markers = (
    markers[[
        'pCL_name (or CL_name)',
        'selectively_expresses',
        'selectively_expresses.1',
        'selectively_expresses.2',
        'selectively_expresses.3',
    ]]
    .melt(id_vars='pCL_name (or CL_name)')
    .drop(columns="variable")
    .dropna()
)

In [5]:
import re

# Keep only the identifier at the end of each marker value: the greedy pattern
# removes everything up to and including the last "|", leaving e.g. "HGNC_11773".
# The pattern is a raw string because "\|" in a normal literal is an invalid
# escape sequence (a warning today, and an error in a future Python version).
markers["value"] = [re.sub(r".*\|", "", value) for value in markers["value"]]

In [6]:
# Show the long-format table after the prefixes have been stripped.
markers

Unnamed: 0,pCL_name (or CL_name),value
0,TGFBR2-expressing human cerebral cortex MTG GA...,HGNC_11773
1,SP8-expressing human cerebral cortex MTG GABAe...,HGNC_19196
2,NDNF-expressing human cerebral cortex MTG GABA...,HGNC_26256
3,KIT-expressing human cerebral cortex MTG GABAe...,HGNC_6342
4,CPLX3-expressing human cerebral cortex MTG GAB...,HGNC_27652
...,...,...
260,NR4A2-expressing human cerebral cortex MTG Glu...,HGNC_3446
261,SLC15A5-expressing human cerebral cortex MTG G...,HGNC_23503
327,EYS-expressing human cerebral cortex MTG GABAe...,HGNC_7603
343,CUX2|NTNG-expressing human cerebral cortex MTG...,LOC101928196


In [7]:
import wikidata2df

In [8]:
# SPARQL query: select every item with P31 = Q7187 and P703 = Q15978631
# (presumably "instance of gene" and "found in taxon Homo sapiens" - confirm on
# Wikidata). The optional P354 / P351 values are bound to ?hgnc_id / ?entrez_id
# and prefixed with "HGNC_" / "LOC" so they have the same shape as the
# identifiers in markers["value"]. Because both are OPTIONAL, either derived
# column can be missing for a given gene.
query = """
SELECT ?item ?itemLabel 
(concat("HGNC_",?hgnc_id) as ?hgnc_ids)
(concat("LOC",?entrez_id) as ?entrez_ids)
WHERE 
{
  ?item wdt:P31 wd:Q7187.
  ?item wdt:P703 wd:Q15978631.
  OPTIONAL {?item wdt:P354 ?hgnc_id.}
  OPTIONAL {?item wdt:P351 ?entrez_id.}
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""

# Run the query; the result is used below as a DataFrame with "item",
# "hgnc_ids" and "entrez_ids" columns.
gene_df  = wikidata2df.wikidata2df(query)

In [9]:
# Build one lookup table from both identifier styles ("LOC<entrez id>" and
# "HGNC_<id>") to the Wikidata gene item.
# Both identifiers are OPTIONAL in the query, so a gene may lack either one.
# Rows with a missing identifier are skipped so that NaN never becomes a
# dictionary key. If an identifier occurs more than once, the last row wins.
gene_dict = {}
for id_column in ("entrez_ids", "hgnc_ids"):
    known = gene_df[["item", id_column]].dropna(subset=[id_column])
    gene_dict.update(zip(known[id_column], known["item"]))
    
# Manually added to Wikidata:

In [15]:

# Genes that the query did not return; their items were added to Wikidata by
# hand, so the LOC identifier -> QID mapping is registered explicitly here.
gene_dict.update(
    LOC105376457="Q106480946",
    LOC101928964="Q106480994",
    LOC105375415="Q106480995",
    LOC105371331="Q106480996",
    LOC105379146="Q106480997",
    LOC102724957="Q20750070",
)

In [16]:
# Map each reconciled cell-type name to its Wikidata QID.
cell_df = pd.read_csv("cell_types_reconciled.csv")
cell_dict = dict(zip(cell_df["name"], cell_df["qid"]))
    

In [18]:

# Print one QuickStatements line per (cell type, marker gene) pair:
#   <cell QID>|P8872|<gene QID>|S248|Q71306466
# (P8872 is presumably the marker property and S248/Q71306466 the source
# reference - confirm on Wikidata.)
# Rows whose cell-type name or gene identifier has no known QID are not
# printed; the key that could not be resolved is written to not_found.txt,
# one per line. Only KeyError is caught so unrelated errors still surface.
with open("not_found.txt", "w", encoding="utf-8") as f:
    for i, row in markers.iterrows():
        try:
            cell = cell_dict[row['pCL_name (or CL_name)']]
            gene = gene_dict[row['value']]
        except KeyError as missing:
            # missing.args[0] is the exact key that failed to resolve.
            f.write(f"{missing.args[0]}\n")
        else:
            print(f"{cell}|P8872|{gene}|S248|Q71306466")


Q71314919|P8872|Q14911585|S248|Q71306466
Q71314923|P8872|Q20970083|S248|Q71306466
Q71314925|P8872|Q18046211|S248|Q71306466
Q71314927|P8872|Q20969938|S248|Q71306466
Q71314930|P8872|Q18059265|S248|Q71306466
Q71314934|P8872|Q17917387|S248|Q71306466
Q71314938|P8872|Q18052515|S248|Q71306466
Q71314941|P8872|Q18052421|S248|Q71306466
Q71314943|P8872|Q18047988|S248|Q71306466
Q71314945|P8872|Q21163314|S248|Q71306466
Q71314947|P8872|Q18248676|S248|Q71306466
Q71314951|P8872|Q20777789|S248|Q71306466
Q71314953|P8872|Q18049278|S248|Q71306466
Q71314959|P8872|Q18048721|S248|Q71306466
Q71315558|P8872|Q18030390|S248|Q71306466
Q71314964|P8872|Q18035833|S248|Q71306466
Q71314968|P8872|Q14912176|S248|Q71306466
Q71315560|P8872|Q18031815|S248|Q71306466
Q71315565|P8872|Q18031815|S248|Q71306466
Q71314976|P8872|Q20787714|S248|Q71306466
Q71314979|P8872|Q18027437|S248|Q71306466
Q71314981|P8872|Q18050048|S248|Q71306466
Q71314983|P8872|Q18033470|S248|Q71306466
Q71314986|P8872|Q18059454|S248|Q71306466
Q71314988|P8872|