In [1]:
import pandas as pd
# Load the human and mouse marker tables from their tab-separated files.
human_markers = pd.read_csv("data/human_markers_2021_02_05.tsv", sep="\t")
mouse_markers = pd.read_csv("data/mouse_markers_2021_02_05.tsv", sep="\t")

In [2]:
# Per-column summary (count / unique / top / freq) of the human marker table.
human_markers.describe()

Unnamed: 0,speciesType,tissueType,UberonOntologyID,cancerType,cellType,cellName,CellOntologyID,cellMarker,geneSymbol,geneID,proteinName,proteinID,markerResource,PMID,Company
count,2868,2868,2259,2868,2868,2868,2149,2868,2825,2825,2823,2823,2868,2868,248
unique,1,159,151,131,2,467,230,1682,1593,1596,1595,1595,4,1764,8
top,Human,Undefined,UBERON_0005408,Normal,Normal cell,Cancer stem cell,CL_0000034,CD133,PROM1,8842,PROM1,O43490,Experiment,Company,biolegend
freq,2868,560,198,2117,2117,550,237,104,109,109,109,109,2258,248,51


In [3]:
# Per-column summary (count / unique / top / freq) of the mouse marker table.
mouse_markers.describe()

Unnamed: 0,speciesType,tissueType,UberonOntologyID,cancerType,cellType,cellName,CellOntologyID,cellMarker,geneSymbol,geneID,proteinName,proteinID,markerResource,PMID,Company
count,1255,1255,1118,1255,1255,1255,1106,1255,1248,1248,1244,1244,1255,1255,107
unique,1,82,79,30,2,389,197,936,896,895,891,892,4,674,6
top,Mouse,Brain,UBERON_0000955,Normal,Normal cell,Macrophage,CL_0000034,F4/80,Adgre1,13733,AGRE1,Q61549,Experiment,Company,ebioscience
freq,1255,138,138,1198,1198,66,71,22,22,22,22,22,806,107,38


In [4]:
# Build one "<cellName> (<tissueType>)" label per marker row: all human rows
# first, then all mouse rows. Vectorised string concatenation replaces the two
# copy-pasted row-by-row `iterrows()` loops.
# Note: a row with a missing cellName or tissueType now yields NaN in the list
# instead of raising a TypeError part-way through the loop.
cell_classes = [
    label
    for markers in (human_markers, mouse_markers)
    for label in (markers["cellName"] + " (" + markers["tissueType"] + ")").tolist()
]


In [5]:
# De-duplicate the labels: `a` holds each distinct label once (set order is arbitrary).
a = list(set(cell_classes))

In [6]:
# Number of distinct "<cellName> (<tissueType>)" labels.
len(a)

1542

Game plan: 

Select only normal (non-cancer) cells

Reconcile them manually, cell type by cell type, to "general" cell types.


Merge with a reference table of Wikidata cell types (items that are instances of "cell type") that are not tagged as found in human or mouse.

Locate when possible, create when not. 

Select those names that are in the "human" table.

Find subclasses that are found in taxon "Homo sapiens"



In [7]:
# Restrict both marker tables to the rows whose cellType is "Normal cell".
human_markers = human_markers.loc[human_markers["cellType"].eq("Normal cell")]
mouse_markers = mouse_markers.loc[mouse_markers["cellType"].eq("Normal cell")]

In [8]:
# Keep only the columns that identify a cell type in each species table, then
# stack the human rows on top of the mouse rows.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0;
# `pd.concat` is the supported equivalent and likewise keeps the original
# row indices.
reference_cell_types_human = human_markers[["tissueType", "UberonOntologyID", "cellName"]]
reference_cell_types_mouse = mouse_markers[["tissueType", "UberonOntologyID", "cellName"]]

reference_cell_types = pd.concat([reference_cell_types_human, reference_cell_types_mouse])

In [9]:
# Collapse fully identical rows so each distinct row of the reference table appears once.
reference_cell_types = reference_cell_types.drop_duplicates()

In [10]:
from wikidata2df import wikidata2df

# Wikidata items that are instances (P31) of Q189118 and are NOT tagged as
# "found in taxon" (P703) Homo sapiens (Q15978631) or house mouse (Q83310).
#
# Each taxon is excluded in its own MINUS clause. Listing both P703 statements
# inside a single MINUS block is a conjunction: it only removes items tagged
# with *both* species, leaving every single-species item in the result.
query = """
SELECT ?item ?itemLabel
WHERE
{
  ?item wdt:P31 wd:Q189118.
  MINUS { ?item wdt:P703 wd:Q15978631. }
  MINUS { ?item wdt:P703 wd:Q83310. }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""

# Run the query against the Wikidata endpoint; the result has the `item` and
# `itemLabel` columns selected above.
cell_types_wikidata = wikidata2df(query)

In [11]:
 reference_match = reference_cell_types.merge(cell_types_wikidata, right_on="itemLabel", left_on="cellName", how="outer")
    
# Remove the unmatched Wikidata cell types
reference_match =   reference_match.dropna(subset=["cellName"])

In [12]:
# Number of rows in the matched reference table.
len(reference_match)

1460

Now let us remove the cell types that come from single articles. 

(Those that have "et al" in the names)

In [13]:
# Drop every row whose cellName contains "et al"; a missing name counts as
# "does not contain" (na=False), so such rows would be kept.
reference_match = reference_match.loc[
    lambda frame: ~frame["cellName"].str.contains("et al", na=False)
]

In [14]:
# Row count after the "et al" filter.
# NOTE(review): the dropna looks redundant if rows without a cellName were
# already removed when reference_match was built — confirm and simplify.
len(reference_match.dropna(subset=["cellName"]))

1392

In [15]:
# Write the table to CSV without the DataFrame index column.
reference_match.to_csv("data/reference_cell_types_devel.csv", index=False)

The table was uploaded to Google Sheets and will be manually updated there:

https://docs.google.com/spreadsheets/d/1ysx4SiJYn72XRsDGLZAEXA1Ot9RjL36ghOdlze-m68U/edit#gid=1617575621

The ones that cannot be unambiguously matched to Wikidata will __not__ be manually created. 

Superclasses will be added to a new column ("superclass"). 