# Using the HuBMAP Cells API and the HRA API for Spatial Prediction of Proteins [INCOMPLETE]

Find all cells in relevant HuBMAP datasets, then compile a table that can be used to predict a spatial origin with the HRA API.

# Install libraries

In [1]:
!pip install hra_api_client hubmap_api_py_client

Defaulting to user installation because normal site-packages is not writeable


# Part 1: Get Cell Populations from HuBMAP Datasets using the Cells API

In [2]:
import hra_api_client
import hubmap_api_py_client
from pprint import pprint
from collections import Counter

## Set up the HuBMAP Cells API Client

In [3]:
from hubmap_api_py_client import Client
# Base URL of the HuBMAP Cells API that every query below is sent to.
endpoint_url = "https://cells.api.hubmapconsortium.org/api/"
# Shared client object; later cells call its select_* methods.
client = Client(endpoint_url) 

## Find all proteins available in the Cells API

In [11]:
# Ask the Cells API for its protein results and fail fast if nothing comes back.
all_proteins = client.select_proteins()
assert len(all_proteins) > 0

# Keep only the identifier of each returned protein record.
proteins = [record["protein_id"] for record in all_proteins.get_list()]
print('proteins:', len(proteins))

proteins: 65


## Find all datasets that match a protein expression threshold

In [16]:
# One condition string per protein, of the form "<protein_id> > 1000".
protein_query = [f"{protein_id} > 1000" for protein_id in proteins]

# Select datasets by protein using those conditions.
# NOTE(review): min_cell_percentage is presumably the minimum share of cells that
# must satisfy the condition — confirm against the Cells API documentation.
datasets = client.select_datasets(
    where='protein',
    has=protein_query,
    min_cell_percentage=5.0,
).get_list()
assert len(datasets) > 0

# Dataset identifiers, reused by the later cells.
uuids = [dataset['uuid'] for dataset in datasets]
print('annotated datasets with cell types:', len(datasets))

annotated datasets with cell types: 5


## Get cells for each annotated dataset

In [22]:
# Per-dataset accumulators, all keyed by dataset UUID:
#   dataset_cells    -> Counter keyed by each cell's 'cell_id'
#   dataset_organ    -> lowercased 'organ' value taken from the dataset's cells
#   dataset_modality -> 'modality' value taken from the dataset's cells
dataset_cells = {}
dataset_organ = {}
dataset_modality = {}

for uuid in uuids:
    # Select the cells that belong to this dataset.
    cells_in_dataset = client.select_cells(where='dataset', has=[uuid])
    # NOTE(review): get_list().results_set.get_list() looks like a redundant round
    # trip — confirm whether a single get_list() would be enough.
    all_cells = cells_in_dataset.get_list().results_set.get_list()

    population = Counter()
    for cell in all_cells:
        # NOTE(review): the counter is keyed by 'cell_id'; in the recorded output
        # every key has a count of 1, while the label printed below says
        # "cell types" — confirm which field is meant to be counted.
        population[cell['cell_id']] += 1
        # Rewritten for every cell, so the values of the last cell iterated are kept.
        # A dataset that yields no cells gets no organ/modality entry at all.
        dataset_organ[uuid] = cell['organ'].lower()
        dataset_modality[uuid] = cell['modality']

    dataset_cells[uuid] = population

# Spot-check the first dataset. After the loop, `all_cells` (and the other loop
# variables) still hold the values from the final iteration.
print(uuids[0], 'top cell types:', dataset_cells[uuids[0]].most_common(5))
print(uuids[0], 'organ:', dataset_organ[uuids[0]])
print(uuids[0], 'modality:', dataset_modality[uuids[0]])

214f19e2921c5d4108d905b9d30da556 top cell types: [('214f19e2921c5d4108d905b9d30da556-reg1-1000', 1), ('214f19e2921c5d4108d905b9d30da556-reg1-10000', 1), ('214f19e2921c5d4108d905b9d30da556-reg1-100000', 1), ('214f19e2921c5d4108d905b9d30da556-reg1-100001', 1), ('214f19e2921c5d4108d905b9d30da556-reg1-100002', 1)]
214f19e2921c5d4108d905b9d30da556 organ: lymph node
214f19e2921c5d4108d905b9d30da556 modality: codex


In [23]:
# Reference example: https://github.com/hubmapconsortium/hubmap-api-py-client/blob/main/examples/select_celltypes.md

# Preview only the first few cell records; displaying the complete list floods
# the notebook with output and bloats the saved file.
# `all_cells` is whatever the final iteration of the loop in the previous cell left behind.
list(all_cells.results_set.get_list())[:5]


[{'cell_id': 'c4b216fbc950f8cdda0d261e585a2f3c-reg1-1000',
  'modality': 'codex',
  'dataset': 'c4b216fbc950f8cdda0d261e585a2f3c',
  'organ': 'Spleen',
  'cell_type': 'unknown',
  'clusters': ['KMeans-Mean-c4b216fbc950f8cdda0d261e585a2f3c-reg1-2',
   'KMeans-Covariance-c4b216fbc950f8cdda0d261e585a2f3c-reg1-1',
   'KMeans-Total-c4b216fbc950f8cdda0d261e585a2f3c-reg1-1',
   'KMeans-Mean-All-SubRegions-c4b216fbc950f8cdda0d261e585a2f3c-reg1-1',
   'KMeans-Shape-Vectors-c4b216fbc950f8cdda0d261e585a2f3c-reg1-2',
   'KMeans-Texture-c4b216fbc950f8cdda0d261e585a2f3c-reg1-1',
   'KMeans-tSNE_All_Features-c4b216fbc950f8cdda0d261e585a2f3c-reg1-6']},
 {'cell_id': 'c4b216fbc950f8cdda0d261e585a2f3c-reg1-10000',
  'modality': 'codex',
  'dataset': 'c4b216fbc950f8cdda0d261e585a2f3c',
  'organ': 'Spleen',
  'cell_type': 'unknown',
  'clusters': ['KMeans-Mean-c4b216fbc950f8cdda0d261e585a2f3c-reg1-3',
   'KMeans-Covariance-c4b216fbc950f8cdda0d261e585a2f3c-reg1-1',
   'KMeans-Total-c4b216fbc950f8cdda0d261e5

# Part 2: Predict Spatial Locations Using the HRA API

## Set up the HRA API Client

In [7]:
import hra_api_client
# Import the hra_pop_api module under an alias so that the `hra_pop_api` client
# instance created below does not shadow the module it was built from.
from hra_api_client.api import v1_api, hra_pop_api as hra_pop_api_module

# Base URL of the HRA API.
hra_api_endpoint_url = "https://apps.humanatlas.io/api"
configuration = hra_api_client.Configuration(hra_api_endpoint_url)
# A single ApiClient is shared by both API wrappers.
api_client = hra_api_client.ApiClient(configuration)
hra_api = v1_api.V1Api(api_client)
# Later cells call supported_organs() and cell_summary_report() on this instance.
hra_pop_api = hra_pop_api_module.HraPopApi(api_client)

## Get supported organ lookup

In [8]:
# Map each supported organ's lowercased label to its id, so that the lowercased
# organ names collected per dataset can be used as lookup keys.
organ_lookup = {
    organ.label.lower(): organ.id
    for organ in hra_pop_api.supported_organs()
}
organ_lookup

{'small intestine': 'http://purl.obolibrary.org/obo/UBERON_0002108',
 'heart': 'http://purl.obolibrary.org/obo/UBERON_0000948',
 'kidney': 'http://purl.obolibrary.org/obo/UBERON_0002113',
 'large intestine': 'http://purl.obolibrary.org/obo/UBERON_0000059',
 'liver': 'http://purl.obolibrary.org/obo/UBERON_0002107',
 'lung': 'http://purl.obolibrary.org/obo/UBERON_0002048',
 'skin': 'http://purl.obolibrary.org/obo/UBERON_0002097',
 'skin of body': 'http://purl.obolibrary.org/obo/UBERON_0002097',
 'spleen': 'http://purl.obolibrary.org/obo/UBERON_0002106',
 'ureter': 'http://purl.obolibrary.org/obo/UBERON_0000056',
 'urinary bladder': 'http://purl.obolibrary.org/obo/UBERON_0001255'}

## Get HRApop cell summary reports for each dataset

In [9]:
# uuid -> cell summary report returned by the HRA API
dataset_summary = {}

for uuid in uuids:
    # Skip datasets with no recorded organ or with an organ that is missing from
    # organ_lookup; indexing the dictionaries directly would raise KeyError and
    # abort the whole loop on the first such dataset.
    organ_name = dataset_organ.get(uuid)
    organ_iri = organ_lookup.get(organ_name)
    if organ_iri is None:
        print(uuid, 'skipped, organ not supported:', organ_name)
        continue

    total_count = sum(dataset_cells[uuid].values())
    # CSV payload: a header row followed by one "<id>,<count / total>" row per counted entry.
    # NOTE(review): the values are fractions of the total although the column is
    # named "percentage" — confirm which scale the API expects.
    csv = "\n".join(["cell_id,percentage"] + [ f"{cell_id},{count / total_count}" for (cell_id,count) in dataset_cells[uuid].items() ])
    request = { "organ": organ_iri, "csvString": csv }
    print(uuid)
    dataset_summary[uuid] = hra_pop_api.cell_summary_report(request)

# Report on the first dataset that actually received a summary. This is uuids[0]
# whenever that dataset was not skipped, because dict order follows insertion order.
if dataset_summary:
    first_uuid = next(iter(dataset_summary))
    print(first_uuid, 'has similar sources:', len(dataset_summary[first_uuid]['sources']))
    print(first_uuid, 'has similar RUI locations:', len(dataset_summary[first_uuid]['rui_locations']))
else:
    print('no dataset has an organ supported by HRApop')

007f3dfaaa287d5c7c227651f61a9c5b
018a905cdbdff684760859f594d3fd77
0b6f63f2bd61a8c091fc7afc0f318ad1
149e1be22a4961f9c6240480710836a3
173de2e80adf6a73ac8cff5ccce20dfc
17f67cb15e59f65e640d85d2f3866cde
1dc16eb0270ff73291dd45b6a96aa3c0
1ea6c0ac5ba60fe35bf63af8699b6fbe
224e01ccfc20977ee5a6a6a5b96aa7d7
22684b9011fc5aea5cb3f89670a461e8
33b9c54d7c295897826e1e5271d4fc24
35e9bf0f58152ceb6f4d01052ac8867d
367fee3b40cba682063289505b922be1
37988db44acc8d0780e4e31cd057e789
3c1b10bc912c60c9afc36b7423695236
47015df8ab7615a9ce26098d88196dde
4b62d9d2c248323ce029859f953fdc57
524dc341a03c155b6f4140e9d72f9b1d
56cbda4789f04d79c0c3dffe21816d48
5a5ca03fa623602d9a859224aa40ace4
5ee240959c96b49d960702755478b9fc
62efbe0a6abd0bcf53ab9ab29e7cd73f
63349325056ccff582f1d095055c7e12
6e1db473492095ccc2f1393d7259b9c0
7c9e07c96d144536525b1f889acee14d
7d70a4deb9e2f0592f0c56e20ca91169
898138b7f45a67c574e9955fb400e9be
8cdb42ed1194255c74c8462b99bbd7ef
8d631eee88855ac59155edca2a3bc1ca
9a7e6be288b27ddbd3366c4ae41bbcd2
a48ab0bf5d

In [23]:
# Keep only the sources of the first dataset's summary that are typed as anatomical structures.
similar_as = [
    source
    for source in dataset_summary[uuids[0]]['sources']
    if source['cell_source_type'] == "http://purl.org/ccf/AnatomicalStructure"
]

# Unique, lowercased labels of the structures whose similarity exceeds 0.66, in sorted order.
as_labels = sorted({
    source['cell_source_label'].lower()
    for source in similar_as
    if source['similarity'] > 0.66
})
print(uuids[0], 'anatomical structures with similar cell populations:\n', '\n '.join(as_labels))

007f3dfaaa287d5c7c227651f61a9c5b anatomical structures with similar cell populations:
 heart left ventricle
 left cardiac atrium
 right cardiac atrium
