# GA4GH Biosample

This notebook demonstrates how to use the oncoexporter Python package to create GA4GH Biosample messages from Cancer Data Aggregator (CDA) data.
We first extract data about a specimen in a CDA cohort and then use the package to create the GA4GH [Biosample](https://phenopacket-schema.readthedocs.io/en/latest/biosample.html) messages.

In [1]:
from oncoexporter.cda import CdaTableImporter, CdaBiosampleFactory

In [2]:
from cdapython import ( Q, set_default_project_dataset, set_host_url, set_table_version )

# CDA connection settings, named in one place so they are easy to find and change.
CDA_PROJECT_DATASET = "gdc-bq-sample.dev"
CDA_HOST_URL = "http://35.192.60.10:8080/"
CDA_TABLE_VERSION = "all_merged_subjects_v3_2_final"

# Apply the settings through the cdapython setters, in the same order as before.
set_default_project_dataset(CDA_PROJECT_DATASET)
set_host_url(CDA_HOST_URL)
set_table_version(CDA_TABLE_VERSION)

In [3]:
# Human-readable label passed to the importer as the cohort name.
cohort_name = "cervix cancer cohort"
# CDA filter expression: treatment_anatomic_site equal to "Cervix".
query = 'treatment_anatomic_site = "Cervix"'
# Build the Q object from `query` so the filter string is written in exactly one place.
Tsite = Q(query)
tableImporter = CdaTableImporter(cohort_name=cohort_name, query_obj=Tsite)
# Retrieve the specimen table for this cohort as a DataFrame.
specimen_df = tableImporter.get_specimen_df()

Output()

In [4]:
# Preview the first five specimen rows to inspect the available columns.
specimen_df.head(n=5)

Unnamed: 0,specimen_id,specimen_identifier,specimen_associated_project,days_to_collection,primary_disease_type,anatomical_site,source_material_type,specimen_type,derived_from_specimen,derived_from_subject,subject_id,researchsubject_id
0,CGCI-HTMCP-CC.HTMCP-03-06-02001.HTMCP-03-06-02...,"[{'system': 'GDC', 'field_name': 'case.samples...",CGCI-HTMCP-CC,,Squamous Cell Neoplasms,,Primary Tumor,aliquot,CGCI-HTMCP-CC.HTMCP-03-06-02001.HTMCP-03-06-02...,CGCI.HTMCP-03-06-02001,CGCI.HTMCP-03-06-02001,CGCI-HTMCP-CC.HTMCP-03-06-02001
1,CGCI-HTMCP-CC.HTMCP-03-06-02001.HTMCP-03-06-02...,"[{'system': 'GDC', 'field_name': 'case.samples...",CGCI-HTMCP-CC,,Squamous Cell Neoplasms,,Blood Derived Normal,portion,CGCI-HTMCP-CC.HTMCP-03-06-02001.HTMCP-03-06-02...,CGCI.HTMCP-03-06-02001,CGCI.HTMCP-03-06-02001,CGCI-HTMCP-CC.HTMCP-03-06-02001
2,CGCI-HTMCP-CC.HTMCP-03-06-02001.HTMCP-03-06-02...,"[{'system': 'GDC', 'field_name': 'case.samples...",CGCI-HTMCP-CC,,Squamous Cell Neoplasms,,Primary Tumor,aliquot,CGCI-HTMCP-CC.HTMCP-03-06-02001.HTMCP-03-06-02...,CGCI.HTMCP-03-06-02001,CGCI.HTMCP-03-06-02001,CGCI-HTMCP-CC.HTMCP-03-06-02001
3,CGCI-HTMCP-CC.HTMCP-03-06-02001.HTMCP-03-06-02...,"[{'system': 'GDC', 'field_name': 'case.samples...",CGCI-HTMCP-CC,,Squamous Cell Neoplasms,,Blood Derived Normal,aliquot,CGCI-HTMCP-CC.HTMCP-03-06-02001.HTMCP-03-06-02...,CGCI.HTMCP-03-06-02001,CGCI.HTMCP-03-06-02001,CGCI-HTMCP-CC.HTMCP-03-06-02001
4,CGCI-HTMCP-CC.HTMCP-03-06-02001.HTMCP-03-06-02...,"[{'system': 'GDC', 'field_name': 'case.samples...",CGCI-HTMCP-CC,,Squamous Cell Neoplasms,,Primary Tumor,sample,initial specimen,CGCI.HTMCP-03-06-02001,CGCI.HTMCP-03-06-02001,CGCI-HTMCP-CC.HTMCP-03-06-02001


# Create GA4GH Biosample messages from the CDA data

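The `CdaBiosampleFactory` is still at the prototype stage. The cell below uses it to convert each row of the specimen table into a GA4GH Biosample message.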

In [7]:
# Convert every row of the specimen table into a GA4GH Biosample message.
biosample_factory = CdaBiosampleFactory()
ga4gh_biosamples = [
    biosample_factory.to_ga4gh(row=specimen_row)
    for _index, specimen_row in specimen_df.iterrows()
]
# Report how many messages were produced.
print(f"We extracted {len(ga4gh_biosamples)} GA4GH Phenopacket Biosample messages")

In [6]:
from google.protobuf.json_format import MessageToJson
from pprint import pprint
# Serialize the first Biosample message to a JSON string.
json_string = MessageToJson(ga4gh_biosamples[0])
# Use print rather than pprint: pprint shows the repr of the string (quoted
# fragments with literal "\n"), whereas print emits the JSON text itself.
print(json_string)

('{\n'
 '  "id": "CGCI-HTMCP-CC.HTMCP-03-06-02001.HTMCP-03-06-02001-01A-03R-4427",\n'
 '  "individualId": "CGCI.HTMCP-03-06-02001",\n'
 '  "derivedFromId": '
 '"CGCI-HTMCP-CC.HTMCP-03-06-02001.HTMCP-03-06-02001-01A-03R",\n'
 '  "sampleType": {\n'
 '    "id": "NCIT:C25414",\n'
 '    "label": "Aliquot"\n'
 '  },\n'
 '  "taxonomy": {\n'
 '    "id": "NCBITaxon:9606",\n'
 '    "label": "Homo sapiens"\n'
 '  },\n'
 '  "materialSample": {\n'
 '    "id": "NCIT:C162622",\n'
 '    "label": "Tumor Segment"\n'
 '  }\n'
 '}')
