# GA4GH GenomicInterpretation

This notebook demonstrates how to use the oncoexporter Python package to create GA4GH GenomicInterpretation messages from Cancer Data Aggregator (CDA) data.
We first extract the mutation data for the subjects in a CDA cohort and then use the package to create the GA4GH [GenomicInterpretation](https://phenopacket-schema.readthedocs.io/en/latest/genomic-interpretation.html) messages.

In [1]:
from oncoexporter.cda import CdaTableImporter, CdaMutationFactory

In [2]:
from cdapython import ( Q, set_default_project_dataset, set_host_url, set_table_version )

# Connection settings for the CDA service used by every query in this notebook.
CDA_PROJECT_DATASET = "gdc-bq-sample.dev"
# NOTE(review): hard-coded IP address over plain HTTP — confirm this endpoint is still the intended one.
CDA_HOST_URL = "http://35.192.60.10:8080/"
CDA_TABLE_VERSION = "all_merged_subjects_v3_2_final"

set_default_project_dataset(CDA_PROJECT_DATASET)
set_host_url(CDA_HOST_URL)
set_table_version(CDA_TABLE_VERSION)

In [11]:
# Build the cohort: subjects whose primary diagnosis site matches the uterus ("%uter%")
# or the cervix ("%cerv%").
# The cohort name is a free-text label; it must describe the query it is paired with.
# NOTE(review): presumably the importer also uses this label to identify stored results
# for the cohort — confirm against CdaTableImporter before reusing a name across notebooks.
cohort_name = "uterine and cervical cancer cohort"
Dsite = Q('primary_diagnosis_site = "%uter%" OR primary_diagnosis_site = "%cerv%"')
tableImporter = CdaTableImporter(cohort_name=cohort_name, query_obj=Dsite)
# Retrieve the mutation table for every subject matched by the query.
mutation_df = tableImporter.get_mutation_df()

Output()

In [10]:
# Preview the first five rows of the mutation table.
mutation_df.head(n=5)


Unnamed: 0,project_short_name,case_barcode,cda_subject_id,primary_site,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,...,callers,file_gdc_id,muse,mutect2,pindel,varscan2,sample_barcode_tumor,sample_barcode_normal,aliquot_barcode_tumor,aliquot_barcode_normal
0,TCGA-UCEC,TCGA-A5-A2K4,TCGA.TCGA-A5-A2K4,Corpus uteri,PIK3CA,5290,WUGSC,GRCh38,chr3,179234297,...,muse;mutect2;varscan2,2a008593-96ff-4705-92a5-136bbd0b6f36,Yes,Yes,No,Yes,TCGA-A5-A2K4-01A,TCGA-A5-A2K4-10B,TCGA-A5-A2K4-01A-11D-A18P-09,TCGA-A5-A2K4-10B-01D-A18P-09
1,TCGA-CESC,TCGA-C5-A1MI,TCGA.TCGA-C5-A1MI,Cervix uteri,IGSF9B,22997,BI,GRCh38,chr11,133921225,...,muse;mutect2;varscan2,3fd5afe7-9e69-4ea8-ab01-80e41783d795,Yes,Yes,No,Yes,TCGA-C5-A1MI-01A,TCGA-C5-A1MI-10A,TCGA-C5-A1MI-01A-11D-A14W-08,TCGA-C5-A1MI-10A-01D-A14W-08
2,TCGA-CESC,TCGA-EA-A3HQ,TCGA.TCGA-EA-A3HQ,Cervix uteri,ADGRF4,221393,WUGSC,GRCh38,chr6,47714362,...,muse;mutect2;varscan2,b79ead83-dd2c-4ab4-b2b0-21c187904226,Yes,Yes,No,Yes,TCGA-EA-A3HQ-01A,TCGA-EA-A3HQ-10A,TCGA-EA-A3HQ-01A-11D-A20U-09,TCGA-EA-A3HQ-10A-01D-A20U-09
3,TCGA-UCEC,TCGA-AX-A2H5,TCGA.TCGA-AX-A2H5,Corpus uteri,RSU1,6251,WUGSC,GRCh38,chr10,16752565,...,muse;mutect2;varscan2,d7f2b400-8f86-44d9-973b-ef613628d051,Yes,Yes,No,Yes,TCGA-AX-A2H5-01A,TCGA-AX-A2H5-11A,TCGA-AX-A2H5-01A-11D-A17D-09,TCGA-AX-A2H5-11A-11D-A17D-09
4,TCGA-UCEC,TCGA-EO-A3AS,TCGA.TCGA-EO-A3AS,Corpus uteri,ACAD10,80724,WUGSC,GRCh38,chr12,111756386,...,muse;mutect2;varscan2,2774b55c-2f7e-4d80-a53f-bf76c76fe0c2,Yes,Yes,No,Yes,TCGA-EO-A3AS-01A,TCGA-EO-A3AS-10A,TCGA-EO-A3AS-01A-11D-A19Y-09,TCGA-EO-A3AS-10A-01D-A19Y-09


In [47]:
# Write a small excerpt of the mutation table to a tab-separated file.
# The slice is positional: rows at positions 1 through 9 (nine rows; position 0 is not included).
# NOTE(review): confirm that skipping the first row is intended.
df = mutation_df.iloc[1:10]
df.to_csv("mutation_excerpt.tsv", sep="\t")

In [44]:
# Subset of columns shown for inspection, one name per line in display order.
# Deliberately left out for now: "Verification_Status", "Validation_Status", "Mutation_Status".
columns_of_interest = [
    "cda_subject_id",
    "primary_site",
    "Entrez_Gene_Id",
    "NCBI_Build",
    "Chromosome",
    "Start_Position",
    "End_Position",
    "Strand",
    "Variant_Classification",
    "Variant_Type",
    "Reference_Allele",
    "Tumor_Seq_Allele1",
    "Tumor_Seq_Allele2",
    "SYMBOL",
    "SYMBOL_SOURCE",
    "HGNC_ID",
    "HGVSc",
    "HGVSp_Short",
]
# Show the first rows restricted to the selected columns.
mutation_df.loc[:, columns_of_interest].head()

Unnamed: 0,cda_subject_id,primary_site,Entrez_Gene_Id,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,SYMBOL,SYMBOL_SOURCE,HGNC_ID,HGVSc,HGVSp_Short
0,TCGA.TCGA-A5-A2K4,Corpus uteri,5290,GRCh38,chr3,179234297,179234297,+,Missense_Mutation,SNP,A,A,T,PIK3CA,HGNC,HGNC:8975,c.3140A>T,p.H1047L
1,TCGA.TCGA-C5-A1MI,Cervix uteri,22997,GRCh38,chr11,133921225,133921225,+,Missense_Mutation,SNP,C,C,T,IGSF9B,HGNC,HGNC:32326,c.2500G>A,p.V834M
2,TCGA.TCGA-EA-A3HQ,Cervix uteri,221393,GRCh38,chr6,47714362,47714362,+,Nonsense_Mutation,SNP,G,G,T,ADGRF4,HGNC,HGNC:19011,c.1117G>T,p.E373*
3,TCGA.TCGA-AX-A2H5,Corpus uteri,6251,GRCh38,chr10,16752565,16752565,+,Missense_Mutation,SNP,C,C,T,RSU1,HGNC,HGNC:10464,c.572G>A,p.R191H
4,TCGA.TCGA-EO-A3AS,Corpus uteri,80724,GRCh38,chr12,111756386,111756386,+,Silent,SNP,C,C,T,ACAD10,HGNC,HGNC:21597,c.3093C>T,p.T1031=


In [37]:
# Collect the column names of the mutation table as a plain list of names
# (wrapping the columns object in [...] first would give a one-element list instead).
cn = list(mutation_df.columns)
# Print all column names on one line, separated by "--", for a compact overview.
print("--".join(cn))

In [5]:
# Convert every row of the mutation table into a GA4GH message, one message per row.
mutation_factory = CdaMutationFactory()
ga4gh_genomic_interpretations = [
    mutation_factory.to_ga4gh(row=row) for _, row in mutation_df.iterrows()
]
# Report how many messages were created (the label matches what this notebook produces).
print(f"We extracted {len(ga4gh_genomic_interpretations)} GA4GH GenomicInterpretation messages")