In [1]:
from cdapython import Q, unique_terms, columns, query

<font color=blue>Test Query 1 Find data from all Subjects who have been treated with “Radiation Therapy, NOS” and have both genomic and proteomic data.</font>

<font color=green> check to see that ResearchSubject.Diagnosis.Treatment.treatment_type is the right place to search</font>

In [3]:
unique_terms("ResearchSubject.Diagnosis.Treatment.treatment_type")

['Radiation Therapy, NOS',
 'Chemotherapy',
 'Surgery',
 'Targeted Molecular Therapy',
 'Brachytherapy, Low Dose',
 'Radiation, External Beam',
 'Stem Cell Transplantation, Allogeneic',
 'Brachytherapy, High Dose',
 'Hormone Therapy',
 'Stereotactic Radiosurgery',
 'Radiation, Stereotactic/Gamma Knife/SRS',
 'Ablation, Radiofrequency',
 'Radiation, 2D Conventional',
 'Pharmaceutical Therapy, NOS',
 'Immunotherapy (Including Vaccines)',
 None,
 'Stem Cell Transplantation, Autologous',
 'Radiation, 3D Conformal',
 'Radiation, Proton Beam',
 'Radiation, Intensity-Modulated Radiotherapy']

In [4]:
treatment = Q('ResearchSubject.Diagnosis.Treatment.treatment_type = "Radiation Therapy, NOS"')
r = treatment.run()
print(r)

Getting results from database

Total execution time: 9770 ms

QueryID: e745486e-41ba-4ff1-8389-1c8d6418e7a0
Query: SELECT all_v2.* FROM gdc-bq-sample.integration.all_v2 AS all_v2, UNNEST(ResearchSubject) AS _ResearchSubject, UNNEST(_ResearchSubject.Diagnosis) AS _Diagnosis, UNNEST(_Diagnosis.Treatment) AS _Treatment WHERE (_Treatment.treatment_type = 'Radiation Therapy, NOS')
Offset: 0
Count: 100
Total Row Count: 11792
More pages: True



<font color=green> So far, so good, 11792 cases</font>

<font color=green>The ability to select GDC or PDC exists in multiple places but since we're looking for data, ResearchSubject.File.identifier.system seems appropriate</font>

In [5]:
gdc = Q('ResearchSubject.File.identifier.system = "GDC"')
pdc = Q('ResearchSubject.File.identifier.system = "PDC"')

<font color=green> Guessing that a FROM query is needed

In [6]:
q = treatment.From(gdc.And(pdc))
r = q.run()
print(r)

Getting results from database

Total execution time: 992 ms

QueryID: 3e510407-2d2a-4299-a850-09222269c56b
Query: SELECT all_v2.* FROM (SELECT all_v2.* FROM gdc-bq-sample.integration.all_v2 AS all_v2, UNNEST(ResearchSubject) AS _ResearchSubject, UNNEST(_ResearchSubject.File) AS _File, UNNEST(_File.identifier) AS _identifier WHERE ((_identifier.system = 'GDC') AND (_identifier.system = 'PDC'))) AS all_v2, UNNEST(ResearchSubject) AS _ResearchSubject, UNNEST(_ResearchSubject.Diagnosis) AS _Diagnosis, UNNEST(_Diagnosis.Treatment) AS _Treatment WHERE (_Treatment.treatment_type = 'Radiation Therapy, NOS')
Offset: 0
Count: 0
Total Row Count: 0
More pages: False



<font color=green> Well THAT didn't work.  Maybe an AND?</font>

In [7]:
q = treatment.And(gdc.And(pdc))
r = q.run()
print(r)

Getting results from database

Total execution time: 884 ms

QueryID: 44f20a56-4e17-42e7-af17-66ec661a3020
Query: SELECT all_v2.* FROM gdc-bq-sample.integration.all_v2 AS all_v2, UNNEST(ResearchSubject) AS _ResearchSubject, UNNEST(_ResearchSubject.Diagnosis) AS _Diagnosis, UNNEST(_Diagnosis.Treatment) AS _Treatment, UNNEST(_ResearchSubject.File) AS _File, UNNEST(_File.identifier) AS _identifier WHERE ((_Treatment.treatment_type = 'Radiation Therapy, NOS') AND ((_identifier.system = 'GDC') AND (_identifier.system = 'PDC')))
Offset: 0
Count: 0
Total Row Count: 0
More pages: False



<font color=green> Huh.  Time for a breakdown</font>

In [8]:
q_gdc = treatment.And(gdc)
r_gdc = q_gdc.run()
print(r_gdc)

Getting results from database



ERROR:root:
            Http Status: 500
            Error Message: Response too large to return. Consider specifying a destination table in your job configuration. For more details, see https://cloud.google.com/bigquery/troubleshooting-errors
            


Total execution time: 22089 ms
None


In [9]:
q_pdc = treatment.And(pdc)
r_pdc = q_pdc.run()
print(r_pdc)

Getting results from database

Total execution time: 728 ms

QueryID: bb44f965-95ca-46b6-81b3-c8859b99aef8
Query: SELECT all_v2.* FROM gdc-bq-sample.integration.all_v2 AS all_v2, UNNEST(ResearchSubject) AS _ResearchSubject, UNNEST(_ResearchSubject.Diagnosis) AS _Diagnosis, UNNEST(_Diagnosis.Treatment) AS _Treatment, UNNEST(_ResearchSubject.File) AS _File, UNNEST(_File.identifier) AS _identifier WHERE ((_Treatment.treatment_type = 'Radiation Therapy, NOS') AND (_identifier.system = 'PDC'))
Offset: 0
Count: 0
Total Row Count: 0
More pages: False



<font color = green> Is File the wrong hierarchy?  Maybe use ResearchSubject instead?</font>

In [10]:
gdc_rs = Q('ResearchSubject.identifier.system = "GDC"')
pdc_rs = Q('ResearchSubject.identifier.system = "PDC"')

In [11]:
q_pdc = treatment.And(pdc_rs)
r = q_pdc.run()
print(r)

Getting results from database

Total execution time: 794 ms

QueryID: 385a8517-d341-4bf3-89c1-2ee3c53fff9a
Query: SELECT all_v2.* FROM gdc-bq-sample.integration.all_v2 AS all_v2, UNNEST(ResearchSubject) AS _ResearchSubject, UNNEST(_ResearchSubject.Diagnosis) AS _Diagnosis, UNNEST(_Diagnosis.Treatment) AS _Treatment, UNNEST(_ResearchSubject.identifier) AS _identifier WHERE ((_Treatment.treatment_type = 'Radiation Therapy, NOS') AND (_identifier.system = 'PDC'))
Offset: 0
Count: 0
Total Row Count: 0
More pages: False



<font color=green> So this is a fundamental problem, there are 0 cases in PDC having received the treatment (or PDC doesn't support treatment.</font>

In [12]:
q = treatment.And(gdc_rs)
r = q.run()
print(r)

Getting results from database

Total execution time: 28354 ms

QueryID: 7882a0f6-7e5b-4b8b-9dfe-051f2ec4cbc2
Query: SELECT all_v2.* FROM gdc-bq-sample.integration.all_v2 AS all_v2, UNNEST(ResearchSubject) AS _ResearchSubject, UNNEST(_ResearchSubject.Diagnosis) AS _Diagnosis, UNNEST(_Diagnosis.Treatment) AS _Treatment, UNNEST(_ResearchSubject.identifier) AS _identifier WHERE ((_Treatment.treatment_type = 'Radiation Therapy, NOS') AND (_identifier.system = 'GDC'))
Offset: 0
Count: 100
Total Row Count: 11792
More pages: True



<font color = green> So the starting set needs to be cases from GDC that have had the treatment.</font>

In [18]:
q = pdc_rs.From(treatment.And(gdc_rs))
r = q.run()
print(r)

Getting results from database

Total execution time: 29215 ms

QueryID: c39ad931-a06a-4389-a799-114c5c6ad98f
Query: SELECT all_v2.* FROM (SELECT all_v2.* FROM gdc-bq-sample.integration.all_v2 AS all_v2, UNNEST(ResearchSubject) AS _ResearchSubject, UNNEST(_ResearchSubject.Diagnosis) AS _Diagnosis, UNNEST(_Diagnosis.Treatment) AS _Treatment, UNNEST(_ResearchSubject.identifier) AS _identifier WHERE ((_Treatment.treatment_type = 'Radiation Therapy, NOS') AND (_identifier.system = 'GDC'))) AS all_v2, UNNEST(ResearchSubject) AS _ResearchSubject, UNNEST(_ResearchSubject.identifier) AS _identifier WHERE (_identifier.system = 'PDC')
Offset: 0
Count: 100
Total Row Count: 369
More pages: True



<font color = blue> Test 2 Find data from TCGA-BRCA project, with donors over the age of 50 with imaging data</font>

<font color=green> Step 1, find out how many cases over 50 in TCGA-BRCA</font>

In [2]:
project = Q('ResearchSubject.associated_project = "TCGA-BRCA"')
age = Q('ResearchSubject.Diagnosis.age_at_diagnosis > -50*365')

In [14]:
pr = project.run()
print(pr)

Getting results from database

Total execution time: 23835 ms

QueryID: e952665e-d37d-43f5-b30b-82099c28d0cb
Query: SELECT all_v2.* FROM gdc-bq-sample.integration.all_v2 AS all_v2, UNNEST(ResearchSubject) AS _ResearchSubject WHERE (_ResearchSubject.associated_project = 'TCGA-BRCA')
Offset: 0
Count: 100
Total Row Count: 1098
More pages: True



In [19]:
ar = age.run()
print(ar)

Getting results from database

Total execution time: 24905 ms

QueryID: cb5d6468-deaf-4452-b14c-e480004eb50a
Query: SELECT all_v2.* FROM gdc-bq-sample.integration.all_v2 AS all_v2, UNNEST(ResearchSubject) AS _ResearchSubject, UNNEST(_ResearchSubject.Diagnosis) AS _Diagnosis WHERE (_Diagnosis.age_at_diagnosis > -50*365)
Offset: 0
Count: 100
Total Row Count: 40545
More pages: True



In [20]:
q = project.And(age)
r = q.run()
print(r)

Getting results from database

Total execution time: 28314 ms

QueryID: 011ca250-fe22-4207-a613-4fbfe6304f91
Query: SELECT all_v2.* FROM gdc-bq-sample.integration.all_v2 AS all_v2, UNNEST(ResearchSubject) AS _ResearchSubject, UNNEST(_ResearchSubject.Diagnosis) AS _Diagnosis WHERE ((_ResearchSubject.associated_project = 'TCGA-BRCA') AND (_Diagnosis.age_at_diagnosis > -50*365))
Offset: 0
Count: 100
Total Row Count: 1082
More pages: True



<font color=green> So there ar 1082 cases over 50.  However, we know that IDC doesn't have ResearchSubjects, so need a different approach</font>

In [3]:
project = Q('ResearchSubject.associated_project = "TCGA-BRCA"')
age = Q('days_to_birth > -50*365')

In [31]:
pr = project.run()
print(pr)

Getting results from database

Total execution time: 10547 ms

QueryID: bf33d2f4-06b0-46a9-8927-76b67dc2944c
Query: SELECT all_v2.* FROM gdc-bq-sample.integration.all_v2 AS all_v2, UNNEST(ResearchSubject) AS _ResearchSubject WHERE (_ResearchSubject.associated_project = 'TCGA-BRCA')
Offset: 0
Count: 100
Total Row Count: 1098
More pages: True



In [32]:
ar = age.run()
print(ar)

Getting results from database

Total execution time: 9205 ms

QueryID: 0cf1f5f9-bce1-4c94-ad27-eb9b9dd2778b
Query: SELECT all_v2.* FROM gdc-bq-sample.integration.all_v2 AS all_v2 WHERE (all_v2.days_to_birth > -50*365)
Offset: 0
Count: 100
Total Row Count: 12682
More pages: True



In [4]:
idc = Q('identifier.system = "IDC"')
ir = idc.run()
print(ir)

Getting results from database

Total execution time: 14399 ms

QueryID: 2ba4dbae-a002-4eeb-a56d-2f6c699530f9
Query: SELECT all_v2.* FROM gdc-bq-sample.integration.all_v2 AS all_v2, UNNEST(identifier) AS _identifier WHERE (_identifier.system = 'IDC')
Offset: 0
Count: 100
Total Row Count: 43428
More pages: True



In [33]:
q = project.And(age)
r = q.run()
print(r)

Getting results from database

Total execution time: 19482 ms

QueryID: 660db860-082c-4f0d-9a98-2810373c706a
Query: SELECT all_v2.* FROM gdc-bq-sample.integration.all_v2 AS all_v2, UNNEST(ResearchSubject) AS _ResearchSubject WHERE ((_ResearchSubject.associated_project = 'TCGA-BRCA') AND (all_v2.days_to_birth > -50*365))
Offset: 0
Count: 100
Total Row Count: 288
More pages: True



In [5]:
q = idc.From(project.And(age))
r = q.run()
print(r)

Getting results from database

Total execution time: 14074 ms

QueryID: 367e05e8-83d5-47aa-b44f-d90cb438d0d8
Query: SELECT all_v2.* FROM (SELECT all_v2.* FROM gdc-bq-sample.integration.all_v2 AS all_v2, UNNEST(ResearchSubject) AS _ResearchSubject WHERE ((_ResearchSubject.associated_project = 'TCGA-BRCA') AND (all_v2.days_to_birth > -50*365))) AS all_v2, UNNEST(identifier) AS _identifier WHERE (_identifier.system = 'IDC')
Offset: 0
Count: 51
Total Row Count: 51
More pages: False



In [10]:
r.pretty_print(0)

{'File': [{'associated_project': 'TCGA-BRCA',
           'byte_size': '21968',
           'checksum': '9f8a71d0bd68b3d6f51da92919ef9c73',
           'data_category': 'Proteome Profiling',
           'data_type': 'Protein Expression Quantification',
           'drs_uri': 'drs://dg.4DFC:c840cf2d-36d4-4e0a-8053-4d8897cfa9cf',
           'file_format': 'TSV',
           'id': 'c840cf2d-36d4-4e0a-8053-4d8897cfa9cf',
           'identifier': [{'system': 'GDC',
                           'value': 'c840cf2d-36d4-4e0a-8053-4d8897cfa9cf'}],
           'label': 'TCGA-E2-A1B5-01A-31-A13E-20_RPPA_data.tsv'},
          {'associated_project': 'TCGA-BRCA',
           'byte_size': '175093',
           'checksum': '5f22ab47a7aa08352f38d1f275053f61',
           'data_category': 'Simple Nucleotide Variation',
           'data_type': 'Raw Simple Somatic Mutation',
           'drs_uri': 'drs://dg.4DFC:438d76d9-9248-4923-b6c4-276d8997093b',
           'file_format': 'VCF',
           'id': '438d76d9-9248-492

In [18]:
data_cat = {}
data_type = {}
for result in r:
    for file in result['File']:
        if file['data_type'] in data_type.keys():
            counter = data_type[file['data_type']]
            counter += 1
            data_type[file['data_type']] = counter
        else:
            data_type[file['data_type']] = 1
        if file['data_category'] in data_cat.keys():
            counter = data_cat[file['data_category']]
            counter += 1
            data_cat[file['data_category']] = counter
        else:
            data_cat[file['data_category']] = 1

In [19]:
print(data_cat)

{'Proteome Profiling': 45, 'Simple Nucleotide Variation': 784, 'Biospecimen': 612, 'Clinical': 512, 'Copy Number Variation': 377, 'Sequencing Reads': 245, 'Transcriptome Profiling': 333, 'DNA Methylation': 67, 'Imaging': 87608, 'Processed Mass Spectra': 361, 'Raw Mass Spectra': 366, 'Peptide Spectral Matches': 733}


In [20]:
print(data_type)

{'Protein Expression Quantification': 45, 'Raw Simple Somatic Mutation': 200, 'Biospecimen Supplement': 612, 'Clinical Supplement': 512, 'Masked Copy Number Segment': 110, 'Gene Level Copy Number': 53, 'Masked Somatic Mutation': 192, 'Aligned Reads': 245, 'Gene Expression Quantification': 201, 'Annotated Somatic Mutation': 200, 'Allele-specific Copy Number Segment': 53, 'Isoform Expression Quantification': 66, 'Aggregated Somatic Mutation': 192, 'miRNA Expression Quantification': 66, 'Methylation Beta Value': 67, 'Copy Number Segment': 110, 'Gene Level Copy Number Scores': 51, 'MR': 87549, 'SR': 59, 'Open Standard': 727, 'Proprietary': 366, 'Text': 367}
