In [12]:
import pandas as pd
from os import listdir
from statistics import mean

In [15]:
# Show every entry in the data directory; the listing includes project
# folders as well as stray scripts and hidden files.
listdir('data')

['OV-US',
 'organize_data.py',
 'COAD-US',
 '.DS_Store',
 'list_data_dirs.py',
 'UCEC-US',
 'PRAD-US',
 'KIRC-US',
 'SKCM-US',
 'THCA-US',
 'LGG-US',
 'GBM-US',
 'LUSC-US',
 'BLCA-US',
 'BRCA-US',
 '.remove_duplicates.py.swo']

In [40]:
def look(cancer_code):
    """Print summary statistics for the mutation files of one project.

    Every non-hidden file in ``data/<cancer_code>/`` is read as a
    tab-separated table. The function then prints:

    * the mean number of columns per file,
    * the mean number of distinct ``icgc_mutation_id`` values per file,
    * the sum over files of the per-file distinct ``icgc_donor_id`` counts
      (a donor appearing in two files is therefore counted twice).

    Parameters
    ----------
    cancer_code : str
        Name of the project sub-directory inside ``data/`` (e.g. 'BRCA-US').

    Returns
    -------
    None
        Results are printed; nothing is returned.

    Raises
    ------
    FileNotFoundError
        If ``data/<cancer_code>/`` does not exist (raised by ``listdir``).
    KeyError
        If a file lacks the ``icgc_mutation_id`` or ``icgc_donor_id`` column.
    """
    DATA_PATH = 'data/'
    PATH = DATA_PATH + cancer_code + '/'

    feat = []
    ids = []
    donors = []

    # Hidden entries ('.DS_Store', editor swap files, ...) are not data
    # tables; trying to parse them would abort the whole summary.
    all_files = [file for file in listdir(PATH) if not file.startswith('.')]

    for file in all_files:
        df = pd.read_csv(PATH + file, sep='\t')
        feat.append(len(df.columns))
        ids.append(len(set(df['icgc_mutation_id'])))
        donors.append(len(set(df['icgc_donor_id'])))

    print('*' * 40)
    print('Project code: ', cancer_code)

    # mean() raises on an empty sequence, so report the empty case explicitly
    # instead of relying on a catch-all handler.
    if not all_files:
        print('No data files found.')
        return

    print('Average number of features: {}'.format(mean(feat)))
    print('Average number of mutations: {}'.format(mean(ids)))
    print('Number of donors: {}'.format(sum(donors)))

In [41]:
# Keep the entries of data/ whose name contains 'US', then print the
# summary for each of them in listing order.
folders = list(filter(lambda name: 'US' in name, listdir('data')))

for cancer in folders:
    look(cancer)

****************************************
Project code:  OV-US
Average number of features: 42
Average number of mutations: 105.53217821782178
Number of donors: 404
****************************************
Project code:  COAD-US
Average number of features: 42
Average number of mutations: 1949.2
Number of donors: 280
****************************************
Project code:  UCEC-US
Average number of features: 42
Average number of mutations: 3093.343108504399
Number of donors: 341
****************************************
Project code:  PRAD-US
Average number of features: 42
Average number of mutations: 25.90277777777778
Number of donors: 144
****************************************
Project code:  KIRC-US
Average number of features: 42
Average number of mutations: 106.89270386266094
Number of donors: 233
****************************************
Project code:  SKCM-US
Average number of features: 42
Average number of mutations: 1890.2366666666667
Number of donors: 300
**************************

### Reading in each file
Each file represents one individual. We could probably use the project_code feature as our target/output.

In [3]:
# Load a single tab-separated BRCA-US file to inspect its structure.
df = pd.read_csv(
    'data/BRCA-US/simple_somatic_mutation.open-2020-03-02T154602.752.tsv',
    sep='\t',
)

In [4]:
# Report how many columns the file has, then list their labels.
n_features = df.shape[1]
print('Number of features: {}'.format(n_features))
print(df.columns)

Number of features: 42
Index(['icgc_mutation_id', 'icgc_donor_id', 'project_code', 'icgc_specimen_id',
       'icgc_sample_id', 'matched_icgc_sample_id', 'submitted_sample_id',
       'submitted_matched_sample_id', 'chromosome', 'chromosome_start',
       'chromosome_end', 'chromosome_strand', 'assembly_version',
       'mutation_type', 'reference_genome_allele', 'mutated_from_allele',
       'mutated_to_allele', 'quality_score', 'probability', 'total_read_count',
       'mutant_allele_read_count', 'verification_status',
       'verification_platform', 'biological_validation_status',
       'biological_validation_platform', 'consequence_type', 'aa_mutation',
       'cds_mutation', 'gene_affected', 'transcript_affected',
       'gene_build_version', 'platform', 'experimental_protocol',
       'sequencing_strategy', 'base_calling_algorithm', 'alignment_algorithm',
       'variation_calling_algorithm', 'other_analysis_algorithm',
       'seq_coverage', 'raw_data_repository', 'raw_data_acc

In [5]:
# Preview the first five rows of the loaded file.
df.head(5)

Unnamed: 0,icgc_mutation_id,icgc_donor_id,project_code,icgc_specimen_id,icgc_sample_id,matched_icgc_sample_id,submitted_sample_id,submitted_matched_sample_id,chromosome,chromosome_start,...,experimental_protocol,sequencing_strategy,base_calling_algorithm,alignment_algorithm,variation_calling_algorithm,other_analysis_algorithm,seq_coverage,raw_data_repository,raw_data_accession,initial_data_release_date
0,MU22195,DO5200,BRCA-US,SP10944,SA60252,SA60348,TCGA-BH-A18U-01A-21D-A19H-09,TCGA-BH-A18U-11A-23D-A19H-09,5,58021869,...,,WGS,,,PCAWG Consensus SNV-MNV caller,,,,FI30202:FI30201,
1,MU22195,DO5200,BRCA-US,SP10944,SA60252,SA60348,TCGA-BH-A18U-01A-21D-A19H-09,TCGA-BH-A18U-11A-23D-A19H-09,5,58021869,...,,WGS,,,PCAWG Consensus SNV-MNV caller,,,,FI30202:FI30201,
2,MU22195,DO5200,BRCA-US,SP10944,SA60228,SA60324,TCGA-BH-A18U-01A-21D-A12B-09,TCGA-BH-A18U-11A-23D-A12B-09,5,58021869,...,,WXS,,,TCGA-MC3 https://gdc.cancer.gov/about-data/pub...,,,GDC,TCGA-BH-A18U-01A-21D-A12B-09,
3,MU22195,DO5200,BRCA-US,SP10944,SA60228,SA60324,TCGA-BH-A18U-01A-21D-A12B-09,TCGA-BH-A18U-11A-23D-A12B-09,5,58021869,...,,WXS,,,TCGA-MC3 https://gdc.cancer.gov/about-data/pub...,,,GDC,TCGA-BH-A18U-01A-21D-A12B-09,
4,MU23549,DO5200,BRCA-US,SP10944,SA60228,SA60324,TCGA-BH-A18U-01A-21D-A12B-09,TCGA-BH-A18U-11A-23D-A12B-09,7,70886058,...,,WXS,,,TCGA-MC3 https://gdc.cancer.gov/about-data/pub...,,,GDC,TCGA-BH-A18U-01A-21D-A12B-09,


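Notice that this one donor has several rows with the same icgc_mutation_id. Some of the repeated rows differ in sample ID and sequencing strategy (WGS vs. WXS), while others look identical in the columns displayed. Not sure yet what this means.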

In [6]:
# Every row recorded for mutation ID MU22195
df.loc[df['icgc_mutation_id'].eq('MU22195')]

Unnamed: 0,icgc_mutation_id,icgc_donor_id,project_code,icgc_specimen_id,icgc_sample_id,matched_icgc_sample_id,submitted_sample_id,submitted_matched_sample_id,chromosome,chromosome_start,...,experimental_protocol,sequencing_strategy,base_calling_algorithm,alignment_algorithm,variation_calling_algorithm,other_analysis_algorithm,seq_coverage,raw_data_repository,raw_data_accession,initial_data_release_date
0,MU22195,DO5200,BRCA-US,SP10944,SA60252,SA60348,TCGA-BH-A18U-01A-21D-A19H-09,TCGA-BH-A18U-11A-23D-A19H-09,5,58021869,...,,WGS,,,PCAWG Consensus SNV-MNV caller,,,,FI30202:FI30201,
1,MU22195,DO5200,BRCA-US,SP10944,SA60252,SA60348,TCGA-BH-A18U-01A-21D-A19H-09,TCGA-BH-A18U-11A-23D-A19H-09,5,58021869,...,,WGS,,,PCAWG Consensus SNV-MNV caller,,,,FI30202:FI30201,
2,MU22195,DO5200,BRCA-US,SP10944,SA60228,SA60324,TCGA-BH-A18U-01A-21D-A12B-09,TCGA-BH-A18U-11A-23D-A12B-09,5,58021869,...,,WXS,,,TCGA-MC3 https://gdc.cancer.gov/about-data/pub...,,,GDC,TCGA-BH-A18U-01A-21D-A12B-09,
3,MU22195,DO5200,BRCA-US,SP10944,SA60228,SA60324,TCGA-BH-A18U-01A-21D-A12B-09,TCGA-BH-A18U-11A-23D-A12B-09,5,58021869,...,,WXS,,,TCGA-MC3 https://gdc.cancer.gov/about-data/pub...,,,GDC,TCGA-BH-A18U-01A-21D-A12B-09,


In [7]:
# Every row recorded for mutation ID MU23549
df.loc[df['icgc_mutation_id'].eq('MU23549')]

Unnamed: 0,icgc_mutation_id,icgc_donor_id,project_code,icgc_specimen_id,icgc_sample_id,matched_icgc_sample_id,submitted_sample_id,submitted_matched_sample_id,chromosome,chromosome_start,...,experimental_protocol,sequencing_strategy,base_calling_algorithm,alignment_algorithm,variation_calling_algorithm,other_analysis_algorithm,seq_coverage,raw_data_repository,raw_data_accession,initial_data_release_date
4,MU23549,DO5200,BRCA-US,SP10944,SA60228,SA60324,TCGA-BH-A18U-01A-21D-A12B-09,TCGA-BH-A18U-11A-23D-A12B-09,7,70886058,...,,WXS,,,TCGA-MC3 https://gdc.cancer.gov/about-data/pub...,,,GDC,TCGA-BH-A18U-01A-21D-A12B-09,
5,MU23549,DO5200,BRCA-US,SP10944,SA60228,SA60324,TCGA-BH-A18U-01A-21D-A12B-09,TCGA-BH-A18U-11A-23D-A12B-09,7,70886058,...,,WXS,,,TCGA-MC3 https://gdc.cancer.gov/about-data/pub...,,,GDC,TCGA-BH-A18U-01A-21D-A12B-09,
6,MU23549,DO5200,BRCA-US,SP10944,SA60228,SA60324,TCGA-BH-A18U-01A-21D-A12B-09,TCGA-BH-A18U-11A-23D-A12B-09,7,70886058,...,,WXS,,,TCGA-MC3 https://gdc.cancer.gov/about-data/pub...,,,GDC,TCGA-BH-A18U-01A-21D-A12B-09,
7,MU23549,DO5200,BRCA-US,SP10944,SA60252,SA60348,TCGA-BH-A18U-01A-21D-A19H-09,TCGA-BH-A18U-11A-23D-A19H-09,7,70886058,...,,WGS,,,PCAWG Consensus SNV-MNV caller,,,,FI30202:FI30201,
8,MU23549,DO5200,BRCA-US,SP10944,SA60252,SA60348,TCGA-BH-A18U-01A-21D-A19H-09,TCGA-BH-A18U-11A-23D-A19H-09,7,70886058,...,,WGS,,,PCAWG Consensus SNV-MNV caller,,,,FI30202:FI30201,
9,MU23549,DO5200,BRCA-US,SP10944,SA60252,SA60348,TCGA-BH-A18U-01A-21D-A19H-09,TCGA-BH-A18U-11A-23D-A19H-09,7,70886058,...,,WGS,,,PCAWG Consensus SNV-MNV caller,,,,FI30202:FI30201,


For the files I have looked at (only a few so far), there are 42 features.

# Some EDA

In [13]:
# List the column labels again as a reference for choosing features below.
df.columns

Index(['icgc_mutation_id', 'icgc_donor_id', 'project_code', 'icgc_specimen_id',
       'icgc_sample_id', 'matched_icgc_sample_id', 'submitted_sample_id',
       'submitted_matched_sample_id', 'chromosome', 'chromosome_start',
       'chromosome_end', 'chromosome_strand', 'assembly_version',
       'mutation_type', 'reference_genome_allele', 'mutated_from_allele',
       'mutated_to_allele', 'quality_score', 'probability', 'total_read_count',
       'mutant_allele_read_count', 'verification_status',
       'verification_platform', 'biological_validation_status',
       'biological_validation_platform', 'consequence_type', 'aa_mutation',
       'cds_mutation', 'gene_affected', 'transcript_affected',
       'gene_build_version', 'platform', 'experimental_protocol',
       'sequencing_strategy', 'base_calling_algorithm', 'alignment_algorithm',
       'variation_calling_algorithm', 'other_analysis_algorithm',
       'seq_coverage', 'raw_data_repository', 'raw_data_accession',
       'initia

# Choices of features
- Chromosome start and end may not be helpful on their own, though there could be a relationship between the chromosome and the position of the mutation on it.
- Do not know what CDS mutation or AA mutation is. Many NaNs here.
- Do not know how the gene_affected feature is encoded.
- Project_code can be the target output.
- Possibly replace the start and end features with the length of the mutation. A deletion could then be represented by a negative length.

In [17]:
# Working subset of columns, chosen after a quick look at the column list.
# Change as you see fit; commented-out entries are currently excluded.
features = [
    'icgc_mutation_id',
    'icgc_donor_id',
    'chromosome',
    'chromosome_start',
    'chromosome_end',
    'chromosome_strand',
    'reference_genome_allele',
    'mutated_from_allele',
    'mutated_to_allele',
    'consequence_type',
    # 'aa_mutation',
    # 'cds_mutation',
    'gene_affected',
    # 'quality_score',
    # 'probability',
    'total_read_count',
    # 'seq_coverage',
    'project_code',
]
df.loc[:, features]

Unnamed: 0,icgc_mutation_id,icgc_donor_id,chromosome,chromosome_start,chromosome_end,chromosome_strand,reference_genome_allele,mutated_from_allele,mutated_to_allele,consequence_type,gene_affected,total_read_count,project_code
0,MU22195,DO5200,5,58021869,58021869,1,C,C,A,exon_variant,ENSG00000152932,68,BRCA-US
1,MU22195,DO5200,5,58021869,58021869,1,C,C,A,missense_variant,ENSG00000152932,68,BRCA-US
2,MU22195,DO5200,5,58021869,58021869,1,C,C,A,exon_variant,ENSG00000152932,349,BRCA-US
3,MU22195,DO5200,5,58021869,58021869,1,C,C,A,missense_variant,ENSG00000152932,349,BRCA-US
4,MU23549,DO5200,7,70886058,70886058,1,G,G,A,stop_gained,ENSG00000185274,53,BRCA-US
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2192,MU79396,DO5200,3,48665399,48665399,1,C,C,T,downstream_gene_variant,ENSG00000225697,292,BRCA-US
2193,MU79396,DO5200,3,48665399,48665399,1,C,C,T,downstream_gene_variant,ENSG00000225697,292,BRCA-US
2194,MU79396,DO5200,3,48665399,48665399,1,C,C,T,downstream_gene_variant,ENSG00000225697,292,BRCA-US
2195,MU79396,DO5200,3,48665399,48665399,1,C,C,T,downstream_gene_variant,ENSG00000225697,292,BRCA-US


# Mutation IDs
- There are many duplicates. There are only 213 unique mutation IDs for this donor.

In [23]:
# Count the distinct mutation IDs among the selected feature columns.
unique_mutation_ids = set(df[features]['icgc_mutation_id'])
print('There are {} unique mutation IDs'.format(len(unique_mutation_ids)))

There are 213 unique mutation IDs


In [22]:
# Count the distinct donor IDs among the selected feature columns.
unique_donor_ids = set(df[features]['icgc_donor_id'])
print('There is only {} donor in this file'.format(len(unique_donor_ids)))

There is only 1 donor in this file


In [9]:
# Keep the selected feature columns under their own name.
# NOTE(review): this cell's execution count (In [9]) is lower than that of the
# cell defining `features` (In [17]), so it was run out of order — re-run
# top to bottom to confirm it still works on a fresh kernel.
data = df[features]

In [10]:
# Preview the first rows of the selected columns.
# NOTE(review): the saved output lists columns that are not in the `features`
# list defined above (e.g. assembly_version, transcript_affected), so it
# appears to come from an earlier version of `features` — re-run to refresh.
data.head()

Unnamed: 0,icgc_mutation_id,icgc_donor_id,project_code,icgc_specimen_id,matched_icgc_sample_id,chromosome,chromosome_start,chromosome_end,chromosome_strand,assembly_version,...,probability,total_read_count,mutant_allele_read_count,verification_status,consequence_type,aa_mutation,cds_mutation,gene_affected,transcript_affected,gene_build_version
0,MU22195,DO5200,BRCA-US,SP10944,SA60348,5,58021869,58021869,1,GRCh37,...,,68,12,not tested,exon_variant,,,ENSG00000152932,ENST00000507977,75
1,MU22195,DO5200,BRCA-US,SP10944,SA60348,5,58021869,58021869,1,GRCh37,...,,68,12,not tested,missense_variant,A98D,293C>A,ENSG00000152932,ENST00000282878,75
2,MU22195,DO5200,BRCA-US,SP10944,SA60324,5,58021869,58021869,1,GRCh37,...,,349,49,not tested,exon_variant,,,ENSG00000152932,ENST00000507977,75
3,MU22195,DO5200,BRCA-US,SP10944,SA60324,5,58021869,58021869,1,GRCh37,...,,349,49,not tested,missense_variant,A98D,293C>A,ENSG00000152932,ENST00000282878,75
4,MU23549,DO5200,BRCA-US,SP10944,SA60324,7,70886058,70886058,1,GRCh37,...,,53,37,not tested,stop_gained,W310*,929G>A,ENSG00000185274,ENST00000333538,75


## Not all mutations are 1bp
Some mutations involve longer sequences!

In [11]:
# Show the allele columns for the first five rows typed as short deletions.
allele_cols = ['mutation_type', 'reference_genome_allele', 'mutated_from_allele', 'mutated_to_allele']
deletion_rows = df[df['mutation_type'] == 'deletion of <=200bp']
deletion_rows[allele_cols].head(5)

Unnamed: 0,mutation_type,reference_genome_allele,mutated_from_allele,mutated_to_allele
159,deletion of <=200bp,TGCGGAGATTCTCTTCCTC,TGCGGAGATTCTCTTCCTC,-
160,deletion of <=200bp,TGCGGAGATTCTCTTCCTC,TGCGGAGATTCTCTTCCTC,-
161,deletion of <=200bp,TGCGGAGATTCTCTTCCTC,TGCGGAGATTCTCTTCCTC,-
162,deletion of <=200bp,TGCGGAGATTCTCTTCCTC,TGCGGAGATTCTCTTCCTC,-
163,deletion of <=200bp,TGCGGAGATTCTCTTCCTC,TGCGGAGATTCTCTTCCTC,-


In [12]:
# Collect the distinct consequence_type values present in this file.
set(df['consequence_type'])

{'3_prime_UTR_variant',
 '5_prime_UTR_premature_start_codon_gain_variant',
 '5_prime_UTR_variant',
 'disruptive_inframe_deletion',
 'downstream_gene_variant',
 'exon_variant',
 'frameshift_variant',
 'inframe_deletion',
 'intron_variant',
 'missense_variant',
 'splice_acceptor_variant',
 'splice_donor_variant',
 'splice_region_variant',
 'stop_gained',
 'synonymous_variant',
 'upstream_gene_variant'}