In [1]:
import pandas as pd
from os import listdir
from statistics import mean
import numpy as np

In [2]:
# List the entries of the data directory; per the output below it holds one
# '<CODE>-US' folder per project alongside helper scripts and editor/OS files.
listdir('data')

['OV-US',
 'organize_data.py',
 'COAD-US',
 '.DS_Store',
 'list_data_dirs.py',
 'UCEC-US',
 'PRAD-US',
 'KIRC-US',
 'SKCM-US',
 'THCA-US',
 'LGG-US',
 'GBM-US',
 'LUSC-US',
 'BLCA-US',
 'BRCA-US',
 '.remove_duplicates.py.swo']

In [11]:
def mutation_counts(df):
    """Count how often each mutation id occurs in one mutation table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns ``project_code``, ``icgc_donor_id``
        and ``icgc_mutation_id``. Only the first row's ``project_code`` and
        ``icgc_donor_id`` are read.

    Returns
    -------
    pandas.DataFrame
        One row per unique mutation id (in ``np.unique`` sorted order) with
        the columns ``CancerType``, ``DonorId``, ``MutationId`` and ``Count``.
        ``DonorId`` is the donor id with the project code appended.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    if len(df) == 0:
        raise ValueError("mutation_counts() needs a DataFrame with at least one row")

    # Positional access: `df.project_code[0]` is a *label* lookup and fails
    # with KeyError whenever the index does not contain the label 0.
    cancer_type = df['project_code'].iloc[0]
    donor_id = df['icgc_donor_id'].iloc[0] + cancer_type

    unique_ids, counts = np.unique(df['icgc_mutation_id'].to_numpy(),
                                   return_counts=True)

    # The scalar cancer type / donor id are broadcast to every row.
    return pd.DataFrame({"CancerType": cancer_type,
                         "DonorId": donor_id,
                         "MutationId": unique_ids,
                         "Count": counts})

In [12]:
def get_mutation_frames(cancer_code):
    """Return the mutation counts of every file in one project folder.

    Each file in ``data/<cancer_code>/`` is read as a tab-separated table and
    passed through ``mutation_counts``; the per-file results are stacked.

    Parameters
    ----------
    cancer_code : str
        Name of the sub-folder of ``data/`` to read (e.g. ``'BRCA-US'``).

    Returns
    -------
    pandas.DataFrame or None
        The stacked per-file count tables (row labels are kept as produced
        per file, so they repeat), or ``None`` when the folder has no files.
    """
    DATA_PATH = 'data/'
    PATH = DATA_PATH + cancer_code + '/'

    # Build every per-file frame first and concatenate once: calling
    # pd.concat inside the loop re-copies the accumulated frame each time.
    frames = [mutation_counts(pd.read_csv(PATH + file, sep='\t'))
              for file in listdir(PATH)]

    if not frames:
        # Empty folder: keep returning None, as before.
        return None

    return pd.concat(frames)

In [13]:
# Go through all cancer types and get mutation dataframes
# Concatenate all dataframes
# This results in one dataframe for all cancer types

folders = [file for file in listdir('data') if 'US' in file]

# Collect the per-project frames and concatenate once; concatenating inside
# the loop copies the growing frame on every iteration.
# get_mutation_frames returns None for an empty folder, so drop those.
cancer_frames = [frame
                 for frame in (get_mutation_frames(cancer) for cancer in folders)
                 if frame is not None]

# ignore_index=True yields a fresh 0..n-1 index without mutating in place.
all_cancer_df = pd.concat(cancer_frames, ignore_index=True)

  if (await self.run_code(code, result,  async_=asy)):


In [14]:
all_cancer_df

Unnamed: 0,CancerType,DonorId,MutationId,Count
0,OV-US,DO28611OV-US,MU129325554,4
1,OV-US,DO28611OV-US,MU130100811,29
2,OV-US,DO28611OV-US,MU130473495,3
3,OV-US,DO28611OV-US,MU130475900,17
4,OV-US,DO28611OV-US,MU130477835,8
...,...,...,...,...
2534368,BRCA-US,DO5857BRCA-US,MU5271753,16
2534369,BRCA-US,DO5857BRCA-US,MU5272943,7
2534370,BRCA-US,DO5857BRCA-US,MU5272953,7
2534371,BRCA-US,DO5857BRCA-US,MU5468694,7


In [23]:
unique_mutations, counts = np.unique(np.array(all_cancer_df.MutationId), return_counts=True)

In [33]:
mutation_df = pd.DataFrame( {'MutationID': unique_mutations, 
                             'Counts': counts} )

In [42]:
mutation_df_filtered = mutation_df[mutation_df.Counts > 4]

In [43]:
# mutation_df_filtered is a slice of mutation_df; sorting it with inplace=True
# raises SettingWithCopyWarning. sort_values returns a new frame, so rebind
# the name instead (this also keeps the cell safe to re-run).
mutation_df_filtered = mutation_df_filtered.sort_values(by=['Counts'])
mutation_df_filtered

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,MutationID,Counts
1186458,MU1899169,5
1236057,MU1957569,5
1236120,MU1957631,5
1236182,MU1957694,5
1236245,MU1957756,5
...,...,...
1424496,MU4468,153
1631077,MU866,238
1587418,MU62030,365
1046482,MU131898417,400


In [44]:
mutation_ids = mutation_df_filtered.MutationID

In [48]:
master_mutations = np.array(mutation_ids)

In [58]:
# Scratch cell: tries out the building blocks of a per-donor indicator row.
# Column layout: two identifier columns followed by one column per retained mutation id.
columns = ['DonorIDs', 'CancerType'] + list(master_mutations)
# NOTE(review): this rebinds the name `df` to an empty frame — check that no
# other cell relies on a previously loaded `df` when run in order.
df = pd.DataFrame(columns = columns)
# np.where returns a tuple of index arrays; 'IS675' looks like a placeholder id — TODO confirm.
temp = np.where(master_mutations == 'IS675')
# Zero vector with one entry per master mutation (this is the displayed cell output).
np.zeros((len(master_mutations)))

array([0., 0., 0., ..., 0., 0., 0.])

In [59]:
def helper(df):
    """Extract the project code, donor key and unique mutation ids of a table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns ``project_code``, ``icgc_donor_id``
        and ``icgc_mutation_id``. Only the first row's ``project_code`` and
        ``icgc_donor_id`` are read.

    Returns
    -------
    tuple
        ``(cancer_type, donor_id, unique_ids)`` where ``donor_id`` is the
        donor id with the project code appended and ``unique_ids`` is the
        sorted array of distinct mutation ids.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    if len(df) == 0:
        raise ValueError("helper() needs a DataFrame with at least one row")

    # Positional access: `df.project_code[0]` is a *label* lookup and fails
    # with KeyError whenever the index does not contain the label 0.
    cancer_type = df['project_code'].iloc[0]
    donor_id = df['icgc_donor_id'].iloc[0] + cancer_type
    unique_ids = np.unique(df['icgc_mutation_id'].to_numpy())
    return cancer_type, donor_id, unique_ids

In [77]:
def katrinas_function(cancer_code, master_mutations):
    """Build a donor-by-mutation indicator table for one project folder.

    Every file in ``data/<cancer_code>/`` is read as a tab-separated table and
    turned into one row: the donor key, the project code, and for each entry
    of ``master_mutations`` a ``1.0`` if that mutation id occurs in the file
    and ``0.0`` otherwise.

    Parameters
    ----------
    cancer_code : str
        Name of the sub-folder of ``data/`` to read (e.g. ``'BRCA-US'``).
    master_mutations : array-like of str
        Mutation ids that become the indicator columns, in this order.

    Returns
    -------
    pandas.DataFrame
        Columns ``DonorIDs``, ``CancerType`` and one column per master
        mutation; one row per file. Empty (columns only) if the folder has
        no files.
    """
    DATA_PATH = 'data/'
    PATH = DATA_PATH + cancer_code + '/'

    columns = ['DonorIDs', 'CancerType'] + list(master_mutations)
    master_array = np.asarray(master_mutations)

    rows = []
    for file in listdir(PATH):
        df = pd.read_csv(PATH + file, sep='\t')
        cancer_type, donor_id, unique_ids = helper(df)

        # One vectorised membership test replaces the per-id np.where loop.
        # (The old `len(temp) > 0` guard was always true: np.where returns a
        # 1-tuple, so it never distinguished "found" from "not found".)
        binary = np.isin(master_array, unique_ids).astype(float)

        rows.append([donor_id, cancer_type] + list(binary))

    # Build the frame once instead of concatenating a new row per file,
    # which re-copies the whole table on every iteration.
    return pd.DataFrame(rows, columns=columns)



In [78]:
katrinas_function('BRCA-US', master_mutations)

Unnamed: 0,DonorIDs,CancerType,MU1899169,MU1957569,MU1957631,MU1957694,MU1957756,MU1957895,MU1957974,MU1958009,...,MU130696800,MU122201,MU129795540,MU129540995,MU4885648,MU4468,MU866,MU62030,MU131898417,MU131867962
0,DO5944BRCA-US,BRCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,DO3067BRCA-US,BRCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,DO50036BRCA-US,BRCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,DO3275BRCA-US,BRCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DO2978BRCA-US,BRCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,DO1697BRCA-US,BRCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1014,DO3793BRCA-US,BRCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1015,DO3400BRCA-US,BRCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1016,DO219593BRCA-US,BRCA-US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
unique_donors = np.unique(np.array(all_cancer_df.DonorId))

In [18]:
len(unique_donors)

3364

In [None]:
# NOTE(review): unfinished cell (it has no execution count). Indexing with the
# integer `Count` column is not a boolean mask; a comparison such as
# `all_cancer_df.Count > <threshold>` was presumably intended — TODO confirm
# the threshold before running.
all_cancer_df_filtered = all_cancer_df[all_cancer_df.Count]

In [3]:
"""
This function looks at each class and computes the number of donors, 
average number of mutations and features.
"""
def look(cancer_code):
    DATA_PATH = 'data/'
    PATH = DATA_PATH + cancer_code + '/'
    
    feat = []
    ids = []
    donors = []
    chromosomes = []
    
    all_files = [file for file in listdir(PATH)]
    
    for file in all_files:
        df = pd.read_csv(PATH + file, sep='\t')
        feat.append(len(df.columns))
        ids.append(len(set(df['icgc_mutation_id'])))
        donors.append(len(set(df['icgc_donor_id'])))
        
        # Repeatedly union the set of chromosomes for each donor
        chromosomes += list(df['chromosome'].apply(str))
    
    print('*' * 40)
    print('Project code: ', cancer_code)
    
    try:
        print('Average number of features: {}'.format(mean(feat)))
        print('Average number of mutations: {}'.format(mean(ids)))
        print('Number of donors: {}'.format(sum(donors)))
        print('Chromosomes: {}'.format(set(chromosomes)))
    except:
        print(NULL)

In [16]:
# Print the look() summary for every entry of data/ whose name contains 'US'
# (the substring test is what separates project folders from the scripts and
# hidden files that also live in data/).
folders = [file for file in listdir('data') if 'US' in file]
for cancer in folders:
    look(cancer)

****************************************
Project code:  OV-US
Average number of features: 42
Average number of mutations: 105.53217821782178
Number of donors: 404
Chromosomes: {'3', '5', '10', '1', '20', '19', '12', '11', '16', '9', '14', '18', '8', '13', '4', '7', '15', 'X', '22', '17', '21', '2', '6'}
****************************************
Project code:  COAD-US
Average number of features: 42
Average number of mutations: 1949.2
Number of donors: 280
Chromosomes: {'3', '5', '10', '1', '20', '19', '12', '11', '16', '14', '9', '18', 'Y', '8', '13', '4', '7', '15', 'X', '22', '17', '21', '2', '6'}
****************************************
Project code:  UCEC-US
Average number of features: 42
Average number of mutations: 3093.343108504399
Number of donors: 341
Chromosomes: {'3', '5', '10', '1', '20', '19', '12', '11', '16', '14', '9', '18', '8', '13', '4', '7', '15', 'X', '22', '17', '21', '2', '6'}
****************************************
Project code:  PRAD-US
Average number of feature

### Reading in each file
Each file represents one individual. We could probably use the project_code feature as our target/output.

In [4]:
df = pd.read_csv('data/BRCA-US/simple_somatic_mutation.open-2020-03-02T154602.752.tsv', sep='\t')

In [5]:
print('Number of features: {}'.format(len(df.columns)))
print(df.columns)

Number of features: 42
Index(['icgc_mutation_id', 'icgc_donor_id', 'project_code', 'icgc_specimen_id',
       'icgc_sample_id', 'matched_icgc_sample_id', 'submitted_sample_id',
       'submitted_matched_sample_id', 'chromosome', 'chromosome_start',
       'chromosome_end', 'chromosome_strand', 'assembly_version',
       'mutation_type', 'reference_genome_allele', 'mutated_from_allele',
       'mutated_to_allele', 'quality_score', 'probability', 'total_read_count',
       'mutant_allele_read_count', 'verification_status',
       'verification_platform', 'biological_validation_status',
       'biological_validation_platform', 'consequence_type', 'aa_mutation',
       'cds_mutation', 'gene_affected', 'transcript_affected',
       'gene_build_version', 'platform', 'experimental_protocol',
       'sequencing_strategy', 'base_calling_algorithm', 'alignment_algorithm',
       'variation_calling_algorithm', 'other_analysis_algorithm',
       'seq_coverage', 'raw_data_repository', 'raw_data_acc

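Notice that a single donor has duplicate icgc_mutation_id values. Not sure yet what this means; in the rows shown below, the repeats differ in fields such as icgc_sample_id and sequencing_strategy.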

Number of mutations and genes affected do not match :(

In [30]:
genes, counts = np.unique(df.gene_affected.to_numpy(), return_counts=True)
genes.shape

(294,)

In [7]:
# Mutation ID MU23549
df[df['icgc_mutation_id'] == 'MU23549']

Unnamed: 0,icgc_mutation_id,icgc_donor_id,project_code,icgc_specimen_id,icgc_sample_id,matched_icgc_sample_id,submitted_sample_id,submitted_matched_sample_id,chromosome,chromosome_start,...,experimental_protocol,sequencing_strategy,base_calling_algorithm,alignment_algorithm,variation_calling_algorithm,other_analysis_algorithm,seq_coverage,raw_data_repository,raw_data_accession,initial_data_release_date
4,MU23549,DO5200,BRCA-US,SP10944,SA60228,SA60324,TCGA-BH-A18U-01A-21D-A12B-09,TCGA-BH-A18U-11A-23D-A12B-09,7,70886058,...,,WXS,,,TCGA-MC3 https://gdc.cancer.gov/about-data/pub...,,,GDC,TCGA-BH-A18U-01A-21D-A12B-09,
5,MU23549,DO5200,BRCA-US,SP10944,SA60228,SA60324,TCGA-BH-A18U-01A-21D-A12B-09,TCGA-BH-A18U-11A-23D-A12B-09,7,70886058,...,,WXS,,,TCGA-MC3 https://gdc.cancer.gov/about-data/pub...,,,GDC,TCGA-BH-A18U-01A-21D-A12B-09,
6,MU23549,DO5200,BRCA-US,SP10944,SA60228,SA60324,TCGA-BH-A18U-01A-21D-A12B-09,TCGA-BH-A18U-11A-23D-A12B-09,7,70886058,...,,WXS,,,TCGA-MC3 https://gdc.cancer.gov/about-data/pub...,,,GDC,TCGA-BH-A18U-01A-21D-A12B-09,
7,MU23549,DO5200,BRCA-US,SP10944,SA60252,SA60348,TCGA-BH-A18U-01A-21D-A19H-09,TCGA-BH-A18U-11A-23D-A19H-09,7,70886058,...,,WGS,,,PCAWG Consensus SNV-MNV caller,,,,FI30202:FI30201,
8,MU23549,DO5200,BRCA-US,SP10944,SA60252,SA60348,TCGA-BH-A18U-01A-21D-A19H-09,TCGA-BH-A18U-11A-23D-A19H-09,7,70886058,...,,WGS,,,PCAWG Consensus SNV-MNV caller,,,,FI30202:FI30201,
9,MU23549,DO5200,BRCA-US,SP10944,SA60252,SA60348,TCGA-BH-A18U-01A-21D-A19H-09,TCGA-BH-A18U-11A-23D-A19H-09,7,70886058,...,,WGS,,,PCAWG Consensus SNV-MNV caller,,,,FI30202:FI30201,


For the files I have looked at (only a few so far), there are 42 features.

# Some EDA

In [13]:
df.columns

Index(['icgc_mutation_id', 'icgc_donor_id', 'project_code', 'icgc_specimen_id',
       'icgc_sample_id', 'matched_icgc_sample_id', 'submitted_sample_id',
       'submitted_matched_sample_id', 'chromosome', 'chromosome_start',
       'chromosome_end', 'chromosome_strand', 'assembly_version',
       'mutation_type', 'reference_genome_allele', 'mutated_from_allele',
       'mutated_to_allele', 'quality_score', 'probability', 'total_read_count',
       'mutant_allele_read_count', 'verification_status',
       'verification_platform', 'biological_validation_status',
       'biological_validation_platform', 'consequence_type', 'aa_mutation',
       'cds_mutation', 'gene_affected', 'transcript_affected',
       'gene_build_version', 'platform', 'experimental_protocol',
       'sequencing_strategy', 'base_calling_algorithm', 'alignment_algorithm',
       'variation_calling_algorithm', 'other_analysis_algorithm',
       'seq_coverage', 'raw_data_repository', 'raw_data_accession',
       'initia

# Choices of features
- Chromosome start and end may not be helpful though there could exist a relationship between which chromosome and the position of mutation. 
- Do not know what CDS mutation or AA mutation is. Many NaNs here.
- Do not know how the gene_affected feature is encoded.
- Project_code can be the target output.
- Possibly replace the start and end features with the length of the mutation. For a deletion, we could represent the length as a negative number.

In [49]:
# Candidate feature subset, picked after a quick look at the columns. 
# Change as you see fit.

features = ['icgc_mutation_id', 
            'icgc_donor_id',
            'icgc_sample_id', 
            'matched_icgc_sample_id', 
            'submitted_sample_id',
            'submitted_matched_sample_id',
            'chromosome', 
            'chromosome_start',
            'chromosome_end',
            'chromosome_strand',
            'reference_genome_allele',
            'mutated_from_allele',
            'mutated_to_allele',
            'consequence_type',
            'gene_affected',
            'total_read_count',
            'project_code']

# NOTE(review): the result of this np.unique call is discarded — only the last
# expression of a cell is displayed. Move it to its own cell to see the values.
np.unique(df[features].chromosome_strand)
# Displayed output: position and allele columns for the selected features.
df[features][['chromosome', 
              'chromosome_start', 
              'reference_genome_allele', 
              'mutated_from_allele', 
              'mutated_to_allele']]

# Open question: why would a donor have two or more submitted samples?


Unnamed: 0,chromosome,chromosome_start,reference_genome_allele,mutated_from_allele,mutated_to_allele
0,5,58021869,C,C,A
1,5,58021869,C,C,A
2,5,58021869,C,C,A
3,5,58021869,C,C,A
4,7,70886058,G,G,A
...,...,...,...,...,...
2192,3,48665399,C,C,T
2193,3,48665399,C,C,T
2194,3,48665399,C,C,T
2195,3,48665399,C,C,T


# Mutation IDs
- There are many duplicates. There are only 213 unique mutation IDs for this donor.

In [23]:
print('There are {} unique mutation IDs'.format(len(set(df[features]['icgc_mutation_id']))))

There are 213 unique mutation IDs


In [22]:
print('There is only {} donor in this file'.format(len(set(df[features]['icgc_donor_id']))))

There is only 1 donor in this file


In [9]:
data = df[features]

In [10]:
data.head()

Unnamed: 0,icgc_mutation_id,icgc_donor_id,project_code,icgc_specimen_id,matched_icgc_sample_id,chromosome,chromosome_start,chromosome_end,chromosome_strand,assembly_version,...,probability,total_read_count,mutant_allele_read_count,verification_status,consequence_type,aa_mutation,cds_mutation,gene_affected,transcript_affected,gene_build_version
0,MU22195,DO5200,BRCA-US,SP10944,SA60348,5,58021869,58021869,1,GRCh37,...,,68,12,not tested,exon_variant,,,ENSG00000152932,ENST00000507977,75
1,MU22195,DO5200,BRCA-US,SP10944,SA60348,5,58021869,58021869,1,GRCh37,...,,68,12,not tested,missense_variant,A98D,293C>A,ENSG00000152932,ENST00000282878,75
2,MU22195,DO5200,BRCA-US,SP10944,SA60324,5,58021869,58021869,1,GRCh37,...,,349,49,not tested,exon_variant,,,ENSG00000152932,ENST00000507977,75
3,MU22195,DO5200,BRCA-US,SP10944,SA60324,5,58021869,58021869,1,GRCh37,...,,349,49,not tested,missense_variant,A98D,293C>A,ENSG00000152932,ENST00000282878,75
4,MU23549,DO5200,BRCA-US,SP10944,SA60324,7,70886058,70886058,1,GRCh37,...,,53,37,not tested,stop_gained,W310*,929G>A,ENSG00000185274,ENST00000333538,75


## Not all mutations are 1bp
Some mutations involve longer sequences!

In [11]:
df[df['mutation_type'] == 'deletion of <=200bp'][['mutation_type', 'reference_genome_allele', 'mutated_from_allele', 'mutated_to_allele']].head(5)

Unnamed: 0,mutation_type,reference_genome_allele,mutated_from_allele,mutated_to_allele
159,deletion of <=200bp,TGCGGAGATTCTCTTCCTC,TGCGGAGATTCTCTTCCTC,-
160,deletion of <=200bp,TGCGGAGATTCTCTTCCTC,TGCGGAGATTCTCTTCCTC,-
161,deletion of <=200bp,TGCGGAGATTCTCTTCCTC,TGCGGAGATTCTCTTCCTC,-
162,deletion of <=200bp,TGCGGAGATTCTCTTCCTC,TGCGGAGATTCTCTTCCTC,-
163,deletion of <=200bp,TGCGGAGATTCTCTTCCTC,TGCGGAGATTCTCTTCCTC,-


In [12]:
set(df['consequence_type'])

{'3_prime_UTR_variant',
 '5_prime_UTR_premature_start_codon_gain_variant',
 '5_prime_UTR_variant',
 'disruptive_inframe_deletion',
 'downstream_gene_variant',
 'exon_variant',
 'frameshift_variant',
 'inframe_deletion',
 'intron_variant',
 'missense_variant',
 'splice_acceptor_variant',
 'splice_donor_variant',
 'splice_region_variant',
 'stop_gained',
 'synonymous_variant',
 'upstream_gene_variant'}