In [1]:
import pandas as pd
from os.path import join
import sys
import os
sys.path.append("/home/sunzehui/GeneNet/")
from config_path import *


In [2]:
# Resolve the two working folders under the project data root (DATA_PATH,
# presumably provided by the star-import from config_path).
processed_dir = join(DATA_PATH, 'processed')
data_dir = join(DATA_PATH, 'raw_data')

In [3]:
# Echo the resolved folders, one per line, as a quick sanity check.
print(processed_dir, data_dir, sep='\n')

/home/sunzehui/GeneNet/_database/processed
/home/sunzehui/GeneNet/_database/raw_data


In [4]:
def prepare_design_matrix_crosstable():
    """Build the sample-by-gene somatic mutation count table and save it as CSV.

    Reads the tab-separated mutation list from ``data_dir``, prints how often
    each ``Variant_Classification`` occurs, applies the filters selected by the
    module-level switches (``filter_silent_muts``, ``filter_missense_muts``,
    ``filter_introns_muts``, ``keep_important_only``, ``truncating_only``) and
    counts the remaining mutations per ``Tumor_Sample_Barcode`` /
    ``Hugo_Symbol`` pair; pairs without a mutation are stored as 0.

    The table is written to ``processed_dir`` as ``somatic_mutations<ext>.csv``,
    where ``ext`` is the module-level file-name suffix. Nothing is returned.
    """
    print('preparing mutations ...')

    sample_col = 'Tumor_Sample_Barcode'
    gene_col = 'Hugo_Symbol'
    class_col = 'Variant_Classification'

    source_path = join(data_dir, '41588_2018_78_MOESM4_ESM.txt')
    mutations = pd.read_csv(source_path, sep='\t', low_memory=False, skiprows=1)
    print('mutation distribution')
    print(mutations[class_col].value_counts())

    # Single-class filters, applied in this order when their switch is on.
    single_class_filters = (
        (filter_silent_muts, 'Silent'),
        (filter_missense_muts, 'Missense_Mutation'),
        (filter_introns_muts, 'Intron'),
    )
    for enabled, variant_class in single_class_filters:
        if enabled:
            mutations = mutations[mutations[class_col] != variant_class].copy()

    # "Important only" is defined by exclusion: these classes are dropped and
    # every other classification is kept.
    excluded_classes = ['Silent', 'Intron', "3'UTR", "5'UTR", 'RNA', 'lincRNA']
    if keep_important_only:
        mutations = mutations[~mutations[class_col].isin(excluded_classes)].copy()

    if truncating_only:
        truncating_classes = ['Nonsense_Mutation', 'Frame_Shift_Del', 'Frame_Shift_Ins']
        mutations = mutations[mutations[class_col].isin(truncating_classes)].copy()

    # Count mutations per (sample, gene); absent pairs come back as NaN -> 0.
    counts = mutations.pivot_table(index=sample_col, columns=gene_col, values=class_col,
                                   aggfunc='count')
    counts = counts.fillna(0)

    total_mutations = counts.sum().sum()
    n_samples = counts.shape[0]
    print('number of mutations', total_mutations, total_mutations / float(n_samples))

    output_path = join(processed_dir, 'somatic_mutations' + ext + '.csv')
    counts.to_csv(output_path)


def prepare_response():
    """Write the binary response label of each patient to ``response.csv``.

    Reads sheet ``Supplementary_Table3.txt`` of the clinical workbook in
    ``data_dir`` (skipping its first two rows), recodes ``Sample.Type`` so that
    'Metastasis' becomes 1 and 'Primary' becomes 0 (any other value is left
    as is), pairs it with ``Patient.ID``, drops duplicated rows and saves the
    result, without the index, to ``processed_dir``. Nothing is returned.
    """
    print('preparing response ...')

    workbook_path = join(data_dir, '41588_2018_78_MOESM5_ESM.xlsx')
    clinical = pd.read_excel(workbook_path, sheet_name='Supplementary_Table3.txt', skiprows=2)

    # Recode the sample type in the same order as before: metastasis, then primary.
    labels = clinical['Sample.Type'].replace('Metastasis', 1).replace('Primary', 0)

    response = pd.DataFrame({'id': clinical['Patient.ID'], 'response': labels})
    response.drop_duplicates().to_csv(join(processed_dir, 'response.csv'), index=False)


def prepare_cnv():
    """Transpose the raw copy-number table and save it as ``CNV.csv``.

    The tab-separated source file is read from ``data_dir`` (first row skipped,
    first column used as the index), transposed, and missing values are
    replaced with 0.0 before the table is written to ``processed_dir``.
    Nothing is returned.
    """
    print('preparing copy number variants ...')

    source_path = join(data_dir, '41588_2018_78_MOESM10_ESM.txt')
    raw_cnv = pd.read_csv(source_path, sep='\t', low_memory=False, skiprows=1, index_col=0)

    # Flip rows and columns, then treat missing calls as "no change".
    cnv_table = raw_cnv.transpose().fillna(0.)
    cnv_table.to_csv(join(processed_dir, 'CNV.csv'))


def prepare_cnv_burden():
    """Extract the ``Fraction of genome altered`` column into ``CNV_burden.csv``.

    Reads the clinical workbook from ``data_dir`` (skipping its first two rows
    and using the second column as the index) and writes the single burden
    column, together with that index, to ``processed_dir``. Nothing is returned.
    """
    print('preparing copy number burden ...')

    workbook_path = join(data_dir, '41588_2018_78_MOESM5_ESM.xlsx')
    # No sheet_name is given, so pandas reads the workbook's first sheet.
    clinical = pd.read_excel(workbook_path, skiprows=2, index_col=1)

    burden = clinical.loc[:, 'Fraction of genome altered']
    burden.to_frame().to_csv(join(processed_dir, 'CNV_burden.csv'))


# Mutation filter switches read by prepare_design_matrix_crosstable().
# With the values below only `keep_important_only` is active.
filter_silent_muts = False
filter_missense_muts = False
filter_introns_muts = False
keep_important_only = True
truncating_only = False


def build_mutation_suffix(keep_important_only, truncating_only, filter_silent_muts,
                          filter_missense_muts, filter_introns_muts):
    """Return the file-name suffix that encodes the active mutation filters.

    The suffix is inserted between ``somatic_mutations`` and ``.csv``, so every
    component starts with an underscore. Precedence follows the original
    script: ``truncating_only`` overrides ``keep_important_only``, and
    ``filter_silent_muts`` replaces (rather than extends) whatever was chosen
    before it; the missense and intron markers are appended.

    Args:
        keep_important_only: the "important only" filter is on.
        truncating_only: the "truncating only" filter is on.
        filter_silent_muts: silent mutations are removed.
        filter_missense_muts: missense mutations are removed.
        filter_introns_muts: intron mutations are removed.

    Returns:
        The suffix string; empty when no switch is on.
    """
    suffix = ""
    if keep_important_only:
        suffix = '_important_only'
    if truncating_only:
        # Leading underscore added: without it the output file was named
        # 'somatic_mutationstruncating_only.csv'.
        suffix = '_truncating_only'
    if filter_silent_muts:
        suffix = "_no_silent"
    if filter_missense_muts:
        suffix = suffix + "_no_missense"
    if filter_introns_muts:
        suffix = suffix + "_no_introns"
    return suffix


# Suffix used by prepare_design_matrix_crosstable() when naming its output file.
ext = build_mutation_suffix(keep_important_only, truncating_only, filter_silent_muts,
                            filter_missense_muts, filter_introns_muts)

# Run every preparation step in its original order, then report completion.
preparation_steps = (
    prepare_design_matrix_crosstable,
    prepare_cnv,
    prepare_response,
    prepare_cnv_burden,
)
for run_step in preparation_steps:
    run_step()
print('Done')

preparing mutations ...
mutation distribution
Missense_Mutation           51002
Silent                      21346
Intron                      11385
Nonsense_Mutation            2830
Frame_Shift_Del              2755
Splice_Site                  2695
3'UTR                        1200
Frame_Shift_Ins               915
In_Frame_Del                  585
5'UTR                         445
In_Frame_Ins                   79
RNA                            42
Nonstop_Mutation               40
Stop_Codon_Del                 13
Start_Codon_Del                11
De_novo_Start_OutOfFrame        5
De_novo_Start_InFrame           2
Start_Codon_Ins                 2
Stop_Codon_Ins                  1
lincRNA                         1
Name: Variant_Classification, dtype: int64
number of mutations 60935.0 60.27200791295747
preparing copy number variants ...
preparing response ...
preparing copy number burden ...
Done


  for idx, row in parser.parse():
  for idx, row in parser.parse():


## Check the processed data

In [5]:
# Display the data root that the processed files below are read from.
DATA_PATH

'/home/sunzehui/GeneNet/_database'

`CNV.csv` holds the gene-level copy number variation (CNV) of each tumor sample barcode: one row per sample and one column per gene.

In [6]:
# Read the processed CNV table back in. No index column is specified, so the
# sample barcodes saved as the CSV index load as an ordinary column named
# 'Unnamed: 0'.
cnv_csv = join(DATA_PATH, "processed/CNV.csv")
cnv = pd.read_csv(cnv_csv)
cnv

Unnamed: 0.1,Unnamed: 0,PIK3CD,MTOR,JUN,NRAS,NOTCH2,RIT1,NTRK1,DDR2,MDM4,...,MCAM,RNF26,C1QTNF5,MFRP,USP2,LOC100499227,THY1,PVRL1,TRIM29,OAF
0,00-029N9_LN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,01-087MM_BONE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,01-095N1_LN,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,01-120A1_LIVER,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
4,02-083E1_LN,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1008,TP_2069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1009,TP_2077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0
1010,TP_2078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1011,TP_2079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


`CNV_burden.csv` holds the copy number burden of each tumor sample (the `Fraction of genome altered` column); the burden is itself calculated from the sample's CNVs.

In [7]:
# Read the processed CNV burden table back in and preview its first rows.
cnv_burden_csv = join(DATA_PATH, 'processed/CNV_burden.csv')
cnv_burden = pd.read_csv(cnv_burden_csv)
cnv_burden.head()

Unnamed: 0,Patient.ID,Fraction of genome altered
0,AAPC-STID0000011640-Tumor-SM-2XU1H,0.010487
1,AAPC-STID0000021561-Tumor-SM-3RVWB,0.135831
2,AAPC-STID0000011949-Tumor-SM-2XU1I,0.190097
3,AAPC-STID0000021610-Tumor-SM-2XU13,0.054238
4,AAPC-STID0000021537-Tumor-SM-3RVW7,0.054551
