In [1]:
import pandas as pd

In [2]:
class SegmentsDataProcessor:
    """Load a tab-separated segment-events table and expose CNV / AI segments.

    The raw table must contain the columns 'Chromosome Region', 'Event' and
    'Length' (in that order, possibly followed by others).  'Chromosome Region'
    holds strings such as ``chr1:1,000-2,000``.  After processing, ``self.data``
    has the columns 'Chromosome', 'Event', 'Length', 'Start', 'End', with rows
    on chromosome 'Y' removed.

    Every helper returns a new DataFrame and leaves its argument untouched.
    """

    # Event labels that are not copy-number calls and are dropped by
    # `remove_not_cnv_rows`.
    NON_CNV_EVENTS = ('LOH', 'Allelic Imbalance', 'Homozygous Copy Loss')

    # Event label -> integer copy number used by `transform_cn_values`.
    COPY_NUMBER_BY_EVENT = {
        'Big Loss': 0,
        'CN Loss': 1,
        'CN Gain': 3,
        'High Copy Gain': 4,
    }

    # "<chromosome>:<start>-<end>": exactly one ':' and exactly one '-' after it.
    REGION_PATTERN = r'^(?P<chromosome>[^:]+):(?P<start>[^:-]+)-(?P<end>[^:-]+)$'

    def __init__(self, filename):
        """Read ``'../' + filename`` (tab-separated, '#' comment lines skipped)
        and store the processed table in ``self.data``."""
        self.root_dir = '../'
        raw = pd.read_csv(self.root_dir + filename, sep='\t', comment='#')
        self.data = self.process_data(raw)

    def process_data(self, data):
        """Run the full cleaning pipeline on a raw events table.

        Returns a new DataFrame; ``data`` itself is not modified.
        """
        data = self.reshape_data(data)
        data = self.process_values(data)
        data = self.rename_chromosome_column(data)
        data = self.cast_column_types(data)
        data = self.remove_Y_rows(data)

        return data

    def get_cnv_segments(self, sex='female'):
        """Return the copy-number segments as a fresh, re-indexed DataFrame.

        The 'Event' column is renamed to 'Copy Number' and converted to int64.
        When ``sex == 'male'`` the copy number of chromosome 'X' rows is
        reduced by one; any other value of ``sex`` leaves the numbers as is.
        """
        # Work on a copy so `self.data` survives even if a subclass overrides
        # one of the helpers with a mutating implementation.
        cnv_data = self.remove_not_cnv_rows(self.data.copy())
        cnv_data = self.rename_event_column(cnv_data)
        cnv_data = self.transform_cn_values(cnv_data)
        cnv_data = self.cast_cn(cnv_data)
        if sex == 'male':
            cnv_data = self.correct_cn(cnv_data)

        return cnv_data.reset_index(drop=True)

    def get_ai_segments(self):
        """Return the 'Allelic Imbalance' rows, without the 'Event' column,
        as a fresh, re-indexed DataFrame."""
        ai_data = self.remove_not_ai_rows(self.data.copy())
        ai_data = self.drop_event_column(ai_data)

        return ai_data.reset_index(drop=True)

    def reshape_data(self, data):
        """Keep the columns 'Chromosome Region' through 'Length' and append
        empty-string 'Start' and 'End' columns."""
        # .copy() detaches the slice from the caller's frame, so the column
        # assignments below neither warn nor leak back into `data`.
        data = data.loc[:, 'Chromosome Region':'Length'].copy()
        data['Start'] = ''
        data['End'] = ''

        return data

    def process_values(self, data):
        """Split 'Chromosome Region' into chromosome / start / end.

        * 'Chromosome Region' keeps the chromosome name without a leading
          'chr' prefix (only the prefix is removed, nothing else).
        * 'Start' and 'End' receive the coordinates as strings with thousands
          separators (',') removed.
        * 'Length' is decreased by one.
          NOTE(review): in the notebook's sample outputs this makes Length equal
          End - Start; the reason for the input being one larger is not visible
          here -- confirm.

        Raises:
            ValueError: if any region string is not '<chrom>:<start>-<end>'.
        """
        data = data.copy()

        region = data['Chromosome Region']
        # Vectorised parse; unlike a row-wise apply it also copes with an
        # empty frame.
        parts = region.str.extract(self.REGION_PATTERN)

        malformed = parts.isna().any(axis=1)
        if malformed.any():
            raise ValueError(
                'Malformed chromosome region value(s): '
                + repr(region[malformed].tolist())
            )

        data['Chromosome Region'] = parts['chromosome'].str.replace(r'^chr', '', regex=True)
        data['Start'] = parts['start'].str.replace(',', '', regex=False)
        data['End'] = parts['end'].str.replace(',', '', regex=False)
        data['Length'] = data['Length'] - 1

        return data

    def process_chromosome_reg_col(self, text):
        """Split one '<chrom>:<start>-<end>' string into a 3-element Series.

        `process_values` parses the whole column at once and no longer calls
        this; the method is kept so existing callers continue to work.
        """
        chromosome, rest = text.split(':')
        start, end = rest.split('-')

        return pd.Series([chromosome, start, end])

    def rename_chromosome_column(self, data):
        """Return a copy with 'Chromosome Region' renamed to 'Chromosome'."""
        return data.rename(columns={'Chromosome Region': 'Chromosome'})

    def remove_Y_rows(self, data):
        """Return a copy without the rows whose 'Chromosome' is 'Y'."""
        return data.loc[data['Chromosome'] != 'Y'].copy()

    def cast_column_types(self, data):
        """Return a copy with 'Start' and 'End' converted to int64."""
        data = data.copy()
        # Replace the columns outright: `data.loc[:, col] = ...` writes into
        # the existing object-dtype column on recent pandas and so would not
        # change the dtype.
        for column in ('Start', 'End'):
            data[column] = data[column].astype(str).astype('int64')

        return data

    def transform_cn_values(self, data):
        """Return a copy whose 'Copy Number' labels are mapped to integers
        via `COPY_NUMBER_BY_EVENT`; unknown labels are passed through as is."""
        data = data.copy()
        lookup = self.COPY_NUMBER_BY_EVENT
        data['Copy Number'] = data['Copy Number'].map(lambda label: lookup.get(label, label))

        return data

    def remove_not_cnv_rows(self, data):
        """Return a copy without the rows whose 'Event' is one of
        `NON_CNV_EVENTS`."""
        return data.loc[~data['Event'].isin(self.NON_CNV_EVENTS)].copy()

    def rename_event_column(self, data):
        """Return a copy with 'Event' renamed to 'Copy Number'."""
        return data.rename(columns={'Event': 'Copy Number'})

    def cast_cn(self, data):
        """Return a copy with 'Copy Number' converted to int64.

        Raises ValueError if a value is not an integer literal (for example an
        event label that `transform_cn_values` did not map).
        """
        data = data.copy()
        # Whole-column replacement so the dtype really becomes int64.
        data['Copy Number'] = data['Copy Number'].astype(str).astype('int64')

        return data

    def correct_cn(self, data):
        """Return a copy with the copy number of chromosome 'X' rows reduced
        by one.

        NOTE(review): a copy number of 0 on X becomes -1 here -- confirm that
        this is intended.
        """
        data = data.copy()
        on_x = data['Chromosome'] == 'X'
        data.loc[on_x, 'Copy Number'] = data.loc[on_x, 'Copy Number'] - 1

        return data

    def remove_not_ai_rows(self, data):
        """Return a copy holding only the rows whose 'Event' is
        'Allelic Imbalance'."""
        return data.loc[data['Event'] == 'Allelic Imbalance'].copy()

    def drop_event_column(self, data):
        """Return a copy without the 'Event' column."""
        return data.drop(columns='Event')

In [3]:
# Let up to 1000 rows be rendered before pandas truncates a displayed frame.
pd.options.display.max_rows = 1000

In [4]:
# Load and clean the events table of the first sample; the path is resolved
# relative to the processor's root directory ('../').
sdp = SegmentsDataProcessor(
    filename='datasets/P6.Inform/I062.015.WGS.Tumor_events.txt',
)

sdp.data

Unnamed: 0,Chromosome,Event,Length,Start,End
0,1,CN Gain,648442,0,648442
1,1,Allelic Imbalance,16032714,142535839,158568553
2,1,CN Gain,4657565,158568553,163226118
3,1,CN Gain,29721681,219528940,249250621
4,2,CN Gain,21620422,10768725,32389147
5,2,High Copy Gain,79537,33069397,33148934
6,2,CN Gain,1388890,33148934,34537824
7,2,CN Gain,2329165,37447690,39776855
8,2,CN Gain,1963185,41952444,43915629
9,2,CN Gain,495884,45657687,46153571


In [5]:
# Copy-number segments of the first sample; 'female' is the default, i.e.
# no adjustment of chromosome X copy numbers.
cnv_data = sdp.get_cnv_segments(sex='female')

cnv_data

Unnamed: 0,Chromosome,Copy Number,Length,Start,End
0,1,3,648442,0,648442
1,1,3,4657565,158568553,163226118
2,1,3,29721681,219528940,249250621
3,2,3,21620422,10768725,32389147
4,2,4,79537,33069397,33148934
5,2,3,1388890,33148934,34537824
6,2,3,2329165,37447690,39776855
7,2,3,1963185,41952444,43915629
8,2,3,495884,45657687,46153571
9,2,3,413431,47006935,47420366


In [6]:
# Rows of `sdp.data` whose Event is 'Allelic Imbalance', with the Event
# column dropped and the index reset.
ai_data = sdp.get_ai_segments()

ai_data

Unnamed: 0,Chromosome,Length,Start,End
0,1,16032714,142535839,158568553
1,6,996425,57782542,58778967
2,9,502978,68070657,68573635
3,16,26342791,46386883,72729674
4,17,21747483,25265098,47012581
5,21,1454448,9658127,11112575


In [13]:
class SegmentsDataProcessor2(SegmentsDataProcessor):
    """Variant of `SegmentsDataProcessor` for header-less, multi-sample files.

    The file is read with ``header=None``, so its columns are labelled with the
    integers 0, 1, 2, ...  Column 0 holds the sample name; columns 1-3 are
    taken as chromosome region, event and length.  Only the rows of
    ``sample_name`` are kept before the parent's pipeline is applied.
    """

    def __init__(self, filename, sample_name):
        """Read ``'../' + filename`` (tab-separated, no header row) and store
        the processed rows of ``sample_name`` in ``self.data``."""
        self.root_dir = '../'
        # Must be set before process_data(), which filters on it.
        self.sample_name = sample_name
        raw = pd.read_csv(self.root_dir + filename, sep='\t', header=None)
        self.data = self.process_data(raw)

    def process_data(self, data):
        """Select the sample, give the table the header the parent class
        expects, then run the parent's cleaning pipeline."""
        data = self.get_sample_data(data)
        data = self.reshape_data_2(data)
        data = self.add_header(data)

        return super().process_data(data)

    def get_sample_data(self, data):
        """Return the rows whose first column (label 0) equals
        ``self.sample_name``, e.g. 'P6.Rec7'."""
        return data.loc[data[0] == self.sample_name]

    def reshape_data_2(self, data):
        """Keep columns 1 through 3 and append two empty-string columns.

        The two placeholders exist so that `add_header` can assign its five
        column names.
        """
        # The labels are integers because the file was read with header=None
        # (the same reason `get_sample_data` uses data[0]); slicing with the
        # strings '1':'3' does not match them.  .copy() detaches the slice so
        # the assignments below do not touch the caller's frame.
        data = data.loc[:, 1:3].copy()
        data['4'] = ''
        data['5'] = ''

        return data

    def add_header(self, data):
        """Return a copy of the five-column frame with the column names the
        parent pipeline expects."""
        data = data.copy()
        data.columns = ['Chromosome Region', 'Event', 'Length', 'Start', 'End']

        return data

In [14]:
# Load the rows of sample 'P6.Rec7' from the multi-sample events file.
# The directory is spelled 'P6.Inform', exactly as in the cell that loads the
# first sample: a differently-cased 'p6.Inform' only resolves on
# case-insensitive file systems.
sdp2 = SegmentsDataProcessor2('datasets/P6.Inform/I062.033.WGS.Tumor_events.txt', 'P6.Rec7')

sdp2.data

Unnamed: 0,Chromosome,Event,Length,Start,End
0,1,CN Loss,157435,104172098,104329533
1,1,CN Loss,2534626,145295296,147829922
2,1,CN Loss,2068346,149849729,151918075
3,1,CN Gain,1352303,216006715,217359018
4,1,CN Gain,565566,222719682,223285248
5,1,CN Gain,5443247,224232394,229675641
6,1,CN Gain,3867195,235821067,239688262
7,1,CN Gain,4300807,244949814,249250621
8,2,CN Gain,22335482,10805837,33141319
9,2,Homozygous Copy Loss,7615,33141319,33148934


In [17]:
# Copy-number segments of sample 2; 'female' is the default, i.e. no
# adjustment of chromosome X copy numbers.
cnv_data2 = sdp2.get_cnv_segments(sex='female')

cnv_data2

Unnamed: 0,Chromosome,Copy Number,Length,Start,End
0,1,1,157435,104172098,104329533
1,1,1,2534626,145295296,147829922
2,1,1,2068346,149849729,151918075
3,1,3,1352303,216006715,217359018
4,1,3,565566,222719682,223285248
5,1,3,5443247,224232394,229675641
6,1,3,3867195,235821067,239688262
7,1,3,4300807,244949814,249250621
8,2,3,22335482,10805837,33141319
9,2,3,593337,33148934,33742271


In [None]:
# Rows of `sdp2.data` whose Event is 'Allelic Imbalance', with the Event
# column dropped and the index reset.
ai_data2 = sdp2.get_ai_segments()

ai_data2