In [1]:
import pandas as pd

In [14]:
class GapDataProcessor:
    """Load a tab-separated gap table and expose its centromere/telomere rows.

    Attributes:
        root_dir: String prefix concatenated in front of ``filename`` to build
            the path that is read.
        data: Processed table with the columns ``Chromosome``, ``Start``,
            ``End`` and ``type``, restricted to rows whose ``type`` is
            ``'centromere'`` or ``'telomere'``. The row index of the file is
            preserved.
    """

    def __init__(self, filename, root_dir='../'):
        """Read ``root_dir + filename`` as a TSV file and process it.

        Args:
            filename: Path of the gap file, relative to ``root_dir``.
            root_dir: Prefix prepended to ``filename`` by plain string
                concatenation, so it must end with a path separator.
                Defaults to ``'../'``.
        """
        self.root_dir = root_dir
        raw = pd.read_csv(self.root_dir + filename, sep='\t')
        self.data = self.process_data(raw)

    def process_data(self, data):
        """Return the centromere/telomere rows of ``data`` with normalised columns.

        The input frame is left untouched. The result is built as a new frame
        through a method chain, so no value is ever assigned into a filtered
        slice (the pattern that triggers pandas' SettingWithCopyWarning).

        Args:
            data: Frame with at least the columns ``chrom``, ``chromStart``,
                ``chromEnd`` and ``type``.

        Returns:
            A new frame with columns ``Chromosome``, ``Start``, ``End`` and
            ``type``; every literal ``'chr'`` is removed from ``Chromosome``.
            Missing chromosome values are passed through as missing.
        """
        is_wanted = data['type'].isin(['centromere', 'telomere'])
        return (
            data.loc[is_wanted, ['chrom', 'chromStart', 'chromEnd', 'type']]
            .rename(columns={'chrom': 'Chromosome', 'chromStart': 'Start', 'chromEnd': 'End'})
            # regex=False: 'chr' is a literal substring, not a pattern.
            .assign(Chromosome=lambda frame: frame['Chromosome'].str.replace('chr', '', regex=False))
        )

    def _get_gaps_of_type(self, gap_type):
        """Return rows of ``self.data`` whose ``type`` equals ``gap_type``.

        The ``type`` column is dropped and the index is reset to 0..n-1.
        The result is a new frame; ``self.data`` is not modified.
        """
        selected = self.data.loc[self.data['type'] == gap_type]
        return selected.drop(columns='type').reset_index(drop=True)

    def get_centromeres(self):
        """Return the centromere rows (``Chromosome``, ``Start``, ``End``)."""
        return self._get_gaps_of_type('centromere')

    def get_telomeres(self):
        """Return the telomere rows (``Chromosome``, ``Start``, ``End``)."""
        return self._get_gaps_of_type('telomere')

In [15]:
# Let pandas render up to 1000 rows before truncating displayed frames.
pd.options.display.max_rows = 1000

In [16]:
# NOTE: GapDataProcessor prepends its root directory ('../') to this path, so
# the file actually read is '../../datasets/gap.txt' relative to the working
# directory.
gdp = GapDataProcessor('../datasets/gap.txt')

# Processed table: centromere and telomere rows only, with renamed columns.
gdp.data

Unnamed: 0,Chromosome,Start,End,type
1,1,121535434,124535434,centromere
19,1,0,10000,telomere
40,1,249240621,249250621,telomere
42,2,92326171,95326171,centromere
49,2,0,10000,telomere
58,2,243189373,243199373,telomere
59,3,0,10000,telomere
62,3,90504854,93504854,centromere
65,3,198012430,198022430,telomere
66,4,49660117,52660117,centromere


In [17]:
# Centromere rows of gdp.data, without the 'type' column and re-indexed from 0.
centromeres = gdp.get_centromeres()

centromeres

Unnamed: 0,Chromosome,Start,End
0,1,121535434,124535434
1,2,92326171,95326171
2,3,90504854,93504854
3,4,49660117,52660117
4,5,46405641,49405641
5,6,58830166,61830166
6,7,58054331,61054331
7,8,43838887,46838887
8,9,47367679,50367679
9,X,58632012,61632012


In [18]:
# Telomere rows of gdp.data, without the 'type' column and re-indexed from 0.
telomeres = gdp.get_telomeres()

telomeres

Unnamed: 0,Chromosome,Start,End
0,1,0,10000
1,1,249240621,249250621
2,2,0,10000
3,2,243189373,243199373
4,3,0,10000
5,3,198012430,198022430
6,4,0,10000
7,4,191144276,191154276
8,5,0,10000
9,5,180905260,180915260
