In [1]:
import pandas as pd

In [2]:
# Let long tables render in full: show up to 1000 rows before pandas truncates.
pd.options.display.max_rows = 1000

In [3]:
# Load the tab-separated centromere table. The file has no header row,
# so pandas labels the columns with integers starting at 0.
data = pd.read_csv(
    '../datasets/centromeres.txt',
    delimiter='\t',  # `delimiter` is an alias of `sep`
    header=None,
)

data

Unnamed: 0,0,1,2,3,4
0,189,chr1,122026459,122224535,GJ211836.1
1,189,chr1,122224635,122503147,GJ211837.1
2,23,chr1,122503247,124785432,GJ212202.1
3,1537,chr1,124785532,124849129,GJ211855.1
4,192,chr1,124849229,124932724,GJ211857.1
5,13,chr10,39686682,39935900,GJ211930.1
6,13,chr10,39936000,41497440,GJ211932.1
7,901,chr10,41497540,41545720,GJ211933.1
8,112,chr10,41545820,41593521,GJ211936.1
9,974,chr11,51078348,51090317,GJ211938.1


In [4]:
# Keep only the chromosome label and the two coordinate columns.
# The columns carry the default integer labels (the file was read with
# header=None), so the inclusive label slice 1:3 selects columns 1, 2 and 3.
data_without_cols = data.loc[:, 1:3]

data_without_cols

Unnamed: 0,1,2,3
0,chr1,122026459,122224535
1,chr1,122224635,122503147
2,chr1,122503247,124785432
3,chr1,124785532,124849129
4,chr1,124849229,124932724
5,chr10,39686682,39935900
6,chr10,39936000,41497440
7,chr10,41497540,41545720
8,chr10,41545820,41593521
9,chr11,51078348,51090317


In [5]:
# Work on a copy so the frame displayed by the previous cell stays unchanged.
data_without_chr = data_without_cols.copy()

# Remove the 'chr' text from every label in the first column
# (e.g. 'chr10' -> '10').
data_without_chr.iloc[:, 0] = [
    label.replace('chr', '') for label in data_without_chr.iloc[:, 0]
]

data_without_chr

Unnamed: 0,1,2,3
0,1,122026459,122224535
1,1,122224635,122503147
2,1,122503247,124785432
3,1,124785532,124849129
4,1,124849229,124932724
5,10,39686682,39935900
6,10,39936000,41497440
7,10,41497540,41545720
8,10,41545820,41593521
9,11,51078348,51090317


In [6]:
# Replace the integer column labels with descriptive names.
# rename() returns a new frame, so `data_without_chr` is left untouched.
data_with_col_names = data_without_chr.rename(
    columns={1: 'Chromosome', 2: 'Start', 3: 'End'}
)

data_with_col_names

Unnamed: 0,Chromosome,Start,End
0,1,122026459,122224535
1,1,122224635,122503147
2,1,122503247,124785432
3,1,124785532,124849129
4,1,124849229,124932724
5,10,39686682,39935900
6,10,39936000,41497440
7,10,41497540,41545720
8,10,41545820,41593521
9,11,51078348,51090317


In [7]:
# Chromosome labels as strings: '1' .. '22' followed by 'X' and 'Y'.
chromosome_names = [*map(str, range(1, 23)), 'X', 'Y']

chromosome_names

['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 'X',
 'Y']

In [8]:
# Collapse the centromere segments of each chromosome into a single interval.
#
# For every chromosome in `chromosome_names` that has more than one row, the
# rows are replaced by one row spanning min(Start) .. max(End). Rows that are
# not merged keep their original order and come first; the merged rows follow
# in `chromosome_names` order. The index is renumbered from 0 at the end.
#
# The merged rows are collected in a list and concatenated once:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# appending inside the loop copies the whole frame on every iteration.
centromeres_data = data_with_col_names.copy()

merged_records = []
for name in chromosome_names:
    chr_data = centromeres_data[centromeres_data['Chromosome'] == name]
    if len(chr_data.index) > 1:
        merged_records.append({
            'Chromosome': name,
            'Start': chr_data['Start'].min(),
            'End': chr_data['End'].max(),
        })

if merged_records:
    merged_names = [record['Chromosome'] for record in merged_records]
    # Drop every row that was folded into a merged record, then add the
    # merged records after the remaining rows.
    untouched = centromeres_data[~centromeres_data['Chromosome'].isin(merged_names)]
    centromeres_data = pd.concat(
        [untouched, pd.DataFrame(merged_records)],
        ignore_index=True,
    )
else:
    # Nothing to merge: only renumber the index.
    centromeres_data = centromeres_data.reset_index(drop=True)

centromeres_data

Unnamed: 0,Chromosome,Start,End
0,2,92188145,94090557
1,4,49712061,51743951
2,6,58553888,59829934
3,8,44033744,45877265
4,9,43389635,45518558
5,X,58605579,62412542
6,Y,10316944,10544039
7,1,122026459,124932724
8,3,90772458,93655574
9,5,46485900,50059807


In [9]:
# Import my helper modules for reading dataframes from the ../modules folder.

import sys

sys.path.insert(0, '../modules/')
    
from tumor_data_processor_with_sex_chr import * 

In [10]:
# Back to compact output: truncate displayed tables after 20 rows.
pd.options.display.max_rows = 20

In [11]:
# Load the tab-separated tumor events table; text following a '#' is
# treated as a comment and ignored by the parser.
cn_data = pd.read_csv(
    '../datasets/P6.Inform/I062.022.WGS.Tumor_events.txt',
    delimiter='\t',  # `delimiter` is an alias of `sep`
    comment='#',
)

cn_data

Unnamed: 0,Chromosome Region,Event,Length,Cytoband,% of CNV Overlap,Probe Median,% Heterozygous,Probes,Count of Gene Symbols
0,"chr1:862,439-1,313,714",CN Gain,451276,p36.33,100.000000,0.254980,,15,35
1,"chr1:2,323,340-2,500,322",CN Gain,176983,p36.32,98.491937,0.372007,,6,8
2,"chr1:9,840,751-10,297,200",CN Loss,456450,p36.22,4.657695,-0.247301,,14,8
3,"chr1:28,313,363-29,454,028",CN Loss,1140666,p35.3,5.721575,-0.252082,,34,25
4,"chr1:32,265,271-32,656,728",CN Loss,391458,p35.2 - p35.1,6.300564,-0.241056,,12,7
...,...,...,...,...,...,...,...,...,...
374,"chrX:153,411,122-153,945,270",High Copy Gain,534149,q28,100.000000,0.746978,,16,37
375,"chrX:153,945,270-154,722,592",CN Gain,777323,q28,100.000000,0.346073,,24,47
376,"chrX:154,722,592-155,270,560",CN Loss,547969,q28,89.647206,-0.355816,,15,8
377,"chrY:3,095,792-7,022,409",CN Loss,3926618,p11.2,10.072309,-0.745243,,12,19


In [12]:
# Convert the raw events table into the per-segment frame displayed below
# (columns: Chromosome, Copy Number, Length, Start, End).
# NOTE(review): process_tumor_data_with_sex is presumably provided by the
# star-import of tumor_data_processor_with_sex_chr; its exact conversion
# rules are not visible in this notebook — confirm against that module.
test_data = process_tumor_data_with_sex(cn_data)

test_data

Unnamed: 0,Chromosome,Copy Number,Length,Start,End
0,1,3,451275,862439,1313714
1,1,3,176982,2323340,2500322
2,1,1,456449,9840751,10297200
3,1,1,1140665,28313363,29454028
4,1,1,391457,32265271,32656728
...,...,...,...,...,...
365,X,4,534148,153411122,153945270
366,X,3,777322,153945270,154722592
367,X,1,547968,154722592,155270560
368,Y,1,3926617,3095792,7022409


In [13]:
# For each chromosome, print the copy-number segments that overlap its
# centromere interval, followed by the centromere row itself.
for name in chromosome_names:
    chr_data = test_data[test_data['Chromosome'] == name]
    centromere_rows = centromeres_data.loc[centromeres_data['Chromosome'] == name]
    if centromere_rows.empty:
        # No centromere interval is available for this chromosome;
        # skip it instead of failing on .iloc[0].
        continue
    chr_centromere = centromere_rows.iloc[0]
    start = chr_centromere['Start']
    end = chr_centromere['End']
    # Two intervals overlap exactly when each one begins before the other
    # ends (assuming Start < End for every segment). Unlike testing whether
    # a segment's Start or End lies inside the centromere, this also catches
    # a segment that spans the whole centromere.
    overlaps_centromere = (chr_data['Start'] < end) & (chr_data['End'] > start)
    centromere_segments = chr_data[overlaps_centromere]

    if not centromere_segments.empty:
        print(centromere_segments, '\n')
        print(chr_centromere)
        print('\n\n')


   Chromosome  Copy Number   Length     Start       End
43          3            3  3757201  87242799  91000000
44          3            3  2734987  93610543  96345530 

Chromosome           3
Start         90772458
End           93655574
Name: 8, dtype: object



    Chromosome  Copy Number  Length     Start       End
192          9            1  849883  43145357  43995240 

Chromosome           9
Start         43389635
End           45518558
Name: 4, dtype: object



    Chromosome  Copy Number  Length     Start       End
219         11            1  881826  50316891  51198717 

Chromosome          11
Start         51078348
End           54425074
Name: 12, dtype: object



    Chromosome  Copy Number    Length     Start       End
308         18            3  19807933  19790758  39598691 

Chromosome          18
Start         15460899
End           20861206
Name: 19, dtype: object



    Chromosome  Copy Number  Length     Start       End
321         19            1  247221  24282188 