In [1]:
import pandas as pd

In [2]:
# Import the project's data-loading helpers from the hrdtools package in the parent directory

import sys

sys.path.insert(0, '../')

from hrdtools.segments_data_processor import SegmentsDataProcessor, SegmentsDataProcessor2
from hrdtools.gap_data_processor import GapDataProcessor
from hrdtools.lengths_data_processor import process_lengths_data

In [3]:
# Load one sample's events file and extract its segments with get_ai_segments()
# (presumably allelic-imbalance segments, judging by the method name — confirm in hrdtools).
sdp = SegmentsDataProcessor('../data/P6.Inform/I062.007.WGS.Tumor_events.txt')

test_data = sdp.get_ai_segments()

# Bare last expression so the notebook renders the table.
test_data

Unnamed: 0,Chromosome,Length,Start,End
0,1,85008845,0,85008845
1,1,3344858,142535839,145880697
2,2,82886,91687599,91770485
3,4,45211705,0,45211705
4,4,4562276,45837724,50400000
5,4,4973876,186180400,191154276
6,5,442267,49405756,49848023
7,6,2408939,56370028,58778967
8,8,43538879,46930910,90469789
9,10,9541284,29449698,38990982


In [4]:
# Read the gap table and pull out its telomere rows; tai() looks these up per chromosome.
gdp = GapDataProcessor('../data/gap.txt')

telomeres = gdp.get_telomeres()

telomeres

Unnamed: 0,Chromosome,Start,End
0,1,0,10000
1,1,249240621,249250621
2,2,0,10000
3,2,243189373,243199373
4,3,0,10000
5,3,198012430,198022430
6,4,0,10000
7,4,191144276,191154276
8,5,0,10000
9,5,180905260,180915260


In [5]:
# Centromere rows from the same gap table (reuses `gdp` from the previous cell); tai2() reads them.
centromeres = gdp.get_centromeres()

centromeres

Unnamed: 0,Chromosome,Start,End
0,1,121535434,124535434
1,2,92326171,95326171
2,3,90504854,93504854
3,4,49660117,52660117
4,5,46405641,49405641
5,6,58830166,61830166
6,7,58054331,61054331
7,8,43838887,46838887
8,9,47367679,50367679
9,X,58632012,61632012


In [6]:
# Chromosome labels as strings: '1' through '22', followed by 'X'.
chromosome_names = [str(number) for number in range(1, 23)] + ['X']

### TAI with 10 kb telomeres

In [7]:
def tai(data, telomere_table=None, chromosomes=None):
    """Count segments whose 'Start' or 'End' falls inside a telomere interval.

    For every chromosome, each telomere interval is compared with the
    chromosome's segments. A segment scores one hit per telomere interval that
    contains its 'Start' or its 'End' (interval bounds are inclusive), so a
    segment touching both telomere intervals of a chromosome contributes 2.

    Parameters
    ----------
    data : pandas.DataFrame
        Segments with 'Chromosome', 'Start' and 'End' columns.
    telomere_table : pandas.DataFrame, optional
        Telomere intervals with 'Chromosome', 'Start' and 'End' columns.
        Defaults to the notebook-level `telomeres` table.
    chromosomes : iterable, optional
        Chromosome labels to scan. Defaults to the notebook-level
        `chromosome_names` list.

    Returns
    -------
    int
        Total number of segment/telomere hits over all scanned chromosomes.
    """
    # Fall back to the notebook globals so existing `tai(data)` calls keep working.
    if telomere_table is None:
        telomere_table = telomeres
    if chromosomes is None:
        chromosomes = chromosome_names

    ntai = 0
    for _chr in chromosomes:
        chr_data = data.loc[data['Chromosome'] == _chr]
        if chr_data.empty:
            # No segments on this chromosome, so it cannot contribute.
            continue

        chr_telomeres = telomere_table.loc[telomere_table['Chromosome'] == _chr]

        # Walk the interval columns directly instead of materialising a Series per row.
        for telomere_start, telomere_end in zip(chr_telomeres['Start'], chr_telomeres['End']):
            # `between` is inclusive on both bounds, matching `>= start` and `<= end`.
            start_inside = chr_data['Start'].between(telomere_start, telomere_end)
            end_inside = chr_data['End'].between(telomere_start, telomere_end)
            ntai += int((start_inside | end_inside).sum())

    return ntai

In [8]:
# Sanity check: run tai() on the single sample loaded into `test_data`.
ntai = tai(test_data)

ntai

5

In [9]:
# Directory that holds the per-sample events files.
dirpath = '../data/P6.Inform'

# One record per events file; the scoring loop later adds an 'NtAI' entry to each record.
samples = [
    {'Filename': filename}
    for filename in (
        'I062.007.WGS.Pre_events.txt',
        'I062.007.WGS.Tumor_events.txt',
        'I062.015.WGS.Tumor_events.txt',
        'I062.022.WGS.Tumor_events.txt',
        'I062.033.WGS.Tumor_events.txt',
    )
]

In [10]:
# Score every sample with tai() and store the count on its record under 'NtAI'.
for i, sample in enumerate(samples):
    relative_path = dirpath + '/' + sample['Filename']

    # The last file in `samples` is read with SegmentsDataProcessor2;
    # every other file is read with SegmentsDataProcessor.
    if i == len(samples) - 1:
        sdp = SegmentsDataProcessor2(relative_path, 'P6.Rec7')
    else:
        sdp = SegmentsDataProcessor(relative_path)

    data = sdp.get_ai_segments()
    sample['NtAI'] = tai(data)

In [11]:
# One row per sample record: the file name plus the 'NtAI' count stored by the scoring loop.
results_data = pd.DataFrame(samples)

results_data

Unnamed: 0,Filename,NtAI
0,I062.007.WGS.Pre_events.txt,0
1,I062.007.WGS.Tumor_events.txt,5
2,I062.015.WGS.Tumor_events.txt,0
3,I062.022.WGS.Tumor_events.txt,0
4,I062.033.WGS.Tumor_events.txt,1


### TAI with 2 Mb telomeres

Parameters

In [12]:
# Number of bases in one megabase.
Mb = 10 ** 6
# Width of the chromosome-end window that tai2() compares segment coordinates against.
TELOMERE_SIZE = Mb * 2

In [13]:
# Raw .fai table: tab-separated with no header row, so pandas assigns integer column labels.
lengths = pd.read_csv('../data/hs37d5.fa.fai', sep='\t', header=None)

In [14]:
# Reshape the raw table with the project helper; the displayed result is indexed by
# chromosome with a single 'Length' column, which is what tai2() looks up.
# NOTE(review): this rebinds `lengths`, so re-running this cell without first re-running
# the read_csv cell passes already-processed data to the helper — confirm that is safe.
lengths = process_lengths_data(lengths)

# data is in good format
lengths

Unnamed: 0_level_0,Length
Chromosome,Unnamed: 1_level_1
1,249250621
2,243199373
3,198022430
4,191154276
5,180915260
6,171115067
7,159138663
8,146364022
9,141213431
10,135534747


In [15]:
def tai2(data, telomere_size=None, chromosomes=None, chromosome_lengths=None, centromere_table=None):
    """Count segments that reach a chromosome-end window without crossing the centromere.

    A segment on a chromosome is counted when either

    * its 'Start' is below `telomere_size` and its 'End' is at or before the
      centromere start, or
    * its 'End' is above `chromosome length - telomere_size` and its 'Start'
      is at or after the centromere end.

    Parameters
    ----------
    data : pandas.DataFrame
        Segments with 'Chromosome', 'Start' and 'End' columns.
    telomere_size : int, optional
        Width of the window at each chromosome end. Defaults to the
        notebook-level `TELOMERE_SIZE`.
    chromosomes : iterable, optional
        Chromosome labels to scan. Defaults to the notebook-level
        `chromosome_names` list.
    chromosome_lengths : pandas.DataFrame, optional
        Table indexed by chromosome label with a 'Length' column. Defaults to
        the notebook-level `lengths` table.
    centromere_table : pandas.DataFrame, optional
        Centromere intervals with 'Chromosome', 'Start' and 'End' columns; the
        first matching row per chromosome is used. Defaults to the
        notebook-level `centromeres` table.

    Returns
    -------
    int
        Number of counted segments over all scanned chromosomes.

    Raises
    ------
    KeyError
        If a chromosome that has segments is missing from `chromosome_lengths`.
    IndexError
        If a chromosome that has segments has no row in `centromere_table`.
    """
    # Fall back to the notebook globals so existing `tai2(data)` calls keep working.
    if telomere_size is None:
        telomere_size = TELOMERE_SIZE
    if chromosomes is None:
        chromosomes = chromosome_names
    if chromosome_lengths is None:
        chromosome_lengths = lengths
    if centromere_table is None:
        centromere_table = centromeres

    ntai = 0
    for _chr in chromosomes:
        chr_data = data.loc[data['Chromosome'] == _chr]
        if chr_data.empty:
            # Nothing to count here; skipping before the lookups also avoids failing on
            # chromosomes that are absent from the reference tables but have no segments.
            continue

        chr_len = chromosome_lengths.loc[_chr, 'Length']

        # Filter the centromere table once and read both bounds from the same rows.
        centromere_rows = centromere_table.loc[centromere_table['Chromosome'] == _chr]
        centromere_start = centromere_rows['Start'].iloc[0]
        centromere_end = centromere_rows['End'].iloc[0]

        near_start = (chr_data['Start'] < telomere_size) & (chr_data['End'] <= centromere_start)
        near_end = (chr_data['End'] > chr_len - telomere_size) & (chr_data['Start'] >= centromere_end)

        ntai += int((near_start | near_end).sum())

    return ntai

In [16]:
# Sanity check: run tai2() on the single sample loaded into `test_data`.
ntai2 = tai2(test_data)

ntai2

5

In [17]:
# Directory that holds the per-sample events files.
dirpath = '../data/P6.Inform'

# Fresh records for the tai2() run, one per events file; the scoring loop adds 'NtAI' to each.
samples2 = [
    {'Filename': filename}
    for filename in (
        'I062.007.WGS.Pre_events.txt',
        'I062.007.WGS.Tumor_events.txt',
        'I062.015.WGS.Tumor_events.txt',
        'I062.022.WGS.Tumor_events.txt',
        'I062.033.WGS.Tumor_events.txt',
    )
]

In [18]:
# Score every sample with tai2() and store the count on its record under 'NtAI'.
for i, sample in enumerate(samples2):
    relative_path = dirpath + '/' + sample['Filename']

    # The last file in `samples2` is read with SegmentsDataProcessor2; every other file
    # is read with SegmentsDataProcessor. The index is compared against the length of
    # `samples2`, the list being iterated, so the choice does not depend on the
    # unrelated `samples` list from the 10 kb run.
    if i == len(samples2) - 1:
        sdp = SegmentsDataProcessor2(relative_path, 'P6.Rec7')
    else:
        sdp = SegmentsDataProcessor(relative_path)

    data = sdp.get_ai_segments()
    sample['NtAI'] = tai2(data)

In [19]:
# One row per sample record: the file name plus the 'NtAI' count stored by the tai2() scoring loop.
results_data2 = pd.DataFrame(samples2)

results_data2

Unnamed: 0,Filename,NtAI
0,I062.007.WGS.Pre_events.txt,0
1,I062.007.WGS.Tumor_events.txt,5
2,I062.015.WGS.Tumor_events.txt,0
3,I062.022.WGS.Tumor_events.txt,1
4,I062.033.WGS.Tumor_events.txt,1
