In [1]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

In [2]:
# Import the project's DataFrame-reading modules from the local modules/ folder

import sys

sys.path.insert(0, 'modules/')

from segments_data_processor import SegmentsDataProcessor, SegmentsDataProcessor2
from gap_data_processor import GapDataProcessor
from lengths_data_processor import process_lengths_data

In [3]:
# Read the AI segments of a single sample file; `test_data` is reused below
# as a quick check input for tai() and tai2().
sdp = SegmentsDataProcessor('datasets/P6.Inform/I062.015.WGS.Tumor_events.txt')

test_data = sdp.get_ai_segments()

# Display the segment table (columns: Chromosome, Length, Start, End).
test_data

Unnamed: 0,Chromosome,Length,Start,End
0,1,16032714,142535839,158568553
1,6,996425,57782542,58778967
2,9,502978,68070657,68573635
3,16,26342791,46386883,72729674
4,17,21747483,25265098,47012581
5,21,1454448,9658127,11112575


In [4]:
# Read the gap file and pull out its telomere rows.
gdp = GapDataProcessor('datasets/gap.txt')

telomeres = gdp.get_telomeres()

# Display the telomere table (columns: Chromosome, Start, End); tai() filters
# this table per chromosome.
telomeres

Unnamed: 0,Chromosome,Start,End
0,1,0,10000
1,1,249240621,249250621
2,2,0,10000
3,2,243189373,243199373
4,3,0,10000
5,3,198012430,198022430
6,4,0,10000
7,4,191144276,191154276
8,5,0,10000
9,5,180905260,180915260


In [5]:
# Chromosome labels as strings: '1' through '22', followed by 'X'.
chromosome_names = list(map(str, range(1, 23))) + ['X']

### TAI with 10K telomeres

In [6]:
def tai(data, telomere_table=None, chromosomes=None):
    """Count segments that overlap a telomere interval.

    For each chromosome and each telomere row on that chromosome, counts the
    rows of ``data`` whose [Start, End] interval shares at least one position
    with the telomere's [Start, End] interval (both ends inclusive), and
    returns the total over all telomeres. A segment overlapping two telomere
    rows is counted once per telomere row.

    Compared with testing only whether a segment's Start or End falls inside
    the telomere interval, the overlap test also catches a segment that fully
    spans the telomere interval. Assumes Start <= End for every segment.

    Parameters
    ----------
    data : pandas.DataFrame
        Segments with 'Chromosome', 'Start' and 'End' columns.
    telomere_table : pandas.DataFrame, optional
        Telomere intervals with 'Chromosome', 'Start' and 'End' columns.
        Defaults to the notebook-level ``telomeres`` table.
    chromosomes : iterable, optional
        Chromosome labels to scan. Defaults to the notebook-level
        ``chromosome_names`` list.

    Returns
    -------
    int
        Number of (segment, telomere) overlaps.
    """
    # Fall back to the notebook globals only when the caller passes nothing,
    # so existing calls of the form tai(data) keep working.
    if telomere_table is None:
        telomere_table = telomeres
    if chromosomes is None:
        chromosomes = chromosome_names

    ntai = 0
    for _chr in chromosomes:
        chr_telomeres = telomere_table.loc[telomere_table['Chromosome'] == _chr]
        chr_data = data.loc[data['Chromosome'] == _chr]

        for telomere_start, telomere_end in zip(chr_telomeres['Start'], chr_telomeres['End']):
            # Two closed intervals overlap iff each one starts no later than
            # the other one ends.
            overlaps = (chr_data['Start'] <= telomere_end) & (chr_data['End'] >= telomere_start)
            ntai += int(overlaps.sum())

    return ntai

In [7]:
# Quick check of tai() on the single-sample segments loaded earlier.
ntai = tai(test_data)

ntai

0

In [8]:
dirpath = 'datasets/P6.Inform'

# One record per event file inside `dirpath`; the loop that follows adds an
# 'NtAI' entry to each record.
samples = [
    {'Filename': filename}
    for filename in (
        'I062.007.WGS.Pre_events.txt',
        'I062.007.WGS.Tumor_events.txt',
        'I062.015.WGS.Tumor_events.txt',
        'I062.022.WGS.Tumor_events.txt',
        'I062.033.WGS.Tumor_events.txt',
    )
]

In [9]:
# Compute tai() for every sample file and store the result on the sample's
# record under 'NtAI'.
last_idx = len(samples) - 1
for idx, sample in enumerate(samples):
    relative_path = dirpath + '/' + sample['Filename']

    # The final entry of the list is read with SegmentsDataProcessor2 (which
    # takes the extra 'P6.Rec7' argument); all others use SegmentsDataProcessor.
    if idx == last_idx:
        sdp = SegmentsDataProcessor2(relative_path, 'P6.Rec7')
    else:
        sdp = SegmentsDataProcessor(relative_path)

    data = sdp.get_ai_segments()
    sample['NtAI'] = tai(data)

In [10]:
# Collect the per-sample records (Filename plus the NtAI value stored by the
# preceding loop) into a table and display it.
results_data = pd.DataFrame(samples)

results_data

Unnamed: 0,Filename,NtAI
0,I062.007.WGS.Pre_events.txt,0
1,I062.007.WGS.Tumor_events.txt,5
2,I062.015.WGS.Tumor_events.txt,0
3,I062.022.WGS.Tumor_events.txt,0
4,I062.033.WGS.Tumor_events.txt,1


### TAI with 2Mb telomeres

Parameters

In [11]:
# Size constants: tai2() compares segment coordinates against TELOMERE_SIZE.
Mb = 10 ** 6
TELOMERE_SIZE = 2 * Mb

In [13]:
# Load hs37d5.fa.fai as a tab-separated table with no header row; the raw
# table is handed to process_lengths_data() in the next cell.
lengths = pd.read_csv('datasets/hs37d5.fa.fai', sep='\t', header=None)

In [14]:
# Convert the raw table into the form shown below: indexed by Chromosome with
# a single 'Length' column.
# NOTE(review): this rebinds `lengths`, so re-running this cell on its own
# passes already-processed data back into process_lengths_data(); re-run the
# read_csv cell first.
lengths = process_lengths_data(lengths)

# data is in good format
lengths

Unnamed: 0_level_0,Length
Chromosome,Unnamed: 1_level_1
1,249250621
2,243199373
3,198022430
4,191154276
5,180915260
6,171115067
7,159138663
8,146364022
9,141213431
10,135534747


In [18]:
def tai2(data, telomere_size=None, chr_lengths=None, chromosomes=None):
    """Count segments that reach into a fixed-size telomere window.

    For each chromosome, counts the rows of ``data`` whose Start is below
    ``telomere_size`` plus the rows whose End is above
    ``chromosome length - telomere_size``, and returns the total over all
    chromosomes. A segment that satisfies both conditions is counted twice.

    Parameters
    ----------
    data : pandas.DataFrame
        Segments with 'Chromosome', 'Start' and 'End' columns.
    telomere_size : int, optional
        Width of the window at each chromosome end. Defaults to the
        notebook-level ``TELOMERE_SIZE``.
    chr_lengths : pandas.DataFrame, optional
        Table indexed by chromosome label with a 'Length' column. Defaults to
        the notebook-level ``lengths`` table.
    chromosomes : iterable, optional
        Chromosome labels to scan. Defaults to the notebook-level
        ``chromosome_names`` list.

    Returns
    -------
    int
        Number of segment ends that fall inside a telomere window.

    Raises
    ------
    KeyError
        If a chromosome label is missing from ``chr_lengths``.
    """
    # Fall back to the notebook globals only when the caller passes nothing,
    # so existing calls of the form tai2(data) keep working.
    if telomere_size is None:
        telomere_size = TELOMERE_SIZE
    if chr_lengths is None:
        chr_lengths = lengths
    if chromosomes is None:
        chromosomes = chromosome_names

    ntai = 0
    for _chr in chromosomes:
        chr_data = data.loc[data['Chromosome'] == _chr]
        chr_len = chr_lengths.loc[_chr, 'Length']

        starts_in_window = chr_data['Start'] < telomere_size
        ends_in_window = chr_data['End'] > chr_len - telomere_size
        ntai += int(starts_in_window.sum()) + int(ends_in_window.sum())

    return ntai

In [19]:
# Quick check of tai2() on the single-sample segments loaded earlier.
ntai2 = tai2(test_data)

ntai2

0

In [20]:
dirpath = 'datasets/P6.Inform'

# Fresh records for the same five event files; the loop that follows adds an
# 'NtAI' entry computed with tai2() to each record.
samples2 = [
    {'Filename': filename}
    for filename in (
        'I062.007.WGS.Pre_events.txt',
        'I062.007.WGS.Tumor_events.txt',
        'I062.015.WGS.Tumor_events.txt',
        'I062.022.WGS.Tumor_events.txt',
        'I062.033.WGS.Tumor_events.txt',
    )
]

In [21]:
# Compute tai2() for every sample file and store the result on the sample's
# record under 'NtAI'.
# The last-entry check is made against samples2 itself (the list being
# iterated) rather than against the separate `samples` list, so it stays
# correct if the two lists ever differ in length.
last_idx = len(samples2) - 1
for i, sample in enumerate(samples2):
    relative_path = dirpath + '/' + sample['Filename']

    # The final entry of the list is read with SegmentsDataProcessor2 (which
    # takes the extra 'P6.Rec7' argument); all others use SegmentsDataProcessor.
    if i == last_idx:
        sdp = SegmentsDataProcessor2(relative_path, 'P6.Rec7')
    else:
        sdp = SegmentsDataProcessor(relative_path)

    data = sdp.get_ai_segments()

    sample['NtAI'] = tai2(data)

In [22]:
# Collect the per-sample records (Filename plus the NtAI value stored by the
# preceding loop) into a table and display it.
results_data2 = pd.DataFrame(samples2)

results_data2

Unnamed: 0,Filename,NtAI
0,I062.007.WGS.Pre_events.txt,0
1,I062.007.WGS.Tumor_events.txt,5
2,I062.015.WGS.Tumor_events.txt,0
3,I062.022.WGS.Tumor_events.txt,1
4,I062.033.WGS.Tumor_events.txt,1
