In [1]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# import my modules for reading dataframes from folder /modules

import sys

sys.path.insert(0, 'modules/')
    
from tumor_data_processor import * 
from tumor_data_processor_2 import * 
from lengths_data_processor import *

In [3]:
# Load the tab-separated tumor event table. comment='#' tells pandas not to parse
# anything following a '#', so lines that start with '#' are skipped entirely.
data = pd.read_csv('datasets/P6.Inform/I062.022.WGS.Tumor_events.txt', sep='\t', comment='#')

# Display the raw events for a visual check.
data

Unnamed: 0,Chromosome Region,Event,Length,Cytoband,% of CNV Overlap,Probe Median,% Heterozygous,Probes,Count of Gene Symbols
0,"chr1:0-648,442",CN Gain,648443,p36.33,98.457688,0.378064,,13,25
1,"chr1:142,535,839-158,568,553",Allelic Imbalance,16032715,q12 - q23.1,54.242139,0.023077,100.0,10,425
2,"chr1:158,568,553-163,226,118",CN Gain,4657566,q23.1 - q23.3,36.627208,0.287899,,148,112
3,"chr1:219,528,940-249,250,621",CN Gain,29721682,q41 - q44,32.777867,0.251258,,902,302
4,"chr2:10,768,725-32,389,147",CN Gain,21620423,p25.1 - p22.3,24.545594,0.419571,,659,180
...,...,...,...,...,...,...,...,...,...
290,"chrX:153,281,693-153,945,270",High Copy Gain,663578,q28,100.000000,0.900531,,20,40
291,"chrX:153,945,270-154,722,592",CN Gain,777323,q28,100.000000,0.407783,,24,47
292,"chrX:154,722,592-155,270,560",CN Loss,547969,q28,89.647206,-0.414165,,15,8
293,"chrY:3,095,792-7,022,409",CN Loss,3926618,p11.2,10.072309,-0.692149,,12,19


In [4]:
# Convert the raw event table into a segment table.
# NOTE(review): process_tumor_data is not defined in this notebook; presumably it
# comes from one of the star-imported helpers under modules/ — confirm which one.
test_data = process_tumor_data(data)

# Display the processed segments (Chromosome, Copy Number, Length, Start, End).
test_data

Unnamed: 0,Chromosome,Copy Number,Length,Start,End
0,1,3,648442,0,648442
1,1,3,4657565,158568553,163226118
2,1,3,29721681,219528940,249250621
3,2,3,21620422,10768725,32389147
4,2,4,79537,33069397,33148934
...,...,...,...,...,...
245,21,3,809508,26800619,27610127
246,21,3,895173,28063208,28958381
247,22,3,8619233,16156754,24775987
248,22,3,1778208,36881774,38659982


In [5]:
# Summarize the column dtypes and non-null counts of the processed segment table.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Chromosome   250 non-null    object
 1   Copy Number  250 non-null    int64 
 2   Length       250 non-null    int64 
 3   Start        250 non-null    int64 
 4   End          250 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 9.9+ KB


In [6]:
# Load the tab-separated .fai index of the reference; header=None because the
# file is read without a header row.
lengths = pd.read_csv('datasets/hs37d5.fa.fai', sep='\t', header=None)

In [7]:
# Turn the raw index table into the per-chromosome length table that
# fill_segments() looks up via lengths.loc[chromosome, 'Length'].
# NOTE(review): this rebinds `lengths`, so re-running only this cell feeds the
# already-processed table back into process_lengths_data — confirm that is safe.
lengths = process_lengths_data(lengths)

# data is in good format
lengths

Unnamed: 0_level_0,Length
Chromosome,Unnamed: 1_level_1
1,249250621
2,243199373
3,198022430
4,191154276
5,180915260
6,171115067
7,159138663
8,146364022
9,141213431
10,135534747


In [8]:
# Scale factor: one million, used to express the thresholds below in "Mb".
Mb = 10 ** 6
# Segments shorter than this are filtered out by coercing(); it is also the
# largest gap tolerated between two neighbours when counting LSTs.
S_small = Mb * 3
# Minimum length each of two neighbouring segments must have to count as an LST.
LST_SMb = Mb * 10

In [9]:
def insert_row(df, _chr, cn, length, start, end, index):
    """Return a copy of ``df`` with one extra segment row placed at position ``index``.

    Parameters
    ----------
    df : pandas.DataFrame
        Segment table; it is not modified.
    _chr, cn, length, start, end
        Values for the 'Chromosome', 'Copy Number', 'Length', 'Start' and 'End'
        columns of the new row.
    index : int
        Positional slot (as used by ``iloc``) where the new row is placed; rows
        from that position onwards are shifted down by one.

    Returns
    -------
    pandas.DataFrame
        A new frame whose index is renumbered from 0.
    """
    new_segment = pd.DataFrame([{
        'Chromosome': _chr,
        'Copy Number': cn,
        'Length': length,
        'Start': start,
        'End': end,
    }])

    rows_before = df.iloc[:index]
    rows_after = df.iloc[index:]

    # ignore_index=True renumbers the result 0..n-1 in the same step.
    return pd.concat([rows_before, new_segment, rows_after], ignore_index=True)


def fill_segments(data, chr_lengths=None):
    """Add copy-number-2 filler segments around the rows of ``data``.

    For every row, a filler row with 'Copy Number' 2 is added:

    * before the first row of a chromosome, from 0 to its 'Start', when that
      'Start' is not 0;
    * between two consecutive rows of the same chromosome whose 'End' and
      'Start' differ (the filler 'Length' is ``Start - previous End``, so
      overlapping rows yield a negative length);
    * after the last row of a chromosome, from its 'End' to the chromosome
      length, when the two differ.

    "First" and "last" are decided by comparing neighbouring rows, so rows are
    assumed to be grouped by chromosome and ordered by position
    (NOTE(review): confirm against the producer of ``data``).

    Parameters
    ----------
    data : pandas.DataFrame
        Segment table with 'Chromosome', 'Start' and 'End' columns. Not modified.
    chr_lengths : pandas.DataFrame, optional
        Table indexed by chromosome label with a 'Length' column. Defaults to
        the notebook-level ``lengths`` table.

    Returns
    -------
    pandas.DataFrame
        Original rows interleaved with the fillers, index renumbered from 0.
        When nothing needs to be added, a plain copy of ``data`` is returned.
    """
    if chr_lengths is None:
        # Backward-compatible default: the table built earlier in the notebook.
        chr_lengths = lengths

    n_rows = len(data)
    chroms = data['Chromosome'].tolist()
    starts = data['Start'].tolist()
    ends = data['End'].tolist()

    # Fillers are collected first and concatenated once, instead of rebuilding
    # the whole frame on every insertion.
    fillers = []
    filler_keys = []

    def _add_filler(key, _chr, start, end):
        fillers.append({
            'Chromosome': _chr,
            'Copy Number': 2,
            'Length': end - start,
            'Start': start,
            'End': end,
        })
        filler_keys.append(key)

    # Sort keys: original row at position p gets 3p+1, a filler placed before
    # it gets 3p and a filler placed after it gets 3p+2.
    for pos in range(n_rows):
        _chr, start, end = chroms[pos], starts[pos], ends[pos]

        # first cnv region in chromosome
        if pos == 0 or chroms[pos - 1] != _chr:
            if start != 0:
                _add_filler(3 * pos, _chr, 0, start)

        # not first cnv region in chromosome
        elif ends[pos - 1] != start:
            _add_filler(3 * pos, _chr, ends[pos - 1], start)

        # last cnv region in chromosome
        if pos == n_rows - 1 or chroms[pos + 1] != _chr:
            chr_len = chr_lengths.loc[_chr, 'Length']
            if end != chr_len:
                _add_filler(3 * pos + 2, _chr, end, chr_len)

    if not fillers:
        return data.copy()

    combined = pd.concat([data, pd.DataFrame(fillers)], ignore_index=True)
    sort_keys = [3 * pos + 1 for pos in range(n_rows)] + filler_keys
    order = sorted(range(len(sort_keys)), key=sort_keys.__getitem__)

    return combined.iloc[order].reset_index(drop=True)

In [10]:
# Raise pandas' display limit to 1000 rows so long tables are shown without truncation.
pd.set_option('display.max_rows', 1000)

In [11]:
# Add copy-number-2 filler segments around the processed segments.
filled_data = fill_segments(test_data)

# Display the filled profile for a visual check.
filled_data

Unnamed: 0,Chromosome,Copy Number,Length,Start,End
0,1,3,648442,0,648442
1,1,2,157920111,648442,158568553
2,1,3,4657565,158568553,163226118
3,1,2,56302822,163226118,219528940
4,1,3,29721681,219528940,249250621
5,2,2,10768725,0,10768725
6,2,3,21620422,10768725,32389147
7,2,2,680250,32389147,33069397
8,2,4,79537,33069397,33148934
9,2,3,1388890,33148934,34537824


In [12]:
def link_segments(df, prev, _next):
    """Replace the rows ``prev`` and ``_next`` with one merged segment.

    The merged row takes 'Chromosome' and 'Copy Number' from ``prev`` and spans
    from ``prev['Start']`` to ``_next['End']``. It is placed at the positional
    slot given by ``prev.name`` and the index is renumbered from 0. Any row that
    sat between the two is left in the frame; the caller removes it.

    Parameters
    ----------
    df : pandas.DataFrame
        Segment table; it is not modified.
    prev, _next : pandas.Series
        Rows of ``df`` whose ``.name`` is their index label.

    Returns
    -------
    pandas.DataFrame
        New frame with the two rows replaced by the merged one.
    """
    slot = prev.name
    without_pair = df.drop(index=slot).drop(index=_next.name)

    merged_start = prev['Start']
    merged_end = _next['End']

    return insert_row(
        without_pair,
        prev['Chromosome'],
        prev['Copy Number'],
        merged_end - merged_start,
        merged_start,
        merged_end,
        slot,
    )
    

def coercing(data, min_length=None):
    """Iteratively remove segments shorter than ``min_length``.

    On each pass the shortest segment is picked (the first one on ties). If it
    is shorter than ``min_length`` it is dropped; when it is neither the first
    nor the last row and its two neighbours share its chromosome and have equal
    'Copy Number', the neighbours are first merged into one segment via
    ``link_segments``. The loop stops as soon as the shortest remaining segment
    is not shorter than ``min_length``, or when no segments are left.

    Parameters
    ----------
    data : pandas.DataFrame
        Segment table with 'Chromosome', 'Copy Number', 'Length', 'Start' and
        'End' columns and a 0..n-1 index. Not modified.
    min_length : int, optional
        Length threshold. Defaults to the notebook-level ``S_small``.

    Returns
    -------
    pandas.DataFrame
        The filtered table; empty (same columns) if every segment was removed.
    """
    if min_length is None:
        # Backward-compatible default: the notebook-level threshold.
        min_length = S_small

    df2 = data.copy()

    # Guarding on emptiness avoids the IndexError the lookup of the smallest
    # segment would raise once every row has been dropped (or for empty input).
    while not df2.empty:

        # get smallest segment (first occurrence on ties)
        index = df2['Length'].idxmin()
        row = df2.loc[index]

        # if there are no small segments left -> end
        if not (row['Length'] < min_length):
            break

        # not first or last segment of profile?
        if index != 0 and index != len(df2) - 1:
            prev = df2.loc[index - 1]
            _next = df2.loc[index + 1]

            # can link?
            same_chromosome = prev['Chromosome'] == _next['Chromosome'] == row['Chromosome']
            if same_chromosome and prev['Copy Number'] == _next['Copy Number']:
                # After linking, the small segment still carries label `index`,
                # so the drop below removes exactly that row.
                df2 = link_segments(df2, prev, _next)

        # delete small segment
        df2 = df2.drop(index=index).reset_index(drop=True)

    return df2


In [13]:
# Filter out segments shorter than S_small from the filled profile.
coerce_data = coercing(filled_data)

In [14]:
# Count pairs of consecutive segments that lie on the same chromosome, are both
# at least LST_SMb long, and are separated by a gap smaller than S_small.
segments = list(coerce_data.itertuples(index=False))

lsts = sum(
    1
    for current, following in zip(segments, segments[1:])
    if current.Length >= LST_SMb
    and following.Length >= LST_SMb
    and following.Chromosome == current.Chromosome
    and following.Start - current.End < S_small
)

print(lsts)

33


In [15]:
def lst(data):
    coercing(data)