In [1]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# import my modules for reading dataframes from folder /modules

import sys

sys.path.insert(0, 'modules/')
    
from tumor_data_processor import * 
from tumor_data_processor_2 import * 
from lengths_data_processor import *

In [3]:
data = pd.read_csv('datasets/P6.Inform/I062.022.WGS.Tumor_events.txt', sep='\t', comment='#')

data

Unnamed: 0,Chromosome Region,Event,Length,Cytoband,% of CNV Overlap,Probe Median,% Heterozygous,Probes,Count of Gene Symbols
0,"chr1:862,439-1,313,714",CN Gain,451276,p36.33,100.000000,0.254980,,15,35
1,"chr1:2,323,340-2,500,322",CN Gain,176983,p36.32,98.491937,0.372007,,6,8
2,"chr1:9,840,751-10,297,200",CN Loss,456450,p36.22,4.657695,-0.247301,,14,8
3,"chr1:28,313,363-29,454,028",CN Loss,1140666,p35.3,5.721575,-0.252082,,34,25
4,"chr1:32,265,271-32,656,728",CN Loss,391458,p35.2 - p35.1,6.300564,-0.241056,,12,7
...,...,...,...,...,...,...,...,...,...
374,"chrX:153,411,122-153,945,270",High Copy Gain,534149,q28,100.000000,0.746978,,16,37
375,"chrX:153,945,270-154,722,592",CN Gain,777323,q28,100.000000,0.346073,,24,47
376,"chrX:154,722,592-155,270,560",CN Loss,547969,q28,89.647206,-0.355816,,15,8
377,"chrY:3,095,792-7,022,409",CN Loss,3926618,p11.2,10.072309,-0.745243,,12,19


In [4]:
test_data = process_tumor_data(data)

test_data

Unnamed: 0,Chromosome,Copy Number,Length,Start,End
0,1,3,451275,862439,1313714
1,1,3,176982,2323340,2500322
2,1,1,456449,9840751,10297200
3,1,1,1140665,28313363,29454028
4,1,1,391457,32265271,32656728
...,...,...,...,...,...
338,22,1,539799,28754409,29294208
339,22,1,423407,31741627,32165034
340,22,3,459941,37448717,37908658
341,22,1,1851735,40438665,42290400


In [5]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343 entries, 0 to 342
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Chromosome   343 non-null    object
 1   Copy Number  343 non-null    int64 
 2   Length       343 non-null    int64 
 3   Start        343 non-null    int64 
 4   End          343 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 13.5+ KB


In [6]:
lengths = pd.read_csv('datasets/hs37d5.fa.fai', sep='\t', header=None)

In [7]:
lengths = process_lengths_data(lengths)

# data is in good format
lengths

Unnamed: 0_level_0,Length
Chromosome,Unnamed: 1_level_1
1,249250621
2,243199373
3,198022430
4,191154276
5,180915260
6,171115067
7,159138663
8,146364022
9,141213431
10,135534747


In [8]:
Mb = 1000000
S_SMALL = 3 * Mb

In [47]:
def insert_row(df, _chr, length, start, end, index):
    normal_segment = pd.DataFrame({
        'Chromosome': [ _chr ],
        'Copy Number': [ 2 ],
        'Length': [length],
        'Start': [ start ],
        'End': [ end ]
    })
    
    print(normal_segment)
            
    return pd.concat([df.iloc[:index], normal_segment, df.iloc[index:]]).reset_index(drop=True)


def fill_segments(data):
    df = data.copy()
    index_df = 0

    for index, row in test_data.iterrows():

        # first cnv region in chromosome
        if index == 0 or test_data.loc[index-1, 'Chromosome'] != test_data.loc[index, 'Chromosome']:
            if row['Start'] != 0:
                df = insert_row(df, row['Chromosome'], row['Start'], 0, row['Start'], index_df)
                index_df += 1

        # not first cnv region in chromosome 
        elif test_data.loc[index-1, 'End'] != test_data.loc[index, 'Start']:
            prev = test_data.loc[index-1]

            df = insert_row(df, row['Chromosome'], row['Start'] - prev['End'], prev['End'], row['Start'], index_df)
            index_df += 1

        # last cnv region in chromosome
        if index == len(test_data) - 1 or test_data.loc[index+1, 'Chromosome'] != test_data.loc[index, 'Chromosome']:

            chr_len = lengths.loc[row['Chromosome'], 'Length']
            if row['End'] != chr_len:      
                df = insert_row(df, row['Chromosome'], chr_len - row['End'], row['End'], chr_len, index_df)
                index_df += 1

        index_df += 1

    return df

In [10]:
pd.set_option('display.max_rows', 1000)

In [48]:
filled_data = fill_segments(test_data)

filled_data

  Chromosome  Copy Number  Length  Start     End
0          1            2  862439      0  862439
  Chromosome  Copy Number   Length    Start      End
0          1            2  1009626  1313714  2323340
  Chromosome  Copy Number   Length    Start      End
0          1            2  7340429  2500322  9840751
  Chromosome  Copy Number    Length     Start       End
0          1            2  18016163  10297200  28313363
  Chromosome  Copy Number   Length     Start       End
0          1            2  2811243  29454028  32265271
  Chromosome  Copy Number   Length     Start       End
0          1            2  6635744  32656728  39292472
  Chromosome  Copy Number  Length     Start       End
0          1            2  897788  39678475  40576263
  Chromosome  Copy Number   Length     Start       End
0          1            2  4723220  41104362  45827582
  Chromosome  Copy Number     Length     Start        End
0          1            2  103509017  46439640  149948657
  Chromosome  Copy Numbe

Unnamed: 0,Chromosome,Copy Number,Length,Start,End
0,1,2,862439,0,862439
1,1,3,451275,862439,1313714
2,1,2,1009626,1313714,2323340
3,1,3,176982,2323340,2500322
4,1,2,7340429,2500322,9840751
5,1,1,456449,9840751,10297200
6,1,2,18016163,10297200,28313363
7,1,1,1140665,28313363,29454028
8,1,2,2811243,29454028,32265271
9,1,1,391457,32265271,32656728


In [44]:
df1 = df2 = filled_data.copy()

In [52]:
def link_segments(df, prev, _next):
    new_segment = pd.DataFrame({
        'index': [ prev['index'].values[0] ],
        'Chromosome': [ prev['Chromosome'].values[0] ],
        'Copy Number': [ prev['Copy Number'].values[0] ],
        'Length': [ _next['End'].values[0] - prev['Start'].values[0] ],
        'Start': [ prev['Start'].values[0] ],
        'End': [ _next['End'].values[0] ]
    })
            
    df = df.loc[ (df['index'] != prev['index'].values[0]) & (df['index'] != _next['index'].values[0]) ]
    df = df.append(new_segment, ignore_index=True)
    
    return df
    

df1 = df2 = filled_data.copy()

df2 = df2.sort_values(by=['Length']).reset_index()
while True:
    
    row = df2.loc[0]
    if row['Length'] < S_SMALL:
        prev = df2[ (df2['End'] == row['Start']) & (df2['Chromosome'] == row['Chromosome']) ]
        _next = df2[ (df2['Start'] == row['End']) & (df2['Chromosome'] == row['Chromosome']) ]
        
        if not prev.empty and not _next.empty and prev['Copy Number'].values[0] == _next['Copy Number'].values[0]:
            df2 = link_segments(df2, prev, _next)
        
        df2 = df2.drop(index=0)
        df2 = df2.sort_values(by=['Length']).reset_index(drop=True)
        
    else:
        break
        
df1 = df2.sort_values(by=['index']).reset_index(drop=True)
df1 = df1.drop(columns=['index'])

df1

Unnamed: 0,Chromosome,Copy Number,Length,Start,End
0,1,2,159064820,0,159064820
1,1,3,4098498,159064820,163163318
2,1,2,56332475,163163318,219495793
3,1,3,29754828,219495793,249250621
4,2,2,10805837,0,10805837
5,2,3,21411345,10805837,32217182
6,2,2,210982191,32217182,243199373
7,3,1,49286780,0,49286780
8,3,1,3391549,50714661,54106210
9,3,2,5846600,54106210,59952810


In [12]:
def coerce(data):
    df1 = df2 = data.copy()
    
    

def lst(data):
    coercing(data)