In [1]:
import pandas as pd
import numpy as np
import math
import sys

In [2]:
# Load the per-cohort CNV tables (tab-separated text files).
# Paths are relative to the notebook's working directory.
FILE_svk = 'cnvs_SVK_hg38.tsv'
FILE_cz = 'cnvs_CZ_hg38.tsv'
FILE_hu = 'cnvs_HU_hg38.tsv'

# Read the three files in the same order as they are declared above.
tab_svk_all, tab_cz_all, tab_hu_all = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (FILE_svk, FILE_cz, FILE_hu)
)

In [3]:
# Keep only the rows whose `machine` column equals 'NEXTSEQ'.
# Boolean-mask indexing produces new frames; the *_all tables are not modified.
tab_svk, tab_cz, tab_hu = (
    full_tab[full_tab['machine'] == 'NEXTSEQ']
    for full_tab in (tab_svk_all, tab_cz_all, tab_hu_all)
)

In [4]:
# Row count of each NEXTSEQ-filtered CNV table, shown as a tuple of labelled strings.
tuple(
    f'{cohort}: {len(tab)}'
    for cohort, tab in (('SVK', tab_svk), ('CZ', tab_cz), ('HU', tab_hu))
)

('SVK: 3585', 'CZ: 622', 'HU: 855')

In [5]:
# Number of distinct `sampleId` values in each NEXTSEQ-filtered table.
# dropna=False keeps a missing sampleId counted as its own value, matching
# what len(series.unique()) reports.
tuple(
    f"{cohort}: {tab['sampleId'].nunique(dropna=False)}"
    for cohort, tab in (('SVK', tab_svk), ('CZ', tab_cz), ('HU', tab_hu))
)

('SVK: 2900', 'CZ: 510', 'HU: 632')

In [6]:
# Distribution of the number of CNV rows per sample, for each cohort.
def _cnv_count_distribution(tab):
    """Return how many samples have 1, 2, 3, ... rows in `tab`.

    The first value_counts() gives the number of rows per sampleId; the
    second counts how many samples share each row count. The result is
    ordered by row count.
    """
    rows_per_sample = tab['sampleId'].value_counts()
    samples_per_row_count = rows_per_sample.value_counts()
    return samples_per_row_count.sort_index()


# The separator strings carry the line breaks between the three sections.
print(
    'SVK: \n', _cnv_count_distribution(tab_svk),
    '\nCZ: \n', _cnv_count_distribution(tab_cz),
    '\nHU: \n', _cnv_count_distribution(tab_hu),
)

SVK: 
 count
1     2332
2      490
3       70
4        5
5        1
6        1
32       1
Name: count, dtype: int64 
CZ: 
 count
1    409
2     91
3      9
4      1
Name: count, dtype: int64 
HU: 
 count
1     475
2     123
3      26
4       2
6       2
7       1
9       1
10      2
Name: count, dtype: int64


In [7]:
# Number of deletions (rows with level < 0) per chromosome, for each cohort.
def _deletions_per_chromosome(tab):
    """Count rows of `tab` with negative `level` per `chromosome` value,
    ordered by chromosome."""
    deleted_chromosomes = tab.loc[tab['level'] < 0, 'chromosome']
    return deleted_chromosomes.value_counts().sort_index()


# The separator strings carry the line breaks between the three sections.
print(
    'SVK: \n', _deletions_per_chromosome(tab_svk),
    '\nCZ: \n', _deletions_per_chromosome(tab_cz),
    '\nHU: \n', _deletions_per_chromosome(tab_hu),
)

SVK: 
 chromosome
0      49
1      71
2      71
3      56
4      58
5      43
6     101
7      49
8      60
9      48
10     21
11     42
12     32
13     21
14     38
15     18
16     44
17     22
18      4
19     21
20     10
21     23
22    105
Name: count, dtype: int64 
CZ: 
 chromosome
0     10
1      4
2     15
3      9
4     10
5      6
6     23
7     10
8     15
9      9
10     4
11     3
12     5
13     1
14     7
15     3
16     7
17     1
19     1
21     1
22    18
Name: count, dtype: int64 
HU: 
 chromosome
0     15
1     19
2     20
3     19
4     11
5     13
6     30
7     15
8     14
9     10
10     5
11    14
12     9
13     4
14     7
15     4
16     8
17     1
18     1
19     5
21     4
22    16
Name: count, dtype: int64


In [8]:
# Number of amplifications (rows with level > 0) per chromosome, for each cohort.
def _amplifications_per_chromosome(tab):
    """Count rows of `tab` with positive `level` per `chromosome` value,
    ordered by chromosome."""
    amplified_chromosomes = tab.loc[tab['level'] > 0, 'chromosome']
    return amplified_chromosomes.value_counts().sort_index()


# The separator strings carry the line breaks between the three sections.
print(
    'SVK: \n', _amplifications_per_chromosome(tab_svk),
    '\nCZ: \n', _amplifications_per_chromosome(tab_cz),
    '\nHU: \n', _amplifications_per_chromosome(tab_hu),
)

SVK: 
 chromosome
0     143
1     134
2     123
3     101
4      88
5     300
6     147
7     194
8      72
9      65
10    146
11    120
12     47
13     62
14    143
15     56
16     51
17     28
18     44
19     34
20     14
21    172
22    294
Name: count, dtype: int64 
CZ: 
 chromosome
0     35
1     36
2     24
3     12
4     20
5     49
6     22
7     24
8      7
9      8
10    35
11    17
12     2
13     8
14    29
15    13
16    14
17     9
18    10
19     7
20     3
21    31
22    45
Name: count, dtype: int64 
HU: 
 chromosome
0     42
1     28
2     26
3     23
4     28
5     66
6     35
7     51
8     12
9     16
10    45
11    42
12     2
13    10
14    22
15    14
16    12
17     8
18    17
19     9
20     2
21    25
22    76
Name: count, dtype: int64


In [9]:
# Split each cohort into amplifications (level > 0) and deletions (level < 0)
# and count how many rows share exactly the same coordinates.
# Rows whose level is 0 or missing fall into neither group.
CNV_KEY_COLUMNS = ['chromosome', 'start_dist', 'end', 'length']


def _count_identical_cnvs(tab, keep_rows):
    """Count rows of `tab` that share the same CNV_KEY_COLUMNS values.

    Parameters
    ----------
    tab : pandas.DataFrame
        CNV table containing the CNV_KEY_COLUMNS columns.
    keep_rows : boolean pandas.Series
        Mask aligned with `tab` selecting the rows to aggregate.

    Returns
    -------
    pandas.DataFrame
        One row per distinct key combination. The occurrence count is in the
        column labelled `0`, because size() yields an unnamed Series and
        reset_index() is called without a name.
    """
    selected = tab.loc[keep_rows, CNV_KEY_COLUMNS]
    return selected.groupby(CNV_KEY_COLUMNS).size().reset_index()


sk_amp = _count_identical_cnvs(tab_svk, tab_svk['level'] > 0)
cz_amp = _count_identical_cnvs(tab_cz, tab_cz['level'] > 0)
hu_amp = _count_identical_cnvs(tab_hu, tab_hu['level'] > 0)

sk_del = _count_identical_cnvs(tab_svk, tab_svk['level'] < 0)
cz_del = _count_identical_cnvs(tab_cz, tab_cz['level'] < 0)
hu_del = _count_identical_cnvs(tab_hu, tab_hu['level'] < 0)

In [10]:
# Display the SVK amplification table built in the previous cell: one row per
# distinct (chromosome, start_dist, end, length) combination; the column
# labelled `0` holds the number of CNV rows with those values.
sk_amp

Unnamed: 0,chromosome,start_dist,end,length,0
0,0,820000,1140000,16,3
1,0,1160000,2020000,43,1
2,0,2480000,2900000,21,1
3,0,2480000,2920000,22,1
4,0,2500000,2900000,20,1
...,...,...,...,...,...
1633,22,152560000,152780000,11,1
1634,22,152980000,153400000,21,1
1635,22,153900000,154480000,29,1
1636,22,154120000,154540000,21,1
