In [1]:
import vcf

In [2]:
import pandas as pd

In [3]:
# Parse the VCF once and build a site-by-sample table: one row per record,
# one column per sample, each cell holding the object returned by
# site.genotype(sample).
with open('../test/data/Challenge_data.vcf') as vcf_fh:
    vcf_reader = vcf.Reader(vcf_fh)
    samples = vcf_reader.samples
    # The reader is consumed while the file is still open.
    df = pd.DataFrame(
        [list(map(site.genotype, samples)) for site in vcf_reader],
        columns=samples,
    )

In [4]:
def xtab(func, data=None):
    """Cross-tabulate the result of ``func`` applied to every cell, per column.

    Parameters
    ----------
    func : callable
        Called once for each cell of the frame; its return value becomes a
        row label of the resulting table.
    data : pandas.DataFrame, optional
        Frame to summarise. When omitted, the notebook-level ``df`` is used,
        which keeps existing ``xtab(func)`` calls working.

    Returns
    -------
    pandas.DataFrame
        Counts with one row per distinct ``func`` result (axis named
        ``index``) and one column per column of the input frame (axis named
        ``columns``).
    """
    frame = df if data is None else data
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the
    # old name is deprecated; look the method up on the class so either
    # pandas generation works (and a column called "map" cannot shadow it).
    cell_map = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap
    # Long form: one row per original cell, tagged with its source column.
    long_form = cell_map(frame, func).melt(var_name='columns', value_name='index')
    return pd.crosstab(index=long_form['index'], columns=long_form['columns'])

In [5]:
# check pass/filter: True = the site carries no failing filter.
# PyVCF stores FILTER as None when the column is '.' and as an empty list
# when it is 'PASS'; `not FILTER` treats both as passing, whereas the
# previous `is None` test would have counted PASS records as filtered.
xtab(lambda gt: not gt.site.FILTER)

columns,normal,vaf5
index,Unnamed: 1_level_1,Unnamed: 2_level_1
True,6977,6977


In [6]:
# tally the ploidy of every call, per sample
xtab(lambda call: call.ploidity)

columns,normal,vaf5
index,Unnamed: 1_level_1,Unnamed: 2_level_1
3,6977,6977


In [7]:
# distribution of the GT field across calls, per sample
xtab(lambda call: call['GT'])

columns,normal,vaf5
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0/0/0,2119,2117
0/0/1,2642,2644
0/0/2,13,13
0/0/3,1,1
0/1/1,386,386
0/1/2,5,5
1/1/1,1788,1788
1/1/2,7,7
1/2/2,16,16


In [8]:
# check variant types
# INFO['TYPE'] is a list (one entry per alternate allele); lists are not
# hashable, so convert each to a tuple before counting distinct combinations.
df.normal.apply(lambda gt: tuple(gt.site.INFO['TYPE'])).value_counts()

[snp]                             5376
[del]                              704
[ins]                              309
[del, ins]                         308
[complex]                           92
[del, ins, snp]                     36
[mnp]                               35
[complex, snp]                      15
[del, del, ins]                     13
[ins, snp]                          13
[del, ins, ins]                     13
[del, del, ins, ins]                11
[del, snp]                          10
[del, del]                           6
[ins, ins]                           6
[del, del, ins, snp]                 3
[complex, complex]                   3
[mnp, snp]                           3
[del, ins, ins, snp]                 2
[complex, complex, snp]              2
[complex, complex, del, ins]         2
[del, del, ins, ins, ins]            2
[del, del, del]                      2
[del, ins, ins, snp, snp]            2
[del, snp, snp]                      1
[snp, snp]               

In [9]:
# tally the var_subtype of each site (read through the `normal` column)
(
    df.normal
    .apply(lambda call: call.site.var_subtype)
    .value_counts()
)

ts         3799
tv         1534
del         706
ins         477
unknown     461
Name: normal, dtype: int64

In [10]:
# check chrom distribution: fraction of sites on each chromosome
(
    df.normal
    .apply(lambda call: call.site.CHROM)
    .value_counts(normalize=True)
)

1     0.097176
6     0.069227
17    0.060628
12    0.058621
2     0.058048
5     0.056185
7     0.055181
3     0.053175
11    0.052028
19    0.049162
8     0.041708
9     0.040705
16    0.040562
4     0.038412
14    0.038125
13    0.033252
10    0.031676
X     0.028522
22    0.022359
20    0.021069
15    0.020926
18    0.017199
21    0.016053
Name: normal, dtype: float64

In [11]:
# are genos always the same for normal and vaf5?
# keep only the rows where the two samples' GT values differ
df.loc[df.apply(lambda row: row['normal']['GT'] != row['vaf5']['GT'], axis=1)]

Unnamed: 0,normal,vaf5
3280,"Call(sample=normal, CallData(GT=0/0/0, GQ=160....","Call(sample=vaf5, CallData(GT=0/0/1, GQ=46.362..."
3281,"Call(sample=normal, CallData(GT=0/0/0, GQ=135....","Call(sample=vaf5, CallData(GT=0/0/1, GQ=-0.0, ..."


In [12]:
# for the sites flagged is_snp, tally the span (end - start) of each site
(
    df.normal
    .loc[df.normal.apply(lambda call: call.site.is_snp)]
    .apply(lambda call: call.site.end - call.site.start)
    .value_counts()
)

1    5334
Name: normal, dtype: int64