In [4]:
import numpy as np, pandas as pd, vcfpy
from matplotlib import pyplot as plt
%matplotlib inline

In [5]:
vcfpath = '/home/croth/Bt22xFtc555-1.vcf.gz'

In [6]:
rdr = vcfpy.Reader.from_path(vcfpath)

In [54]:
samples = rdr.header.samples.names

In [56]:
header = ['CHROM','POS','ID','REF','ALT','QUAL',
          'FILTER','INFO','FORMAT'] + samples

In [24]:
recs = [r for r in rdr]

In [25]:
len(recs)

304888

In [61]:
## Gather info on recs
chrom = [rec.CHROM for rec in recs]
pos = [rec.POS for rec in recs]
qual = [rec.QUAL for rec in recs]
callrate = [rec.INFO['AN']/len(rec.calls) for rec in recs]
ref = [rec.REF for rec in recs]
alt = [rec.ALT for rec in recs]
len_alt = [len(a) for a in alt]
DP = [rec.INFO['DP'] for rec in recs]
typegv = [rec.INFO['TYPE'][0] for rec in recs]

In [49]:
#dps = pd.DataFrame([[s.data['DP'] if s.data['GT']!=None else 0 for s in r] 
#                    for r in recs],columns=rdr.header.samples.names)

In [50]:
DP = []
AR = []
GT = []
for rec in recs:
    dp = []
    ar = []
    gt = []
    
    for s in rec:
        
        if s.data['GT'] is None:
            dp.append(np.nan)
            ar.append(np.nan)
            gt.append(np.nan)
            
        else:
            dp.append(np.sum(s.data['AD']))
            ar.append(int(s.data['AD'][int(s.data['GT'])])/(
                        np.sum(s.data['AD'])+1))
            gt.append(int(s.data['GT']))
            
    DP.append(dp)
    AR.append(ar)
    GT.append(gt)

In [57]:
dps = pd.DataFrame(DP,columns=samples)

In [59]:
dps.head()

Unnamed: 0,PMY2650,PMY2649
0,,3.0
1,,8.0
2,,8.0
3,,22.0
4,,19.0


In [None]:
alts = ['.'.join([c.value for c in a]) for a in alt]

In [70]:
dps['Chrom'] = chrom
dps['Pos'] = pos
dps['Ref'] = ref
dps['Alt'] = alts
dps['DP'] = DP

In [74]:
GTdf = pd.DataFrame(GT,columns=[s+'_GT' for s in samples])
GTdf.head()

Unnamed: 0,PMY2650_GT,PMY2649_GT
0,,1.0
1,,1.0
2,,1.0
3,,1.0
4,,1.0


In [76]:
for s in GTdf.columns:
    dps[s] = GTdf[s]

In [80]:
dps[(GTdf.columns)].dropna().shape

(293205, 2)

In [92]:
dps_c = dps.dropna()

In [96]:
diffs = dps_c[(dps_c[GTdf.columns[0]] != dps_c[GTdf.columns[1]])]

In [98]:
diffs.groupby('Chrom').count()

Unnamed: 0_level_0,PMY2650,PMY2649,Pos,Ref,Alt,DP,PMY2650_GT,PMY2649_GT
Chrom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CP003820.1,7503,7503,7503,7503,7503,7503,7503,7503
CP003821.1,5645,5645,5645,5645,5645,5645,5645,5645
CP003822.1,4295,4295,4295,4295,4295,4295,4295,4295
CP003823.1,4036,4036,4036,4036,4036,4036,4036,4036
CP003824.1,7642,7642,7642,7642,7642,7642,7642,7642
CP003825.1,5877,5877,5877,5877,5877,5877,5877,5877
CP003826.1,6001,6001,6001,6001,6001,6001,6001,6001
CP003827.1,4684,4684,4684,4684,4684,4684,4684,4684
CP003828.1,4821,4821,4821,4821,4821,4821,4821,4821
CP003829.1,4856,4856,4856,4856,4856,4856,4856,4856


In [97]:
diffs.shape

(69757, 9)

In [119]:
chrom1_bed = pd.read_csv('/home/croth/Documents/DirtyDozen/BED/CP003820.1.bed',
                         sep='\t',header=None)
chrom1_bed.columns = ['Chrom','Left','Right']
chrom1_bed.head()

Unnamed: 0,Chrom,Left,Right
0,CP003820.1,24687,24692
1,CP003820.1,24692,24695
2,CP003820.1,24698,24701
3,CP003820.1,25979,25982
4,CP003820.1,26118,26121


In [120]:
chrom1_bed.shape

(5727, 3)

In [121]:
chrom1_vp = np.concatenate([np.arange(c.Left,c.Right+1) for i,c in chrom1_bed.iterrows()])

In [122]:
len(chrom1_vp)

30465

In [123]:
chrom1_bed.shape

(5727, 3)

In [126]:
diffs[(diffs.Chrom=='CP003820.1') & diffs.Pos.isin(chrom1_vp) ]

Unnamed: 0,PMY2650,PMY2649,Chrom,Pos,Ref,Alt,DP,PMY2650_GT,PMY2649_GT
47,2.0,3.0,CP003820.1,24690,G,A,5,0.0,1.0
48,2.0,3.0,CP003820.1,24693,A,G,5,1.0,0.0
49,3.0,3.0,CP003820.1,24699,T,C,6,0.0,1.0
72,10.0,11.0,CP003820.1,25980,C,T,22,1.0,0.0
76,6.0,4.0,CP003820.1,26119,A,G,10,0.0,1.0
...,...,...,...,...,...,...,...,...,...
35586,6.0,7.0,CP003820.1,2253727,A,G,13,1.0,0.0
35715,7.0,11.0,CP003820.1,2260336,A,T,18,0.0,1.0
35801,8.0,7.0,CP003820.1,2263666,GGTAC,AGTAT.GGTAT,15,2.0,1.0
35807,11.0,9.0,CP003820.1,2264002,G,A,20,0.0,1.0


In [125]:
diffs[(diffs.Chrom=='CP003820.1')].shape

(7503, 9)

In [107]:
chrom1_bed.shape

(4519, 3)