In [1]:
## Bring in needed mods
import numpy as np, pandas as pd, vcfpy, glob
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
## Glob pattern (with wildcard) for the VCF files; collect the matches in
## sorted order and show how many were found
vcfpath = '/home/croth/SELFFILAM/VCF/AE0173*.vcf.gz'
vcfs = glob.glob(vcfpath)
vcfs.sort()
len(vcfs)

14

In [3]:
## Preview the label each VCF file will be saved under
vcfsplit = '.vcf'
for i,vcffile in enumerate(vcfs):
    ## Text before the first '.vcf', then the last '/'-separated component
    before_ext = vcffile.split(vcfsplit)[0]
    basename = before_ext.split('/')[-1]
    ## Append the 1-based position of the file in vcfs
    savechrom = '{}_{}'.format(basename, i+1)
    print(savechrom)

AE017341.1.B3502.progeny_1
AE017342.1.B3502.progeny_2
AE017343.1.B3502.progeny_3
AE017344.1.B3502.progeny_4
AE017345.1.B3502.progeny_5
AE017346.1.B3502.progeny_6
AE017347.1.B3502.progeny_7
AE017348.1.B3502.progeny_8
AE017349.1.B3502.progeny_9
AE017350.1.B3502.progeny_10
AE017351.1.B3502.progeny_11
AE017352.1.B3502.progeny_12
AE017353.1.B3502.progeny_13
AE017356.1.B3502.progeny_14


In [4]:
## Iterate through each VCF, parse the needed info, and save it as CSV tables
for i,vcffile in enumerate(vcfs):
    ## Saving name: text before the first '.vcf', last path component,
    ## plus the 1-based position of the file in vcfs
    savechrom = vcffile.split(vcfsplit)[0].split('/')[-1]+'_%s'%(i+1)

    ## Load in variant records and sample names; close the reader even if
    ## parsing fails so the underlying file handle is not leaked
    rdr = vcfpy.Reader.from_path(vcffile)
    try:
        samples = rdr.header.samples.names
        recs = [rec for rec in rdr]
    finally:
        rdr.close()

    ## Gather per-record info
    chrom = [rec.CHROM for rec in recs]
    pos = [rec.POS for rec in recs]
    qual = [rec.QUAL for rec in recs]
    ## INFO 'AN' divided by the number of calls in the record
    callrate = [rec.INFO['AN']/len(rec.calls) for rec in recs]
    ref = [rec.REF for rec in recs]
    alt = [rec.ALT for rec in recs]
    Nalt = [len(a) for a in alt]
    ## Longest and shortest ALT allele string per record
    len_alt_max = [np.max([len(k.value) for k in a]) for a in alt]
    ## FIX: this previously used np.max, so 'Minlen' duplicated 'Maxlen'
    len_alt_min = [np.min([len(k.value) for k in a]) for a in alt]
    depth = [rec.INFO['DP'] for rec in recs]
    ## Only the first entry of INFO 'TYPE' is kept
    typegv = [rec.INFO['TYPE'][0] for rec in recs]
    ## REF and ALT allele strings joined by '.'
    alleles = ['.'.join([r]+[k.value for k in a]) for r,a in zip(ref,alt)]

    ## Gather info per record and per sample call
    DP = []
    AR = []
    GT = []
    for rec in recs:
        dp = []
        ar = []
        gt = []
        for s in rec:
            if s.data['GT'] is None:
                ## Missing genotype: record NaN in all three tables
                dp.append(np.nan)
                ar.append(np.nan)
                gt.append(np.nan)
            else:
                ## GT is used directly as an index into AD
                ## NOTE(review): assumes a single-allele (haploid) GT - confirm
                called = int(s.data['GT'])
                total = np.sum(s.data['AD'])
                dp.append(total)
                ## Depth of the called allele over (summed AD + 1);
                ## presumably the +1 guards against a zero denominator - confirm
                ar.append(int(s.data['AD'][called])/(total+1))
                gt.append(called)

        DP.append(dp)
        AR.append(ar)
        GT.append(gt)

    ## Make dataframes: one row per record, one column per sample
    dps = pd.DataFrame(DP,columns=samples)
    ars = pd.DataFrame(AR,columns=samples)
    gts = pd.DataFrame(GT,columns=samples)
    ## Built row-wise with the field names as index, then transposed so each
    ## record becomes a row
    info = pd.DataFrame([chrom,pos,qual,callrate,Nalt,
                     alleles,len_alt_max,len_alt_min,
                     depth,typegv],
                    index=['Seqid','Pos','Qual','Callrate','Nallele',
                           'Alleles','Maxlen','Minlen','Depth','Type']).T

    ## Save dataframes
    dps.to_csv('../GENOTYPE/DP/%s_depths.csv.gz'%savechrom)
    ars.to_csv('../GENOTYPE/AF/%s_allele_ratios.csv.gz'%savechrom)
    gts.to_csv('../GENOTYPE/GT/%s_genotypes.csv.gz'%savechrom)
    info.to_csv('../GENOTYPE/INFO/%s_info_cols.csv.gz'%savechrom)

## Records with more than one ALT allele, taken from whatever recs/Nalt
## currently hold (the last VCF parsed by the loop above)
test = [rec for rec,n in zip(recs,Nalt) if n>1]

## For the record at index 20, print for every sample with a genotype the
## called-allele ratio next to the ratio of the deepest allele
for s in test[20]:
    if s.data['GT'] is None:
        continue
    allele_depths = s.data['AD']
    denom = np.sum(allele_depths)+1
    called_ratio = int(allele_depths[int(s.data['GT'])])/denom
    top_ratio = np.max(allele_depths)/denom
    print(called_ratio,top_ratio)