In [1]:
#! /home/croth/anaconda3/bin/python

In [None]:
import argparse
## Command-line interface: one positional VCF path plus optional
## overrides for the file-name split token and the two output suffixes
parser = argparse.ArgumentParser()
parser.add_argument(
    "vcf", type=str,
    help="name of (or path) to VCF file.")
parser.add_argument(
    "-fspl", type=str, default='.vcf.gz',
    help="string to split VCF file name on for saving.")
parser.add_argument(
    "-fear", type=str, default='-ar-df.csv',
    help="file end to save allelic read depth dataframe.")
parser.add_argument(
    "-fegt", type=str, default='-gt-df.csv',
    help="file end to save genotype dataframe.")
args = parser.parse_args()

In [None]:
## Unpack the parsed command-line options into the names used below
VCF_FILE, fspl, fear, fegt = args.vcf, args.fspl, args.fear, args.fegt

In [1]:
## Bring in needed mods
import numpy as np, pandas as pd, vcfpy as vcf, os
## Set variables
### Test-case defaults for interactive use. They are applied only when
### no parsed command-line arguments (`args`) are in scope, so running
### this file as a script no longer discards the values given on the
### command line.
if 'args' not in globals():
    VCF_FILE = '../../CDX_illumina_vcf/DNX-Chr14-xl280genome-127.vcf.gz'
    fspl = '.vcf.gz'
    fear = '-ar-df.csv'
    fegt = '-gt-df.csv'

In [2]:
## Use the vcf.Reader to bring the file into python
## (`vcf` is the vcfpy module; the reader is opened on VCF_FILE and its
## records are collected into `recs` afterwards)
rdr = vcf.Reader.from_path(VCF_FILE)

In [3]:
## Read every record from the VCF reader into an array
recs = np.array(list(rdr))

In [4]:
## Number of records read from the VCF
len(recs)

25665

In [5]:
## Per-record summary columns
## Plain record attributes, pulled out one field at a time
chrom, pos, qual, ref, alt = (
    [getattr(record, field) for record in recs]
    for field in ('CHROM', 'POS', 'QUAL', 'REF', 'ALT'))
## Values stored in each record's INFO mapping
dp = [record.INFO['DP'] for record in recs]
typegv = [record.INFO['TYPE'][0] for record in recs]
## Call rate: INFO 'AN' divided by the number of calls in the record
callrate = [record.INFO['AN'] / len(record.calls) for record in recs]
## Length of each record's ALT entry
len_alt = [len(alleles) for alleles in alt]

In [9]:
## Per-sample depth, allelic read ratio and genotype for every record.
## DP, AR and GT each hold one list per record with one entry per sample;
## samples without a genotype call are recorded as NaN.
## The per-record working lists have their own names so they do not
## overwrite `dp`, the per-record INFO depth list that fills the 'Dp'
## column of the info dataframe.
DP = []
AR = []
GT = []
for rec in recs:
    rec_dp = []
    rec_ar = []
    rec_gt = []

    for s in rec:
        call_gt = s.data['GT']

        if call_gt is None:
            rec_dp.append(np.nan)
            rec_ar.append(np.nan)
            rec_gt.append(np.nan)

        else:
            allele = int(call_gt)
            depth = np.sum(s.data['AD'])
            rec_dp.append(depth)
            ## Reads for the called allele over (total reads + 1)
            rec_ar.append(int(s.data['AD'][allele]) / (depth + 1))
            rec_gt.append(allele)

    DP.append(rec_dp)
    AR.append(rec_ar)
    GT.append(rec_gt)

In [10]:
## Gather the allelic read ratios
## AR holds, per record and sample, AD[GT] / (sum(AD) + 1) rather than
## the raw AD values of the earlier comprehension kept at the end of the line
ads = AR#[[seg.data.AD for seg in rec] for rec in recs]

In [11]:
## Gather called genotypes
## GT holds, per record and sample, the genotype as an int, or NaN when
## the sample has no call
gts = GT#[[seg.data.GT for seg in rec] for rec in recs]

In [12]:
## Gather sample names
## Read them from the first record instead of the loop variable `rec`
## left over from an earlier cell, so this cell does not depend on that
## hidden state and a VCF without records gives an empty list instead of
## a NameError.
## NOTE(review): assumes every record lists the same samples in the same
## order -- confirm for this VCF.
sample_names = [seg.sample for seg in recs[0]] if len(recs) else []

In [13]:
## Info dataframe: build one labelled row per field, then transpose so
## each record becomes a row and each field a column
data_df = pd.DataFrame(
    [chrom, pos, qual, callrate, ref, alt, len_alt, dp, typegv],
    index=['Chrom', 'Pos', 'Qual', 'Callrate',
           'Ref', 'Alt', 'Altlen', 'Dp', 'Type'],
).transpose()

In [14]:
## Allelic read table: the record info columns followed by one column
## per sample, with missing entries filled with NaN
tempads = pd.DataFrame(ads, columns=sample_names)
data_ad = pd.concat([data_df, tempads], axis=1).fillna(value=np.nan)

In [15]:
## Save the allelic read dataframe next to the VCF: the file name up to
## the first `fspl`, followed by the `fear` suffix
data_ad.to_csv(VCF_FILE.partition(fspl)[0] + fear)

In [16]:
## Genotype table with one column per sample; '.' entries become NaN
tempgt = pd.DataFrame(gts, columns=sample_names).replace('.', np.nan)

In [17]:
## Find all the unique non-nan genotypes
## Every called value is considered, including those in records where
## some samples are missing. Dropping whole rows first would skip them,
## and with no complete row at all np.concatenate would have nothing to
## join and raise ValueError.
gt_values = tempgt.values.ravel()
unique_genotypes = np.unique(gt_values[~pd.isnull(gt_values)])

In [18]:
## Replace each genotype value with its float version
for genotype in unique_genotypes:
    tempgt = tempgt.replace(genotype, float(int(genotype)))

In [19]:
## Genotype table: the record info columns followed by the per-sample
## genotypes, with any '.' entries turned into NaN
data_gt = pd.concat([data_df, tempgt], axis=1).replace('.', np.nan)

In [20]:
## Save the genotype dataframe next to the VCF: the file name up to the
## first `fspl`, followed by the `fegt` suffix
data_gt.to_csv(VCF_FILE.partition(fspl)[0] + fegt)