In [1]:
## Bring in needed mods
import numpy as np, pandas as pd, vcfpy as vcf, os

In [2]:
## Input settings
## Point VCF_FILE at a different file to run on other chromosomes
VCF_FILE = '../DATA/GENOTYPE/CDX_illumina_vcf/DNX-Chr14-xl280genome-127.vcf.gz'
fspl = '.vcf.gz'
fear = '-ar-df.csv.gz'
fegt = '-gt-df.csv.gz'

## Shared stem of both save paths: insert a DFRAME/ folder in front of every
## 'DNX' in the input path and keep only the text before the VCF extension
_save_stem = VCF_FILE.replace('DNX', 'DFRAME/DNX').partition(fspl)[0]

## Save paths for the genotype and allelic read depth ratio dataframes
gtsavepath = _save_stem + fegt
arsavepath = _save_stem + fear

In [3]:
## Use the vcf.Reader to bring the file into python
rdr = vcf.Reader.from_path(VCF_FILE)

## Load every record in the VCF file into memory, then close the reader so
## the underlying file handle is released even if parsing fails part-way
try:
    recs = np.array([rec for rec in rdr])
finally:
    rdr.close()
len(recs)

25665

In [4]:
## Collect the per-record annotations in a single pass over the records
chrom, pos, qual, callrate = [], [], [], []
ref, alt, len_alt, dp, typegv = [], [], [], [], []
for record in recs:
    chrom.append(record.CHROM)
    pos.append(record.POS)
    qual.append(record.QUAL)
    ## INFO['AN'] divided by the number of sample calls in the record
    callrate.append(record.INFO['AN']/len(record.calls))
    ref.append(record.REF)
    alt.append(record.ALT)
    ## Number of alternative alleles listed for the record
    len_alt.append(len(record.ALT))
    ## Depth taken from the record's INFO field
    dp.append(record.INFO['DP'])
    ## Only the first entry of INFO['TYPE'] is kept
    typegv.append(record.INFO['TYPE'][0])

In [5]:
## Per-sample total read depth (DP), allelic read depth ratio (AR) and
## genotype (GT): one inner list per record, one entry per sample
DP = []
AR = []
GT = []
for rec in recs:
    ## NOTE: these per-record lists must not be called `dp`; that name holds
    ## the per-record INFO depths used later for the 'Dp' column of the info
    ## dataframe, and rebinding it here would silently corrupt that column
    rec_dp = []
    rec_ar = []
    rec_gt = []

    for call in rec:

        gt_call = call.data['GT']

        if gt_call is None:
            ## Missing call: keep the sample's slot but mark it as NaN
            rec_dp.append(np.nan)
            rec_ar.append(np.nan)
            rec_gt.append(np.nan)

        else:
            allele = int(gt_call)
            depth = np.sum(call.data['AD'])
            rec_dp.append(depth)
            ## Reads supporting the called allele over total reads; the +1 keeps
            ## the denominator non-zero when no reads are recorded
            rec_ar.append(int(call.data['AD'][allele])/(depth+1))
            rec_gt.append(allele)

    DP.append(rec_dp)
    AR.append(rec_ar)
    GT.append(rec_gt)

In [6]:
## Gather the allelic read depth ratios
ads = AR

## Gather called genotypes
gts = GT

## Gather sample names from the first record rather than from a loop variable
## left over from an earlier cell; this assumes every record lists the same
## samples in the same order, as the sample columns of a VCF file do
sample_names = [call.sample for call in recs[0]]

In [7]:
## Assemble the per-record info table: one row per record, one column per field
info_labels = ['Chrom','Pos','Qual','Callrate',
               'Ref','Alt','Altlen','Dp','Type']
info_fields = [chrom,pos,qual,callrate,ref,
               alt,len_alt,dp,typegv]

## Each list is loaded as a row, so transpose to put the records along the rows
data_df = pd.DataFrame(info_fields).transpose()

## Set columns
data_df.columns = info_labels

In [8]:
## Per-sample allelic read depth ratios, one column per sample
tempads = pd.DataFrame(ads,columns=sample_names)

## Put the record info columns in front and fill missing values with NaN
data_ad = pd.concat([data_df,tempads],axis=1).fillna(value=np.nan)

In [9]:
## Save allelic read depth dataframe, creating the output directory first
## because to_csv does not create missing parent directories
ar_outdir = os.path.dirname(arsavepath)
if ar_outdir:
    os.makedirs(ar_outdir, exist_ok=True)
data_ad.to_csv(arsavepath)

In [10]:
## Per-sample genotype calls, one column per sample, with any literal '.'
## entry converted to NaN
tempgt = pd.DataFrame(gts,columns=sample_names).replace('.',np.nan)

In [11]:
## Find all the unique non-nan genotypes.
## Missing calls are masked cell by cell instead of dropping whole records, so
## genotypes that only occur in records with a missing call are still found,
## and a table where every record has a missing call yields an empty result
## rather than an error from concatenating nothing
genotype_values = tempgt.values.ravel()
unique_genotypes = np.unique(genotype_values[~pd.isna(genotype_values)])

In [12]:
## Replace each genotype with its float version
for genotype in unique_genotypes:
    as_float = float(int(genotype))
    tempgt = tempgt.replace(genotype,as_float)

In [13]:
## Record info columns followed by the per-sample genotypes, with any
## remaining '.' entry turned into NaN
data_gt = pd.concat([data_df,tempgt],axis=1).replace('.',np.nan)

In [14]:
## Save the genotype dataframe, creating the output directory first
## because to_csv does not create missing parent directories
gt_outdir = os.path.dirname(gtsavepath)
if gt_outdir:
    os.makedirs(gt_outdir, exist_ok=True)
data_gt.to_csv(gtsavepath)