In [1]:
## Bring in needed mods
import numpy as np, pandas as pd, vcfpy as vcf, os

In [2]:
## Configuration: input VCF and the pieces of the output file names
## Point VCF_FILE at another chromosome's VCF to process that one instead
VCF_FILE = '../DATA/GENOTYPE/CDX_illumina_vcf/DNX-Chr01-xl280genome-127.vcf.gz'
fspl = '.vcf.gz'
fear = '-ar-df.csv.gz'
fegt = '-gt-df.csv.gz'

## Genotype and allelic read depth ratio save paths.
## Both share one stem: each 'DNX' in the input path is prefixed with
## 'DFRAME/' and everything from the first '.vcf.gz' onwards is cut off
gtsavepath, arsavepath = [
    VCF_FILE.replace('DNX', 'DFRAME/DNX').split(fspl)[0] + suffix
    for suffix in (fegt, fear)]

In [3]:
## Use the vcf.Reader to bring the file into python
rdr = vcf.Reader.from_path(VCF_FILE)

## Collect every record the reader yields, then close the reader so the
## underlying file is released even if reading fails part-way through;
## nothing after this point reads from the file again
try:
    recs = np.array([rec for rec in rdr])
finally:
    rdr.close()

## Number of records read
len(recs)

67298

In [20]:
## Collect the per-record summary fields in a single pass over recs
chrom, pos, qual, callrate = [], [], [], []
ref, alt, len_alt, dps, typegv = [], [], [], [], []
for rec in recs:
    chrom.append(rec.CHROM)
    pos.append(rec.POS)
    qual.append(rec.QUAL)
    ## INFO 'AN' divided by the number of calls in the record
    callrate.append(rec.INFO['AN']/len(rec.calls))
    ref.append(rec.REF)
    alt.append(rec.ALT)
    ## Number of ALT entries of the record
    len_alt.append(len(rec.ALT))
    dps.append(rec.INFO['DP'])
    ## First entry of the INFO 'TYPE' field
    typegv.append(rec.INFO['TYPE'][0])

In [6]:
## Gather the allelic read depths: one list per record, one entry per
## call; a call whose 'GT' is None contributes None instead of its 'AD'
ads = [[seg.data['AD'] if seg.data['GT'] is not None else None
        for seg in rec] for rec in recs]

## Gather called genotypes: one list per record, one 'GT' per call
gts = [[seg.data['GT'] for seg in rec]
       for rec in recs]

In [None]:
## Sample names, read off the calls of the first record
sample_names = [call.sample for call in recs[0]]

In [21]:
## Make dataframe with info and allelic info.
## The frame is built column by column so every column keeps its own
## dtype; stacking the lists as rows and transposing would leave all
## columns as generic object dtype.
data_df = pd.DataFrame(
    {
        'Chrom': chrom,
        'Pos': pos,
        'Qual': qual,
        'Callrate': callrate,
        'Ref': ref,
        'Altlen': len_alt,
        'Dp': dps,
        ## '.'-joined type of every ALT entry of the record
        'Type': ['.'.join([a.type for a in s]) for s in alt],
        ## '.'-joined value of every ALT entry of the record
        'Alt': ['.'.join([a.value for a in s]) for s in alt],
    },
    ## Explicit column order, independent of dict ordering
    columns=['Chrom', 'Pos', 'Qual', 'Callrate', 'Ref',
             'Altlen', 'Dp', 'Type', 'Alt'])

data_df.head()

Unnamed: 0,Chrom,Pos,Qual,Callrate,Ref,Altlen,Dp,Type,Alt
0,Chr01,1,1532.58,0.299213,C,1,61,SNV,G
1,Chr01,6,7720.89,0.440945,C,1,273,SNV,T
2,Chr01,14,84027.3,0.527559,C,1,2905,SNV,T
3,Chr01,28,0.0,0.527559,A,1,3600,SNV,T
4,Chr01,152,33410.0,0.480315,ATAGCCGAATGGATGAGTCGAGTGTGACGGGCT,1,1486,MNV,CTAGCCTAGTGGATCAGTCGGGCGTGATGGGAT


In [8]:
## Allelic read depth table: one column per sample, placed to the right
## of the record info, with missing entries filled in as NaN
tempads = pd.DataFrame(data=ads, columns=sample_names)
data_ad = pd.concat([data_df, tempads], axis=1).fillna(value=np.nan)

In [9]:
## Round the sample columns to 3 decimals, one row at a time
data_ad[sample_names] = data_ad[sample_names].apply(
    lambda row: np.round(row, 3), axis=1)

In [10]:
## Save allelic read depth dataframe.
## Make sure the directory of the save path exists first, so the write
## does not fail on a missing folder after all the work above
os.makedirs(os.path.dirname(arsavepath) or '.', exist_ok=True)
data_ad.to_csv(arsavepath)

In [11]:
## Genotype table: one column per sample, '.' entries turned into NaN
tempgt = pd.DataFrame(data=gts, columns=sample_names).replace('.', np.nan)

In [12]:
## Find all the unique non-nan genotypes.
## Missing entries are removed value by value after flattening the
## table: dropping whole rows with tempgt.dropna() would ignore every
## record that has at least one missing call (so a genotype seen only
## in such records would be overlooked) and would make np.concatenate
## fail when no record is fully called.
unique_genotypes = np.unique(
                    pd.Series(tempgt.values.ravel()
                              ).dropna().values)

In [13]:
## Swap each genotype for its float version, e.g. '1' -> 1.0
for genotype in unique_genotypes:
    tempgt.replace(to_replace=genotype,
                   value=float(int(genotype)),
                   inplace=True)

In [14]:
## Put the record info and the genotype columns side by side, then turn
## any '.' entry left anywhere in the combined table into NaN
data_gt = pd.concat([data_df, tempgt], axis=1).replace('.', np.nan)

In [15]:
## Check shape of data: (rows, columns) of the combined genotype table
data_gt.shape

(67298, 136)

In [16]:
## Save the genotype dataframe.
## Make sure the directory of the save path exists first, so the write
## does not fail on a missing folder after all the work above
os.makedirs(os.path.dirname(gtsavepath) or '.', exist_ok=True)
data_gt.to_csv(gtsavepath)