In [13]:
import os
import pandas as pd
import numpy as np
import gzip
import re


import matplotlib.pyplot as plt
import matplotlib.cm as cm


# Lift pandas' display limits so frames are rendered without truncating
# columns, rows, or cell contents.
for _display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

In [7]:
# Directory that is walked recursively for VCF files later in this notebook.
folder = '../data'

In [32]:
def _parse_vcf_info(info):
    """Split a VCF INFO string (``KEY=VALUE;KEY=VALUE``) into a dict of strings."""
    parsed = {}
    for token in str(info).split(';'):
        key, has_value, value = token.partition('=')
        # Flag entries (no '=') and the '.' placeholder carry no value: skip them.
        if has_value and key:
            parsed[key] = value
    return parsed


def import_VCF41_to_pandas(vcf_file, sep='\t'):
    """
    Read a single-sample VCF v4.1 file into a pandas DataFrame.

    The returned frame holds the standard VCF columns as read from the file,
    with the last (sample) column renamed to 'sample', plus one extra column
    per INFO key and one per FORMAT key. When an INFO key and a FORMAT key
    share a name, the FORMAT value wins.

    - a trailing '%' is stripped from the FREQ column when it is present
    - the count/quality columns listed in ``to_float`` are cast to float
    - every other expanded column keeps its original string value (e.g. GT)

    No allele-depth splitting, polymorphism filtering or variant typing is
    performed here.

    Parameters
    ----------
    vcf_file : str
        Path to the VCF file. A name ending in ".gz" is read as gzip.
    sep : str, default '\\t'
        Column separator of the variant table.

    Returns
    -------
    pandas.DataFrame
        One row per variant record.

    Raises
    ------
    ValueError
        If the first line of the file does not end with 'VCFv4.1'.
    """
    is_gzip = vcf_file.endswith(".gz")
    opener = gzip.open if is_gzip else open

    # Count the leading "##" meta lines so that read_csv can start directly
    # at the "#CHROM" column-header row. The first line is always counted,
    # because it is validated separately below.
    with opener(vcf_file, 'rt', encoding='utf-8') as handle:
        first_line = handle.readline().strip()
        meta_lines = 1
        for line in handle:
            if not line.strip().startswith("##"):
                break
            meta_lines += 1

    if not first_line.endswith('VCFv4.1'):
        # Raise instead of exiting: terminating the interpreter from inside a
        # library-style function would take the whole notebook kernel with it.
        raise ValueError(
            "This vcf file is not v4.1 (first line: {!r})".format(first_line)
        )

    dataframe = pd.read_csv(
        vcf_file,
        compression='gzip' if is_gzip else None,
        sep=sep,
        skiprows=meta_lines,
        header=0,
    )

    # The last column is named after the sample; give it a stable name.
    dataframe = dataframe.rename(columns={dataframe.columns[-1]: 'sample'})

    # Expand INFO and FORMAT/sample into one dict per row, then build all new
    # columns in a single pass instead of assigning cell by cell.
    records = [
        {**_parse_vcf_info(info), **dict(zip(fmt.split(":"), values.split(":")))}
        for info, fmt, values in zip(
            dataframe['INFO'], dataframe['FORMAT'], dataframe['sample']
        )
    ]
    expanded = pd.DataFrame(records, index=dataframe.index)
    for column in expanded.columns:
        dataframe[column] = expanded[column]

    # FREQ is written as a percentage string such as "3.88%".
    if 'FREQ' in dataframe.columns:
        dataframe['FREQ'] = dataframe['FREQ'].str.replace('%', '', regex=False)

    to_float = ['ADP', 'WT', 'HET', 'HOM', 'NC', 'GQ', 'SDP', 'DP',
                'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR']

    for column in to_float:
        if column in dataframe.columns:
            dataframe[column] = dataframe[column].astype(float)

    return dataframe

In [33]:
# Walk `folder` recursively and print the path of every file whose name
# contains 'vcf'.
for root, _, files in os.walk(folder):
    for name in files:
        if 'vcf' not in name:
            continue
        filename = os.path.join(root, name)
        print(filename)

../data/201334.lowfreq.vcf.gz
../data/201277.lowfreq.vcf.gz


In [34]:
# Build the path from `folder` so the data location is configured in one place.
df = import_VCF41_to_pandas(os.path.join(folder, '201334.lowfreq.vcf.gz'))

In [35]:
# Show the column labels of the imported frame.
df.columns

Index(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
       'sample', 'ADP', 'WT', 'HET', 'HOM', 'NC', 'GT', 'GQ', 'SDP', 'DP',
       'RD', 'AD', 'FREQ', 'PVAL', 'RBQ', 'ABQ', 'RDF', 'RDR', 'ADF', 'ADR'],
      dtype='object')

In [36]:
# Preview only the first rows: display.max_rows is unlimited in this notebook,
# so a bare `df` would render every record.
df.head(10)

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,sample,ADP,WT,HET,HOM,NC,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RBQ,ABQ,RDF,RDR,ADF,ADR
0,NC_045512.2,49,.,T,G,.,PASS,ADP=129;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,0/1:0:151:129:123:5:3.88%:9.8E-1:36:19:0:123:0:5,129.0,0.0,1.0,0.0,0.0,0/1,0.0,151.0,129.0,123.0,5.0,3.88,0.98,36.0,19.0,0.0,123.0,0.0,5.0
1,NC_045512.2,394,.,T,G,.,PASS,ADP=47;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,0/1:0:48:47:44:2:4.26%:9.8E-1:37:38:34:10:0:2,47.0,0.0,1.0,0.0,0.0,0/1,0.0,48.0,47.0,44.0,2.0,4.26,0.98,37.0,38.0,34.0,10.0,0.0,2.0
2,NC_045512.2,411,.,G,A,.,PASS,ADP=48;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,0/1:0:51:48:46:2:4.17%:9.8E-1:37:24:35:11:2:0,48.0,0.0,1.0,0.0,0.0,0/1,0.0,51.0,48.0,46.0,2.0,4.17,0.98,37.0,24.0,35.0,11.0,2.0,0.0
3,NC_045512.2,424,.,A,G,.,PASS,ADP=52;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,0/1:0:53:52:49:3:5.77%:9.8E-1:36:36:35:14:2:1,52.0,0.0,1.0,0.0,0.0,0/1,0.0,53.0,52.0,49.0,3.0,5.77,0.98,36.0,36.0,35.0,14.0,2.0,1.0
4,NC_045512.2,456,.,T,C,.,PASS,ADP=194;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,0/1:0:262:194:190:4:2.06%:9.8E-1:48:47:157:33:3:1,194.0,0.0,1.0,0.0,0.0,0/1,0.0,262.0,194.0,190.0,4.0,2.06,0.98,48.0,47.0,157.0,33.0,3.0,1.0
5,NC_045512.2,517,.,T,C,.,PASS,ADP=201;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,0/1:0:314:201:195:5:2.5%:9.8E-1:56:58:149:46:4:1,201.0,0.0,1.0,0.0,0.0,0/1,0.0,314.0,201.0,195.0,5.0,2.5,0.98,56.0,58.0,149.0,46.0,4.0,1.0
6,NC_045512.2,627,.,T,C,.,PASS,ADP=212;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,0/1:0:311:212:206:6:2.83%:9.8E-1:51:73:126:80:6:0,212.0,0.0,1.0,0.0,0.0,0/1,0.0,311.0,212.0,206.0,6.0,2.83,0.98,51.0,73.0,126.0,80.0,6.0,0.0
7,NC_045512.2,1875,.,C,T,.,PASS,ADP=1221;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,0/1:0:1287:1221:485:697:57.08%:9.8E-1:37:29:54:431:299:398,1221.0,0.0,1.0,0.0,0.0,0/1,0.0,1287.0,1221.0,485.0,697.0,57.08,0.98,37.0,29.0,54.0,431.0,299.0,398.0
8,NC_045512.2,2750,.,A,G,.,PASS,ADP=241;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,0/1:0:410:241:234:6:2.49%:9.8E-1:59:62:187:47:5:1,241.0,0.0,1.0,0.0,0.0,0/1,0.0,410.0,241.0,234.0,6.0,2.49,0.98,59.0,62.0,187.0,47.0,5.0,1.0
9,NC_045512.2,2756,.,A,T,.,PASS,ADP=237;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,0/1:0:403:237:231:5:2.11%:9.8E-1:61:56:180:51:3:2,237.0,0.0,1.0,0.0,0.0,0/1,0.0,403.0,237.0,231.0,5.0,2.11,0.98,61.0,56.0,180.0,51.0,3.0,2.0


In [37]:
# Print the per-column non-null counts and dtypes of the imported frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Data columns (total 29 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   #CHROM  110 non-null    object 
 1   POS     110 non-null    int64  
 2   ID      110 non-null    object 
 3   REF     110 non-null    object 
 4   ALT     110 non-null    object 
 5   QUAL    110 non-null    object 
 6   FILTER  110 non-null    object 
 7   INFO    110 non-null    object 
 8   FORMAT  110 non-null    object 
 9   sample  110 non-null    object 
 10  ADP     110 non-null    float64
 11  WT      110 non-null    float64
 12  HET     110 non-null    float64
 13  HOM     110 non-null    float64
 14  NC      110 non-null    float64
 15  GT      110 non-null    object 
 16  GQ      110 non-null    float64
 17  SDP     110 non-null    float64
 18  DP      110 non-null    float64
 19  RD      110 non-null    float64
 20  AD      110 non-null    float64
 21  FREQ    110 non-null    float64
 22  PV

In [39]:
# (number of rows, number of columns) of the imported frame.
df.shape

(110, 29)

In [42]:
# Keep only the rows whose FREQ value is above 50.
df.loc[df['FREQ'] > 50]

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,sample,ADP,WT,HET,HOM,NC,GT,GQ,SDP,DP,RD,AD,FREQ,PVAL,RBQ,ABQ,RDF,RDR,ADF,ADR
7,NC_045512.2,1875,.,C,T,.,PASS,ADP=1221;WT=0;HET=1;HOM=0;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,0/1:0:1287:1221:485:697:57.08%:9.8E-1:37:29:54:431:299:398,1221.0,0.0,1.0,0.0,0.0,0/1,0.0,1287.0,1221.0,485.0,697.0,57.08,0.98,37.0,29.0,54.0,431.0,299.0,398.0
44,NC_045512.2,9477,.,T,A,.,PASS,ADP=721;WT=0;HET=0;HOM=1;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,1/1:0:1132:721:1:717:99.45%:9.8E-1:70:57:1:0:500:217,721.0,0.0,0.0,1.0,0.0,1/1,0.0,1132.0,721.0,1.0,717.0,99.45,0.98,70.0,57.0,1.0,0.0,500.0,217.0
56,NC_045512.2,14805,.,C,T,.,PASS,ADP=5748;WT=0;HET=0;HOM=1;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,1/1:0:10089:5748:7:5720:99.74%:9.8E-1:67:63:6:1:4419:1301,5748.0,0.0,0.0,1.0,0.0,1/1,0.0,10089.0,5748.0,7.0,5720.0,99.74,0.98,67.0,63.0,6.0,1.0,4419.0,1301.0
91,NC_045512.2,25979,.,G,T,.,PASS,ADP=1663;WT=0;HET=0;HOM=1;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,1/1:0:1701:1663:7:1649:99.16%:9.8E-1:37:38:7:0:1636:13,1663.0,0.0,0.0,1.0,0.0,1/1,0.0,1701.0,1663.0,7.0,1649.0,99.16,0.98,37.0,38.0,7.0,0.0,1636.0,13.0
99,NC_045512.2,28144,.,T,C,.,PASS,ADP=14686;WT=0;HET=0;HOM=1;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,1/1:0:15024:14686:13:14665:99.86%:9.8E-1:37:38:11:2:13871:794,14686.0,0.0,0.0,1.0,0.0,1/1,0.0,15024.0,14686.0,13.0,14665.0,99.86,0.98,37.0,38.0,11.0,2.0,13871.0,794.0
100,NC_045512.2,28657,.,C,T,.,PASS,ADP=1404;WT=0;HET=0;HOM=1;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,1/1:0:2112:1404:5:1397:99.5%:9.8E-1:67:54:4:1:1073:324,1404.0,0.0,0.0,1.0,0.0,1/1,0.0,2112.0,1404.0,5.0,1397.0,99.5,0.98,67.0,54.0,4.0,1.0,1073.0,324.0
101,NC_045512.2,28863,.,C,T,.,PASS,ADP=636;WT=0;HET=0;HOM=1;NC=0,GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR,1/1:0:768:636:2:631:99.37%:9.8E-1:52:44:2:0:183:448,636.0,0.0,0.0,1.0,0.0,1/1,0.0,768.0,636.0,2.0,631.0,99.37,0.98,52.0,44.0,2.0,0.0,183.0,448.0
