In [1]:
import gzip
import os
import re
import sys

import numpy as np
import pandas as pd

# Show every column and every row when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Never truncate cell contents. None is the supported "no limit" value; the
# legacy -1 is deprecated, so it is only used as a fallback for pandas
# releases whose option validator still insists on an integer.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [2]:
# NOTE(review): absolute, machine-specific path; no cell visible in this part
# of the notebook reads `folder` -- confirm it is still needed.
folder = "/home/laura/ANALYSIS/VARIANT_CALLING/MTB_ANC_2020/VCF/"



In [3]:
def import_VCF_to_pandas(vcf_file):
    """Load an uncompressed, tab-separated VCF file into a DataFrame.

    The leading ``##`` meta-information lines are skipped and the first line
    after them (normally ``#CHROM ...``) supplies the column names. ``REF``
    and ``ALT`` are upper-cased. When no column is named ``INFO`` the last
    column is renamed to ``INFO``.

    Parameters
    ----------
    vcf_file : str
        Path to the VCF file.

    Returns
    -------
    pandas.DataFrame
        One row per record of the file.

    Raises
    ------
    SystemExit
        With status 1, after printing a message, when the first line of the
        file does not start with ``##``.
    """
    with open(vcf_file) as handle:
        first_line = handle.readline().strip()
        if not first_line.startswith('##'):
            print("This vcf file is not properly formatted")
            sys.exit(1)

        # Count the whole run of leading "##" lines (the first one included)
        # so that read_csv can skip straight to the column-header line.
        meta_lines = 1
        for line in handle:
            if not line.startswith("##"):
                break
            meta_lines += 1

    df = pd.read_csv(vcf_file, sep='\t', skiprows=meta_lines, header=0)

    df['ALT'] = df['ALT'].str.upper()
    df['REF'] = df['REF'].str.upper()

    # Guarantee an INFO column: fall back to renaming the last column.
    if 'INFO' not in df.columns:
        df = df.rename(columns={df.columns[-1]: 'INFO'})
    return df

In [53]:
#import_VCF42_to_pandas("/home/laura/ANALYSIS/VARIANT_CALLING/MTB_ANC_2020/VCF/10082989-0-COL3.combined.hf.SNP.final.vcf").head()

In [85]:
def handle_polymorphism(df):
    """Reduce multi-allelic rows, in place, to their best-supported ALT allele.

    A row is treated as multi-allelic when ``len_AD`` is greater than 2. For
    such a row the comma-separated ``AD`` string is read as
    ``<first>,<alt1>,<alt2>,...``; the alternate with the highest integer
    count is kept (the first one on ties). ``AD`` becomes
    ``<first>,<best alt>``, ``len_AD`` is reset to 2, and the comma-separated
    values in ``ALT``, ``LEN``, ``TYPE``, ``ALT_QUAL`` and ``ALT_DP`` are
    replaced by the entry at the same position as the kept alternate.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``AD``, ``len_AD`` and the five columns listed above,
        all holding comma-separated strings on the affected rows.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    per_allele_columns = ['ALT', 'LEN', 'TYPE', 'ALT_QUAL', 'ALT_DP']

    # Labels are collected up front, so edits made below cannot affect the walk.
    multiallelic_labels = df[df.len_AD > 2].index

    for label in multiallelic_labels:
        depth_tokens = df.loc[label, 'AD'].split(",")
        alt_depths = [int(token) for token in depth_tokens[1:]]
        top_depth = max(alt_depths)
        # Position of the winning alternate among the ALT alleles (first on ties).
        top_position = alt_depths.index(top_depth)

        df.loc[label, 'len_AD'] = 2  # the row now describes a single alternate
        df.loc[label, 'AD'] = depth_tokens[0] + ',' + str(top_depth)

        for column in per_allele_columns:
            values = df.loc[label, column].split(",")
            df.loc[label, column] = values[top_position]

def import_VCF42_to_pandas(vcf_file, sep='\t'):
    """Load a single-sample VCF v4.2 file (plain text or ``.gz``) into a DataFrame.

    Processing steps:

    - the leading ``##`` meta lines are skipped and the following line is
      used as the column header; the last column is renamed to ``sample``;
    - every ``key=value`` entry of ``INFO`` and every ``FORMAT``/``sample``
      pair becomes its own column (``FORMAT`` values win when a key occurs in
      both); ``INFO`` flags, which carry no ``=``, are stored as ``True``;
    - ``RO``/``AO``/``QR``/``QA`` are renamed to
      ``REF_DP``/``ALT_DP``/``REF_QUAL``/``ALT_QUAL``;
    - rows with more than one alternate allele are reduced to the alternate
      with the highest ``AD`` count by ``handle_polymorphism``;
    - selected columns are cast to float / int / str;
    - ``REF_FREQ`` and ``ALT_FREQ`` are computed as ``REF_DP / DP`` and
      ``ALT_DP / DP``;
    - rows are sorted by ``POS``.

    Parameters
    ----------
    vcf_file : str
        Path to the VCF file; a name ending in ``.gz`` is read as gzip.
    sep : str, default '\\t'
        Field separator passed to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The fixed set of columns listed in the ``return`` statement.

    Raises
    ------
    SystemExit
        With status 1, after printing a message, when the first line of the
        file does not end with ``VCFv4.2``.
    KeyError
        When the file lacks one of the fields this function relies on
        (``AD`` or any column in the returned selection).
    """
    is_gzipped = vcf_file.endswith(".gz")

    if is_gzipped:
        source = gzip.open(vcf_file, 'rt', encoding='utf-8')
    else:
        source = open(vcf_file, 'r')

    with source as handle:
        first_line = handle.readline().strip()
        # Count the whole run of leading "##" lines (the first line included)
        # so that read_csv can skip directly to the column-header line.
        meta_lines = 1
        for line in handle:
            if not line.startswith("##"):
                break
            meta_lines += 1

    if not first_line.endswith('VCFv4.2'):
        print("This vcf file is not v4.2")
        sys.exit(1)

    dataframe = pd.read_csv(
        vcf_file,
        sep=sep,
        skiprows=meta_lines,
        header=0,
        compression='gzip' if is_gzipped else 'infer',
    )

    # The last column holds the per-sample values; give it a stable name.
    dataframe = dataframe.rename(columns={dataframe.columns[-1]: 'sample'})

    # Expand INFO and FORMAT/sample into one dict per record, then build all
    # new columns at once instead of growing the frame cell by cell.
    records = []
    for info, fmt, sample_values in zip(dataframe['INFO'], dataframe['FORMAT'], dataframe['sample']):
        record = {}
        for entry in info.split(';'):
            key, has_value, value = entry.partition('=')
            # Flag entries have no "=": record their presence as True.
            record[key] = value if has_value else True
        # FORMAT values are applied last so they override same-named INFO keys.
        record.update(zip(fmt.split(":"), sample_values.split(":")))
        records.append(record)

    expanded = pd.DataFrame(records, index=dataframe.index)
    # Parsed fields take precedence over any pre-existing column of the same name.
    clashing = [column for column in dataframe.columns if column in expanded.columns]
    dataframe = pd.concat([dataframe.drop(columns=clashing), expanded], axis=1)

    dataframe = dataframe.rename(
        columns={'RO': 'REF_DP', 'AO': 'ALT_DP', 'QR': 'REF_QUAL', 'QA': 'ALT_QUAL'}
    )

    dataframe['len_AD'] = dataframe['AD'].str.split(",").str.len()

    # this step remove false snps from cohort calling and reset index
    #dataframe = dataframe[dataframe.ALT_AD > 0].reset_index(drop=True)

    handle_polymorphism(dataframe)  # leave only the most supported alternate

    to_float = ['QUAL', 'AN', 'DP', 'FS', 'QD', 'SOR', 'GQ', 'REF_QUAL', 'ALT_QUAL']
    to_int = ['POS', 'REF_DP', 'ALT_DP', 'len_AD', 'gt0', 'gt1']
    to_str = ['#CHROM', 'REF', 'ALT', 'FILTER']

    # The three lists are disjoint, so one astype call covers them; columns
    # that are absent from this file are simply left out.
    casts = {}
    for names, target in ((to_float, float), (to_int, int), (to_str, str)):
        for name in names:
            if name in dataframe.columns:
                casts[name] = target
    dataframe = dataframe.astype(casts)

    # Frequencies are expressed relative to the total depth DP.
    dataframe['REF_FREQ'] = dataframe['REF_DP'] / dataframe['DP']
    dataframe['ALT_FREQ'] = dataframe['ALT_DP'] / dataframe['DP']

    dataframe = dataframe.sort_values(by=['POS']).reset_index(drop=True)

    return dataframe[['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'AC', 'AF', 'AN', 'DP', 'DPB', 'LEN',
       'NUMALT', 'ODDS', 'TYPE', 'GT', 'AD', 'len_AD', 'REF_DP', 'ALT_DP', 'REF_QUAL', 'ALT_QUAL', 'REF_FREQ', 'ALT_FREQ']]

In [86]:
# Parse the variant file into `dffree`, the frame inspected by the cells below.
# NOTE(review): absolute, machine-specific input path.
dffree = import_VCF42_to_pandas("/home/laura/ANALYSIS/provaryota/Variants/var2.vcf")

In [87]:
# Preview the first ten parsed records.
dffree.head(10)

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,AC,AF,AN,DP,DPB,LEN,NUMALT,ODDS,TYPE,GT,AD,len_AD,REF_DP,ALT_DP,REF_QUAL,ALT_QUAL,REF_FREQ,ALT_FREQ
0,MTB_anc,1701,.,T,C,3441.6,.,2,1,2.0,115.0,115,1,1,162.643,snp,1/1,114,2,0,114,0.0,3896.0,0.0,0.991304
1,MTB_anc,1962,.,T,G,0.0,.,0,0,2.0,54.0,54,1,1,64.5762,snp,0/0,513,2,51,3,1508.0,41.0,0.944444,0.055556
2,MTB_anc,1969,.,T,G,0.0,.,0,0,2.0,55.0,55,1,1,57.6,snp,0/0,505,2,50,5,1534.0,66.0,0.909091,0.090909
3,MTB_anc,1970,.,CA,CC,4.29474e-15,.,0,0,2.0,56.0,56,1,1,63.2546,snp,0/0,524,2,52,4,1628.0,55.0,0.928571,0.071429
4,MTB_anc,1976,.,C,G,0.0,.,0,0,2.0,54.0,54,1,1,59.9927,snp,0/0,504,2,50,4,1527.0,58.0,0.925926,0.074074
5,MTB_anc,1978,.,T,G,5.30404e-15,.,0,0,2.0,54.0,54,1,1,60.0669,snp,0/0,495,2,49,5,1550.0,70.0,0.907407,0.092593
6,MTB_anc,2009,.,C,G,0.0,.,0,0,2.0,57.0,57,1,1,68.8316,snp,0/0,534,2,53,4,1720.0,48.0,0.929825,0.070175
7,MTB_anc,2022,.,A,G,2.96102e-15,.,0,0,2.0,53.0,53,1,1,60.8961,snp,0/0,485,2,48,5,1595.0,60.0,0.90566,0.09434
8,MTB_anc,2040,.,T,G,0.0,.,0,0,2.0,48.0,48,1,1,56.4241,snp,0/0,444,2,44,4,1567.0,51.0,0.916667,0.083333
9,MTB_anc,2532,.,C,T,1211.59,.,2,1,2.0,40.0,40,1,1,60.0569,snp,1/1,40,2,0,40,0.0,1416.0,0.0,1.0


In [65]:
# List the column names of the parsed frame.
# NOTE(review): the captured output is from execution count 65, before the
# In [86] load; it lists columns (e.g. INFO, FORMAT, REF_AD, gt0) that the
# current import_VCF42_to_pandas does not return.
dffree.columns

Index(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
       'sample', 'AB', 'ABP', 'AC', 'AF', 'AN', 'AO', 'CIGAR', 'DP', 'DPB',
       'DPRA', 'EPP', 'EPPR', 'GTI', 'LEN', 'MEANALT', 'MQM', 'MQMR', 'NS',
       'NUMALT', 'ODDS', 'PAIRED', 'PAIREDR', 'PAO', 'PQA', 'PQR', 'PRO', 'QA',
       'QR', 'RO', 'RPL', 'RPP', 'RPPR', 'RPR', 'RUN', 'SAF', 'SAP', 'SAR',
       'SRF', 'SRP', 'SRR', 'TYPE', 'technology.ILLUMINA', 'GT', 'AD', 'GL',
       'len_AD', 'REF_AD', 'ALT_AD', 'gt0', 'gt1', 'dp', 'aF'],
      dtype='object')

In [52]:
# Check whether an 'NS' column is present.
# NOTE(review): captured at execution count 52; 'NS' is not among the columns
# returned by the current import_VCF42_to_pandas, so a fresh run would differ.
'NS' in dffree.columns

True

In [54]:
# Inspect the record at position 32188 (shown in the output with two ALT alleles).
# NOTE(review): captured at execution count 54, before the In [86] load; the
# displayed columns differ from what the current function returns.
dffree[dffree.POS == 32188]

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,sample,AB,ABP,AC,AF,AN,AO,CIGAR,DP,DPB,DPRA,EPP,EPPR,GTI,LEN,MEANALT,MQM,MQMR,NS,NUMALT,ODDS,PAIRED,PAIREDR,PAO,PQA,PQR,PRO,QA,QR,RO,RPL,RPP,RPPR,RPR,RUN,SAF,SAP,SAR,SRF,SRP,SRR,TYPE,technology.ILLUMINA,GT,AD,GL,len_AD,REF_AD,ALT_AD,gt0,gt1,dp,aF
65,MTB_anc,32188,.,G,"C,T",8.27989e-15,.,"AB=0,0;ABP=0,0;AC=0,0;AF=0,0;AN=2;AO=2,2;CIGAR=1X,1X;DP=33;DPB=33;DPRA=0,0;EPP=3.0103,7.35324;EPPR=3.08518;GTI=0;LEN=1,1;MEANALT=2,2;MQM=60,60;MQMR=60;NS=1;NUMALT=2;ODDS=37.9997;PAIRED=1,1;PAIREDR=1;PAO=0,0;PQA=0,0;PQR=0;PRO=0;QA=30,31;QR=931;RO=29;RPL=1,0;RPP=3.0103,7.35324;RPPR=3.68421;RPR=1,2;RUN=1,1;SAF=0,0;SAP=7.35324,7.35324;SAR=2,2;SRF=11;SRP=6.67934;SRR=18;TYPE=snp,snp;technology.ILLUMINA=1,1",GT:DP:AD:RO:QR:AO:QA:GL,"0/0:33:29,2,2:29:931:2,2:30,31:0,-6.63445,-81.2017,-6.53946,-79.6265,-81.1114",0,0,0,0,2.0,22,"1X,1X",33.0,33,0,"3.0103,7.35324",3.08518,0,11,22,6060,60,1,2,37.9997,11,1,0,0,0,0,3031,931,29,10,"3.0103,7.35324",3.68421,12,11,0,"7.35324,7.35324",22,11,6.67934,18,INDEL,11,0/0,2922,"0,-6.63445,-81.2017,-6.53946,-79.6265,-81.1114",3,29.0,2.0,0,0,31.0,0.935484


In [61]:
# Count records per value of TYPE.
# NOTE(review): captured at execution count 61, before the In [86] load.
dffree.TYPE.value_counts()

snp        890
complex    32 
del        9  
ins        7  
Name: TYPE, dtype: int64

In [78]:
# Show the records whose TYPE is 'complex'.
# NOTE(review): captured at execution count 78, before the In [86] load; the
# output lacks the REF_FREQ / ALT_FREQ columns the current function returns.
dffree[dffree.TYPE == 'complex']

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,AC,AF,AN,DP,DPB,LEN,NUMALT,ODDS,TYPE,GT,AD,len_AD,REF_DP,ALT_DP,REF_QUAL,ALT_QUAL
43,MTB_anc,20285,.,TGGGC,GGGGG,0.000622206,.,1,0.5,2,11,11.0,5,1,8.85072,complex,0/1,92,2,9,2,335,51
61,MTB_anc,32167,.,TGGC,GGGG,1.3388e-14,.,0,0,2,34,34.0,4,1,40.629,complex,0/0,312,2,31,2,940,37
111,MTB_anc,58973,.,TCCCG,GCCCC,4.10615e-08,.,0,0,2,20,20.8,5,1,18.4767,complex,0/0,152,2,15,2,400,43
160,MTB_anc,81389,.,GA,TT,3.84057e-07,.,0,0,2,15,15.0,2,1,16.241,complex,0/0,112,2,11,2,407,30
260,MTB_anc,145183,.,ACCA,CCCC,3.53333e-14,.,0,00,2,35,35.0,4,2,34.1707,complex,0/0,283,2,28,3,987,54
267,MTB_anc,145450,.,ATTG,GTTT,2.85637e-13,.,0,00,2,31,31.0,4,2,31.0102,complex,0/0,242,2,24,2,721,38
300,MTB_anc,174677,.,ACCAA,CCCAC,3.58141e-06,.,0,00,2,18,18.0,5,2,14.0446,complex,0/0,122,2,12,2,404,46
308,MTB_anc,175919,.,TCGGA,GCGGG,0.00013708,.,10,"0.5,0",2,15,15.0,5,2,10.3743,complex,0/1,103,2,10,3,349,51
309,MTB_anc,175930,.,TTTTC,GTTTT,1.12153e-07,.,0,0,2,22,22.8,5,1,17.472,complex,0/0,183,2,18,3,656,63
319,MTB_anc,177896,.,GGGGC,TGGGG,0.000126613,.,1,0.5,2,12,12.0,5,1,10.4429,complex,0/1,92,2,9,2,329,46


In [82]:
# Summarise dtypes and non-null counts of the parsed frame.
# NOTE(review): captured at execution count 82 and reports 23 columns, whereas
# the current import_VCF42_to_pandas returns 25 (it adds REF_FREQ and ALT_FREQ).
dffree.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 938 entries, 0 to 937
Data columns (total 23 columns):
#CHROM      938 non-null object
POS         938 non-null int64
ID          938 non-null object
REF         938 non-null object
ALT         938 non-null object
QUAL        938 non-null float64
FILTER      938 non-null object
AC          938 non-null object
AF          938 non-null object
AN          938 non-null float64
DP          938 non-null float64
DPB         938 non-null object
LEN         938 non-null object
NUMALT      938 non-null object
ODDS        938 non-null object
TYPE        938 non-null object
GT          938 non-null object
AD          938 non-null object
len_AD      938 non-null int64
REF_DP      938 non-null int64
ALT_DP      938 non-null int64
REF_QUAL    938 non-null float64
ALT_QUAL    938 non-null float64
dtypes: float64(5), int64(4), object(14)
memory usage: 168.7+ KB
