# A file for pre-processing variant data in Variant Call Format (VCF)

### TODO: consider marking presence of heterozygosity as a binary feature. Could be useful for labeling data in visualizations

## Total Runtime: ~5 minutes

In [61]:
import numpy as np
import pandas as pd

In [62]:
def parse_attributes(attributes):
    '''
    Yield a (key, value) tuple for every "key=value" string in `attributes`.

    Entries that contain no "=" (flag-style entries) are skipped. Only the
    first "=" separates the key from the value, so a value that itself
    contains "=" is kept whole instead of raising ValueError.
    '''
    for attr in attributes:
        key, sep, val = attr.partition('=')
        # `sep` is empty when the entry has no "=" at all
        if sep:
            yield (key, val)

def get_info_field_names(df):
    '''
    VCF records contain an INFO column with several fields (e.g. strand bias)
    separated by semicolons. This function returns the set of "key=value"
    field names found in the INFO string of the FIRST record only; fields
    that appear only in later records are not discovered. Flag-style entries
    without "=" are skipped.

    Returns an empty set when `df` has no rows.
    '''
    if len(df) == 0:
        return set()

    # Positional access: label 0 may be absent if the frame was filtered or re-indexed
    first_info = df['INFO'].iloc[0]

    return {
        key
        for key, sep, _ in (attr.partition('=') for attr in first_info.split(';'))
        if sep
    }

In [63]:
# Path of the tab-separated variant file to pre-process
filename = 'data/VQSRfilter/NA12878.LowSeq.illumina.bwa.sorted.dedup.20.sam.wFlag.qual.raw.snps.vqsr.recal copy.vcf'
df = pd.read_csv(filename, sep='\t')

# INFO fields that are not numeric are left out for now (consider accounting for these later on)
ignore_info_fields = {'DB', 'POSITIVE_TRAIN_SITE', 'NEGATIVE_TRAIN_SITE', 'culprit'}
info_field_names = get_info_field_names(df) - ignore_info_fields

# Output columns: every input column except the raw INFO string, one column per
# retained INFO field, and the two binary labels filled in further below
cols = set(df.columns)
cols.discard('INFO')
cols.update(info_field_names)
cols.add('PASSED_VQSR')   # binary label indicating if variant passed VQSR filter
cols.add('GROUND_TRUTH')  # binary label indicating if the variant is true according to ground truth

## Copy the features into a new data frame

In [64]:
# Start from an all-zero float table with one column per name in `cols`;
# an INFO field that is absent from a record therefore stays 0.0
num_rows = df.shape[0]
features = pd.DataFrame(np.zeros(shape=(num_rows, len(cols))), columns=list(cols))

# Copy over all columns except for INFO and FILTER (i.e. the annotations and the VQSR filter result)
for col in df.columns:
    if col in cols and col not in ('INFO', 'FILTER'):
        features[col] = df[col]

# Expand each record's INFO string into the per-field columns.
# DataFrame.set_value was removed in pandas 1.0, so cells are written through `.at`.
for index, info in df['INFO'].items():
    for (key, val) in parse_attributes(info.split(';')):
        if key in cols:
            # TODO: This is NOT the correct/robust way to handle this case. This is the case where the
            # field holds several comma-separated values (multiple calls at the location). For now,
            # just take the first value for simplicity
            if ',' in val:
                val = val.split(',')[0]
            # Convert explicitly so the column stays float instead of receiving a string
            features.at[index, key] = float(val)

# 1.0 where the FILTER column reads 'PASS', 0.0 otherwise
features['PASSED_VQSR'] = (df['FILTER'] == 'PASS').astype(float)

# Obtain Ground Truth Labels

### Note: You must preprocess the ground truth file before running this. This ensures that only variants on chromosome 20 are present in the ground truth file.

In [65]:
# Build a lookup from ground truth position to its (REF, ALT) pair
gt_file = 'data/preprocessed/ground_truth_chrom_20.txt'
gt_df = pd.read_csv(gt_file, sep='\t')

# When several rows share a POS the last one wins, as with repeated dict assignment
gnd_truth_dict = dict(zip(gt_df['POS'], zip(gt_df['REF'], gt_df['ALT'])))
       

In [66]:
# Display the (rows, columns) shape of the ground truth table
gt_df.shape

(89426, 11)

In [67]:
# Lookup each variant in the VCF data and determine if it's a true variant
for index, row in features.iterrows():   
    pos = row['POS']
    ref = row['REF']
    alt = row['ALT']
    
    if pos in gnd_truth_dict and gnd_truth_dict[pos] == (ref, alt):
        features.set_value(index, 'GROUND_TRUTH', 1)
    else:
        features.set_value(index, 'GROUND_TRUTH', 0)

In [68]:
# Write the labelled feature table as a tab-separated file
output_path = 'data/preprocessed/vcf_features_with_VQSR_labels.txt'
features.to_csv(output_path, sep='\t')

### Display the names of features that may be used for filtering. Include the annotations in the INFO field and the quality score (QUAL). Later, consider incorporating the REF and ALT fields as well.

In [69]:
# Candidate filtering features: the retained INFO fields plus the quality score
feature_names = info_field_names.union({'QUAL'})
feature_names

{'AC',
 'AF',
 'AN',
 'BaseQRankSum',
 'ClippingRankSum',
 'DP',
 'FS',
 'MLEAC',
 'MLEAF',
 'MQ',
 'MQ0',
 'MQRankSum',
 'QD',
 'QUAL',
 'ReadPosRankSum',
 'SOR',
 'VQSLOD'}

### Summary Statistics

In [70]:
# Show summary statistics for the features table
features.describe()

Unnamed: 0,PASSED_VQSR,AC,BaseQRankSum,MQRankSum,VQSLOD,AN,QUAL,FS,#CHROM,DP,...,MLEAF,MQ0,AF,POS,FILTER,SOR,QD,MQ,GROUND_TRUTH,ClippingRankSum
count,86825.0,86825.0,86825.0,86825.0,86825.0,86825.0,86825.0,86825.0,86825.0,86825.0,...,86825.0,86825.0,86825.0,86825.0,86825.0,86825.0,86825.0,86825.0,86825.0,86825.0
mean,0.786075,1.317409,-0.014988,-0.35926,13.972397,2.0,305.581818,2.124176,20.0,16.949911,...,0.658658,0.0,0.658704,30877850.0,0.0,1.175295,18.548858,57.873878,0.83461,-0.011719
std,0.410077,0.465471,1.206899,1.212392,10.261468,0.0,285.849485,3.937809,0.0,13.400111,...,0.232717,0.0,0.232735,18565830.0,0.0,0.771929,10.359626,5.106345,0.371534,0.732069
min,0.0,1.0,-6.28,-8.498,-81.87,2.0,10.2,0.0,20.0,1.0,...,0.5,0.0,0.5,61795.0,0.0,0.008,0.06,20.83,0.0,-3.575
25%,1.0,1.0,-0.572,-0.729,4.92,2.0,139.77,0.0,20.0,11.0,...,0.5,0.0,0.5,15398260.0,0.0,0.693,10.31,60.0,1.0,-0.358
50%,1.0,1.0,0.0,0.0,17.14,2.0,238.77,0.0,20.0,14.0,...,0.5,0.0,0.5,29472320.0,0.0,1.002,16.23,60.0,1.0,0.0
75%,1.0,2.0,0.546,0.135,21.05,2.0,410.77,2.75,20.0,18.0,...,1.0,0.0,1.0,47867030.0,0.0,1.492,29.62,60.0,1.0,0.32
max,1.0,2.0,6.495,4.704,25.2,2.0,6700.77,71.987,20.0,224.0,...,1.0,0.0,1.0,62962890.0,0.0,7.886,40.99,70.0,1.0,3.366


## 83.46% of calls in the VCF file are also in the ground truth file

## TODO: determine number of variants in ground truth that are not in the VCF file

In [71]:
# Display the (rows, columns) shape of the features table
features.shape

(86825, 27)

In [22]:
# 70% of the row count, i.e. the number of samples in a 70% training split
features.shape[0] * 0.7

60777.49999999999

#### There are 86825 total variant calls.  If we use a 70% training set, we can train on about 60,000 samples.