In [63]:
import numpy as np
import pandas as pd

In [64]:
def parse_attributes(attributes):
    """Yield a ``(key, value)`` tuple for every ``key=value`` entry.

    Entries that contain no ``=`` (bare flags) are skipped.

    Args:
        attributes: iterable of strings, each either ``key=value`` or a bare flag.

    Yields:
        ``(key, value)`` tuples of strings, in input order.
    """
    for attr in attributes:
        # Split on the first '=' only, so a value that itself contains '='
        # stays intact instead of breaking the two-way unpacking.
        key, separator, val = attr.partition('=')
        if separator:
            yield (key, val)

def get_feature_names(df):
    """Return the set of ``key=value`` annotation names found in the first INFO entry.

    Only the first row of ``df['INFO']`` is inspected, so an annotation that is
    absent from that row is not reported. Bare flags (entries without ``=``) are
    not included.

    Args:
        df: DataFrame with an ``INFO`` column of ``;``-separated strings.

    Returns:
        A set of annotation names; empty when ``df`` has no rows.
    """
    info_column = df['INFO']
    if len(info_column) == 0:
        return set()
    # .iloc is positional, so the first row is found whatever the index labels are.
    first_row_info = info_column.iloc[0].split(';')
    return {key for (key, _val) in parse_attributes(first_row_info)}

In [69]:
# Tab-separated input file to load
filename = 'data/VQSRfilter/NA12878.LowSeq.illumina.bwa.sorted.dedup.20.sam.wFlag.qual.raw.snps.vqsr.recal copy.vcf'

# Non-numeric annotations are left out of the feature set for now (could be accounted for later)
ignore_cols = {'DB', 'POSITIVE_TRAIN_SITE', 'culprit'}

df = pd.read_csv(filename, sep='\t')

# Annotation names taken from the INFO column, minus the ignored ones
feature_names = get_feature_names(df).difference(ignore_cols)

# Output columns: every input column except the raw INFO string,
# plus one column per annotation and the PASSED_VQSR label
cols = set(df.columns).difference(['INFO']).union(feature_names, ['PASSED_VQSR'])

In [66]:
num_rows = df.shape[0]
# Start from an all-zero float table, so an annotation missing from a record stays 0.0
features = pd.DataFrame(np.zeros(shape=(num_rows, len(cols))), columns=list(cols))

# Copy over all columns except for INFO and FILTER (i.e. the annotations and the VQSR filter result)
# NOTE(review): if FILTER is a member of `cols`, it remains in `features` as an all-zero
# column because it is never copied here -- confirm whether that is intended.
for col in df.columns:
    if col in cols and col != 'INFO' and col != 'FILTER':
        features[col] = df[col]

# Copy over the INFO data.
# DataFrame.set_value was removed in pandas 1.0; DataFrame.at is the label-based scalar setter.
# Iterating over the index and the INFO column directly avoids building a Series per row.
for index, info in zip(df.index, df['INFO']):
    for (key, val) in parse_attributes(info.split(';')):
        if key in cols:
            # TODO: This is NOT the correct/robust way to handle this case. This is the case where the
            # location has multiple calls (e.g. heterozygosity for example). For now, just take the first
            # attribute for simplicity
            if ',' in val:
                val = val.split(',')[0]
            # Convert explicitly: the target columns are float, and writing the raw string
            # would upcast the column (or fail) on current pandas. A non-numeric value
            # raises ValueError here.
            features.at[index, key] = float(val)

# Label each row: 1.0 when FILTER is exactly 'PASS', otherwise 0.0
features['PASSED_VQSR'] = (df['FILTER'] == 'PASS').astype(float)

In [67]:
# Save the feature table as a tab-separated file (the row index is written as the first column)
features.to_csv(path_or_buf='data/preprocessed/vcf_features_with_VQSR_labels2.txt', sep='\t')

In [70]:
# Display the annotation names that were selected as feature columns
feature_names

{'AC',
 'AF',
 'AN',
 'BaseQRankSum',
 'ClippingRankSum',
 'DP',
 'FS',
 'MLEAC',
 'MLEAF',
 'MQ',
 'MQ0',
 'MQRankSum',
 'QD',
 'ReadPosRankSum',
 'SOR',
 'VQSLOD'}