# This notebook preprocesses variant data

### TODO: consider marking presence of heterozygosity as a binary feature. Could be useful for labeling data in visualizations

### Before running this, you must use GATK's VariantsToTable tool. This notebook performs additional preprocessing on the table generated by VariantsToTable.

In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

## Ignore columns that are nonnumeric for now. Also drop AN and MQ0 which have no variance

In [2]:
# Columns excluded from the feature table: per-sample genotype fields,
# allele/identifier/coordinate columns, and AN / MQ0 (no variance).
ignore_cols = [
    'sample1.PL', 'sample1.AD', 'sample1.GT',
    'REF', 'ALT', 'ID', 'CHROM', 'POS',
    'AN', 'MQ0',
]

## Remove NANs from the VQSR-filtered SNP table. This table will be useful in case we wish to use features computed by GATK (e.g. ReadPosRankSum) for our own algorithm.

In [3]:
# Read the tab-separated VQSR-filtered SNP table.
filename = 'NA12878.LowSeq.illumina.bwa.sorted.dedup.20.sam.wFlag.qual.recalibrated.filtered'
in_file = ''.join(['../data/vqsr_output/', filename, '.table'])
df_orig = pd.read_csv(in_file, sep='\t')

# Set the VQSR FILTER label aside, then remove it together with the ignored
# columns so that `df` only holds candidate feature columns.
df_filter = df_orig['FILTER']
df = df_orig.drop(ignore_cols + ['FILTER'], axis=1)

print(df.columns)
print(df.shape)

Index([u'QUAL', u'AC', u'AF', u'BaseQRankSum', u'ClippingRankSum', u'DB',
       u'DP', u'FS', u'MLEAC', u'MLEAF', u'MQ', u'MQRankSum',
       u'POSITIVE_TRAIN_SITE', u'QD', u'ReadPosRankSum', u'SOR', u'VQSLOD',
       u'culprit', u'FORMAT', u'sample1', u'sample1.DP', u'sample1.GQ'],
      dtype='object')
(86843, 22)


In [4]:
# Summary statistics for the remaining columns, before any NaN handling.
df.describe()



Unnamed: 0,QUAL,AC,AF,BaseQRankSum,ClippingRankSum,DP,FS,MLEAC,MLEAF,MQ,MQRankSum,QD,ReadPosRankSum,SOR,VQSLOD,FORMAT,sample1,sample1.DP,sample1.GQ
count,86843.0,86843.0,86843.0,59433.0,59433.0,86843.0,86843.0,86843.0,86843.0,86843.0,59433.0,86843.0,59402.0,86843.0,84521.0,0.0,0.0,86843.0,86843.0
mean,305.694981,1.317343,0.658671,-0.021814,-0.017144,16.953514,2.123864,1.317251,0.658625,57.872924,-0.525123,18.550534,0.100956,1.175315,14.530439,,,16.730767,75.993713
std,286.1776,0.465445,0.232722,1.458781,0.884873,13.412419,3.937501,0.465409,0.232704,5.107752,1.435626,10.359611,1.009716,0.771909,9.284664,,,13.057907,28.6166
min,10.2,1.0,0.5,-6.28,-3.575,1.0,0.0,1.0,0.5,20.83,-8.498,0.06,-4.343,0.008,-55.76,,,1.0,0.0
25%,139.77,1.0,0.5,,,11.0,0.0,1.0,0.5,60.0,,10.31,,0.693,,,,11.0,48.0
50%,238.77,1.0,0.5,,,14.0,0.0,1.0,0.5,60.0,,16.23,,1.002,,,,14.0,99.0
75%,411.77,2.0,1.0,,,18.0,2.7435,2.0,1.0,60.0,,29.62,,1.492,,,,18.0,99.0
max,6700.77,2.0,1.0,6.495,3.366,224.0,71.987,2.0,1.0,70.0,4.704,40.99,3.963,7.886,25.2,,,218.0,99.0


In [5]:
# Keep only the columns that have a value for every variant; any column
# containing at least one NaN is discarded.
df = df.dropna(axis='columns', how='any')
print(df.columns)
df.describe()

Index([u'QUAL', u'AC', u'AF', u'DP', u'FS', u'MLEAC', u'MLEAF', u'MQ', u'QD',
       u'SOR', u'sample1.DP', u'sample1.GQ'],
      dtype='object')


Unnamed: 0,QUAL,AC,AF,DP,FS,MLEAC,MLEAF,MQ,QD,SOR,sample1.DP,sample1.GQ
count,86843.0,86843.0,86843.0,86843.0,86843.0,86843.0,86843.0,86843.0,86843.0,86843.0,86843.0,86843.0
mean,305.694981,1.317343,0.658671,16.953514,2.123864,1.317251,0.658625,57.872924,18.550534,1.175315,16.730767,75.993713
std,286.1776,0.465445,0.232722,13.412419,3.937501,0.465409,0.232704,5.107752,10.359611,0.771909,13.057907,28.6166
min,10.2,1.0,0.5,1.0,0.0,1.0,0.5,20.83,0.06,0.008,1.0,0.0
25%,139.77,1.0,0.5,11.0,0.0,1.0,0.5,60.0,10.31,0.693,11.0,48.0
50%,238.77,1.0,0.5,14.0,0.0,1.0,0.5,60.0,16.23,1.002,14.0,99.0
75%,411.77,2.0,1.0,18.0,2.7435,2.0,1.0,60.0,29.62,1.492,18.0,99.0
max,6700.77,2.0,1.0,224.0,71.987,2.0,1.0,70.0,40.99,7.886,218.0,99.0


## Scale/normalize the VQSR-filtered SNP table

In [6]:
def scale_data_frame(df):
    """Mean-normalize every column of a numeric data frame.

    Each column is centered on its mean and divided by its range
    (max - min), so the result has zero mean and a spread of at most 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all numeric.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns; `df` is not modified.
        A constant column (zero range) comes out as all zeros.
    """
    centered = df - df.mean()
    col_range = df.max() - df.min()
    # A constant column has zero range and dividing by it would turn the whole
    # column into NaN (0/0). Divide by 1 instead so it stays at 0.
    col_range = col_range.where(col_range != 0, 1)
    return centered / col_range

In [7]:
# Mean-normalize the features (sklearn's preprocessing.scale(df) was the
# alternative tried here) and put the VQSR label back in as a FILTER column.
df_scaled = scale_data_frame(df).assign(FILTER=df_filter)

df_scaled.describe()

Unnamed: 0,QUAL,AC,AF,DP,FS,MLEAC,MLEAF,MQ,QD,SOR,sample1.DP,sample1.GQ
count,86843.0,86843.0,86843.0,86843.0,86843.0,86843.0,86843.0,86843.0,86843.0,86843.0,86843.0,86843.0
mean,4.847894e-14,-7.101326e-15,-7.101326e-15,-6.070093e-16,-6.682606e-16,1.426231e-14,1.426231e-14,-8.793134e-15,9.011419e-15,1.96153e-14,1.671576e-15,-3.702616e-15
std,0.04277328,0.4654448,0.4654448,0.06014538,0.05469738,0.4654086,0.4654086,0.1038794,0.2531056,0.09798293,0.06017468,0.2890566
min,-0.04416589,-0.3173428,-0.3173428,-0.07154042,-0.02950343,-0.3172507,-0.3172507,-0.7533643,-0.4517599,-0.148174,-0.07249201,-0.7676133
25%,-0.02479983,-0.3173428,-0.3173428,-0.02669737,-0.02950343,-0.3172507,-0.3172507,0.04325963,-0.2013324,-0.06122305,-0.02640906,-0.2827648
50%,-0.01000288,-0.3173428,-0.3173428,-0.01324446,-0.02950343,-0.3172507,-0.3172507,0.04325963,-0.0566952,-0.02199989,-0.01258418,0.2323867
75%,0.01585441,0.6826572,0.6826572,0.004692763,0.008607615,0.6827493,0.6827493,0.04325963,0.2704487,0.04019863,0.005849,0.2323867
max,0.9558341,0.6826572,0.6826572,0.9284596,0.9704966,0.6827493,0.6827493,0.2466357,0.5482401,0.851826,0.927508,0.2323867


In [8]:
# Per-column maximum of the scaled table (FILTER is included, as a string column).
df_scaled.max()

QUAL                             0.955834
AC                               0.682657
AF                               0.682657
DP                                0.92846
FS                               0.970497
MLEAC                            0.682749
MLEAF                            0.682749
MQ                               0.246636
QD                                0.54824
SOR                              0.851826
sample1.DP                       0.927508
sample1.GQ                       0.232387
FILTER        VQSRTrancheSNP99.90to100.00
dtype: object

In [9]:
# Persist the scaled, NaN-free table (tab-separated, index included).
out_file = ''.join(['../data/processed2/', filename, '.noNAN.scaled.table'])
df_scaled.to_csv(path_or_buf=out_file, sep='\t')

In [10]:
# Show the first FILTER label that is not "PASS"; print nothing if every
# variant passed. A private sentinel distinguishes "no match" from any value
# that could actually occur in the column.
_no_match = object()
first_non_pass = next((label for label in df_filter if label != "PASS"), _no_match)
if first_non_pass is not _no_match:
    print(first_non_pass)

VQSRTrancheSNP99.00to99.90


## Append Ground Truth data to the VQSR-filtered table

In [11]:
# Build dictionary of ground truth variants: POS -> (REF, ALT)
gt_file = '../data_old/preprocessed/ground_truth_chrom_20.txt'
gt_df = pd.read_csv(gt_file, sep='\t')

# Zip the three columns instead of materializing one Series per row with
# iterrows(); the resulting mapping is the same. If a position occurs more
# than once in the ground truth, the last record wins.
gnd_truth_dict = {
    pos: (ref, alt)
    for pos, ref, alt in zip(gt_df['POS'], gt_df['REF'], gt_df['ALT'])
}

In [12]:
# (rows, columns) of the ground-truth table.
gt_df.shape

(89426, 11)

### Note that there are more ground truth variant calls than variant calls in the non-ground-truth dataset. This reflects the fact that the sequencing pipeline doesn't discover all of the true variants. 

In [13]:
# Initialize the ground truth column to all 0's
df_scaled['GROUND_TRUTH'] = 0

# Lookup each variant in the VCF data and label it as a true variant or not
for index, row in df_orig.iterrows():   
    pos = row['POS']
    ref = row['REF']
    alt = row['ALT']
    
    if pos in gnd_truth_dict and gnd_truth_dict[pos] == (ref, alt):
        df_scaled.set_value(index, 'GROUND_TRUTH', 1)


In [14]:
# Extend the base name with the labelled-table suffix. The cell that saves the
# train/test splits reuses `filename`, so it is updated here -- but only once:
# re-running this cell must not append the suffix a second time.
labels_suffix = '.noNAN.scaled.withGndTruthLabels'
if not filename.endswith(labels_suffix):
    filename = filename + labels_suffix

# Write the scaled table together with its GROUND_TRUTH labels.
out_file_with_labels = '../data/processed2/' + filename + '.table'
df_scaled.to_csv(out_file_with_labels, sep='\t')

In [15]:
# Number of called variants that the ground truth confirms (GROUND_TRUTH == 1).
num_true_variants = sum(df_scaled['GROUND_TRUTH'])
# Fraction of all called variants that are confirmed; 1.0* forces float division.
true_pos_unfiltered = 1.0*num_true_variants/df_scaled.shape[0]
# NOTE(review): the denominator is the number of calls in this table, not the
# number of ground-truth variants, so this ratio is a precision
# (true calls / all calls) rather than a sensitivity -- confirm the label.
print 'Sensitivity (True Pos Rate) Before Any Filtering: ', true_pos_unfiltered

Sensitivity (True Pos Rate) Before Any Filtering:  0.834356252087


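### Important! 83.4% of the variants called in the chromosome 20 VCF file are true variants according to the ground truth. (This ratio, true calls / all calls, is the precision of the unfiltered call set; sensitivity would instead divide by the number of ground-truth variants.) So any nontrivial algorithm should leave fewer than 17% false positives among the calls it keeps, i.e. it should beat the 83.4% baseline precision. If we assume that our VCF file contains all of the true positive variants, then keeping every call already yields 100% sensitivity at 83.4% precision, and that is the trade-off our algorithm has to improve on.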

# Remove NANs from the raw SNP table. TODO: update this with normalization and scaling...

In [None]:
# Read the tab-separated raw SNP table and take a first look at it.
raw_filename = 'NA12878.LowSeq.illumina.bwa.sorted.dedup.20.sam.wFlag.qual.raw.snps'
raw_in_file = ''.join(['../data/processed/', raw_filename, '.table'])
raw_df = pd.read_csv(raw_in_file, sep='\t')

print(raw_df.columns)
print(raw_df.shape)
raw_df.describe()

In [None]:
# Discard every column of the raw table that contains at least one NaN.
raw_df = raw_df.dropna(axis='columns', how='any')
print(raw_df.columns)
raw_df.describe()

In [None]:
# Persist the NaN-free raw table (tab-separated, index included).
raw_out_file = ''.join(['../data/processed/', raw_filename, '.noNAN.table'])
raw_df.to_csv(path_or_buf=raw_out_file, sep='\t')

## Split data into training (70%) and test (30%) sets 

In [16]:
# Draw a 70/30 train/test split over the row positions of df_scaled.
# The seed is fixed so that re-running the notebook regenerates the same
# train and test tables; without random_state every run shuffles differently.
SPLIT_SEED = 0

num_samples = df_scaled.shape[0]
# The positions are passed twice, so the result holds four lists; entries
# [0] and [1] are the train and test positions used by the following cell.
idx_train_test = train_test_split(
    range(num_samples),
    range(num_samples),
    test_size=0.3,
    random_state=SPLIT_SEED,
)

In [17]:
# Row positions assigned to each side of the split.
idx_train = idx_train_test[0]
idx_test = idx_train_test[1]
assert not (set(idx_train) & set(idx_test)), 'train and test positions overlap'

# The indices were drawn from range(num_samples), i.e. they are row positions,
# so select positionally with .iloc. (.loc would only give the same rows while
# the frame keeps its default 0..n-1 index.)
df_train = df_scaled.iloc[idx_train, :]
df_test = df_scaled.iloc[idx_test, :]

num_train = df_train.shape[0]
num_test = df_test.shape[0]

# Every row must end up in exactly one of the two sets.
assert num_train + num_test == num_samples, 'split does not cover all rows'

print('Training set size: ' + str(num_train))
print('Test set size: ' + str(num_test))

Training set size: 60790
Test set size: 26053


## Save the training and test sets

In [18]:
# Write the training and test tables (tab-separated, index included); the two
# files differ only in their suffix.
for split_df, split_suffix in ((df_train, '.train.table'), (df_test, '.test.table')):
    split_df.to_csv('../data/processed2/' + filename + split_suffix, sep='\t')