# This notebook preprocesses variant data. Before using this notebook, you must run the GATK ApplyRecalibration tool (i.e., VQSR) as follows:


# Then you must use GATK's VariantsToTable tool from the command line as follows:

### The code below performs additional preprocessing on the table generated by VariantsToTable.

In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

## Ignore columns that are nonnumeric for now. Also drop AN and MQ0 which have no variance

In [2]:
# Columns removed from the loaded table before any numeric preprocessing
# (this list is handed to DataFrame.drop right after the table is read).
ignore_cols = [
    'sample1.PL',
    'sample1.AD',
    'sample1.GT',
    'REF',
    'ALT',
    'ID',
    'CHROM',
    'POS',
    'AN',
    'MQ0',
]

## Remove NaNs from the VQSR-filtered SNP table. This table will be useful in case we wish to use features computed by GATK (e.g. ReadPosRankSum) for our own algorithm.

In [4]:
filename = 'NA12878.LowSeq.illumina.bwa.sorted.dedup.11.sam.wFlag.qual.recalibrated.filtered'
in_file = '../../vqsr_output/chrom11/' + filename + '.table' #'../data/vqsr_output_all_chrom/' + filename + '.table'

# Read the tab-separated table; df_orig keeps every column (POS/REF/ALT are
# needed later for ground-truth labelling), df is the copy without ignore_cols.
df_orig = pd.read_csv(in_file, sep='\t')
df = df_orig.drop(ignore_cols, axis=1)
df_filter = df['FILTER']

# The POSITIVE_TRAIN_SITE and DB (dbSNP membership) columns are either true or NA. Convert True to 1 and NA to 0.
df['POSITIVE_TRAIN_SITE'] = 1*df['POSITIVE_TRAIN_SITE'].fillna(0)
num_pos_train_sites = df['POSITIVE_TRAIN_SITE'].sum()
df['DB'] = 1*df['DB'].fillna(0)
num_dbSNP = df['DB'].sum()

# Each summary line is assembled into ONE string and passed to print(...) as a
# single argument. That prints exactly what the old multi-item print statement
# printed on Python 2 (items separated by one space) and is also valid Python 3.
print(df.columns)
print(df.shape)
print(' '.join(['Current number of features (before removing NaNs): ', str(df.shape[1])]))
print(' '.join(['Total number of variants: ', str(df.shape[0])]))
print(' '.join(['VQSR used ', str(num_pos_train_sites),
                ' variants for training. As a pct: ', str(100.0*num_pos_train_sites/df.shape[0])]))
print(' '.join(['Pct of variants in dbSNP ', str(100.0*num_dbSNP/df.shape[0])]))

Index([u'QUAL', u'FILTER', u'AC', u'AF', u'BaseQRankSum', u'ClippingRankSum',
       u'DB', u'DP', u'FS', u'MLEAC', u'MLEAF', u'MQ', u'MQRankSum',
       u'POSITIVE_TRAIN_SITE', u'QD', u'ReadPosRankSum', u'SOR', u'VQSLOD',
       u'culprit', u'FORMAT', u'sample1', u'sample1.DP', u'sample1.GQ'],
      dtype='object')
(187242, 23)
Current number of features (before removing NaNs):  23
Total number of variants:  187242
VQSR used  153698  variants for training. As a pct:  82.0852159238
Pct of variants in dbSNP  98.4538725286


In [5]:
# Summary statistics for the numeric columns of df; columns that contain NaNs
# are still present at this point (they are dropped in a later cell).
df.describe()

  interpolation=interpolation)


Unnamed: 0,QUAL,AC,AF,BaseQRankSum,ClippingRankSum,DP,FS,MLEAC,MLEAF,MQ,MQRankSum,QD,ReadPosRankSum,SOR,VQSLOD,FORMAT,sample1,sample1.DP,sample1.GQ
count,187242.0,187242.0,187242.0,109756.0,109756.0,187242.0,187242.0,187242.0,187242.0,187242.0,109756.0,187242.0,109697.0,187242.0,183713.0,0.0,0.0,187242.0,187242.0
mean,317.913841,1.415767,0.707883,-0.022772,-0.021392,14.578535,1.620895,1.415671,0.707835,58.494981,-0.146604,20.858594,0.117065,1.143479,14.653301,,,14.359599,71.151446
std,288.665872,0.492855,0.246428,1.298535,0.89901,10.387486,3.484486,0.492839,0.246419,4.858719,1.035364,10.283196,0.979537,0.705455,7.205113,,,7.832364,29.629664
min,10.2,1.0,0.5,-6.966,-3.599,1.0,0.0,1.0,0.5,20.0,-7.739,0.02,-6.937,0.006,-380.1,,,1.0,0.0
25%,156.77,1.0,0.5,,,11.0,0.0,1.0,0.5,60.0,,12.06,,0.693,,,,11.0,42.0
50%,265.78,1.0,0.5,,,14.0,0.0,1.0,0.5,60.0,,18.87,,0.991,,,,14.0,84.0
75%,447.77,2.0,1.0,,,17.0,2.276,2.0,1.0,60.0,,31.75,,1.445,,,,17.0,99.0
max,18645.77,2.0,1.0,7.142,3.383,759.0,228.098,2.0,1.0,70.0,4.787,41.47,7.272,8.887,23.59,,,424.0,99.0


In [6]:
# Discard every column that contains at least one NaN. Rebinding df (instead of
# mutating it with inplace=True) leaves later cells with the same frame.
df = df.dropna(axis='columns', how='any')
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(df.columns)
df.describe()

Index([u'QUAL', u'FILTER', u'AC', u'AF', u'DB', u'DP', u'FS', u'MLEAC',
       u'MLEAF', u'MQ', u'POSITIVE_TRAIN_SITE', u'QD', u'SOR', u'sample1.DP',
       u'sample1.GQ'],
      dtype='object')


Unnamed: 0,QUAL,AC,AF,DP,FS,MLEAC,MLEAF,MQ,QD,SOR,sample1.DP,sample1.GQ
count,187242.0,187242.0,187242.0,187242.0,187242.0,187242.0,187242.0,187242.0,187242.0,187242.0,187242.0,187242.0
mean,317.913841,1.415767,0.707883,14.578535,1.620895,1.415671,0.707835,58.494981,20.858594,1.143479,14.359599,71.151446
std,288.665872,0.492855,0.246428,10.387486,3.484486,0.492839,0.246419,4.858719,10.283196,0.705455,7.832364,29.629664
min,10.2,1.0,0.5,1.0,0.0,1.0,0.5,20.0,0.02,0.006,1.0,0.0
25%,156.77,1.0,0.5,11.0,0.0,1.0,0.5,60.0,12.06,0.693,11.0,42.0
50%,265.78,1.0,0.5,14.0,0.0,1.0,0.5,60.0,18.87,0.991,14.0,84.0
75%,447.77,2.0,1.0,17.0,2.276,2.0,1.0,60.0,31.75,1.445,17.0,99.0
max,18645.77,2.0,1.0,759.0,228.098,2.0,1.0,70.0,41.47,8.887,424.0,99.0


## Scale/normalize the VQSR-filtered SNP table (except categorical features)

In [7]:
def scale_data_frame(df):
    """Mean-centre each column and divide it by that column's range.

    For every column the result is ``(x - mean) / (max - min)``.

    A constant column has a range of zero; dividing by it would fill the
    column with NaN. Such columns are divided by 1 instead, so they come out
    as all zeros (their centred value). Columns with a non-zero range are
    scaled exactly as before.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns all support mean/min/max and arithmetic.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``.
    """
    centered = df - df.mean()
    value_range = df.max() - df.min()
    # Keep non-zero (and NaN) ranges untouched; swap only exact zeros for 1.
    safe_range = value_range.where(value_range != 0, 1)
    return centered / safe_range

In [8]:
# Columns treated as categorical: they bypass scaling and are copied over as-is.
cats = ['POSITIVE_TRAIN_SITE', 'DB', 'FILTER']

# Scale everything except the categorical columns ...
numeric_only = df.drop(cats, axis=1)
df_scaled = scale_data_frame(numeric_only)

# ... then append the untouched categorical columns, in the order listed above.
for cat_name in cats:
    df_scaled[cat_name] = df[cat_name]

df_scaled.describe()

Unnamed: 0,QUAL,AC,AF,DP,FS,MLEAC,MLEAF,MQ,QD,SOR,sample1.DP,sample1.GQ
count,187242.0,187242.0,187242.0,187242.0,187242.0,187242.0,187242.0,187242.0,187242.0,187242.0,187242.0,187242.0
mean,-6.263256e-14,2.419946e-14,2.419946e-14,1.489728e-16,-3.212201e-17,1.344049e-15,1.344049e-15,-8.06861e-16,-7.060106e-15,3.923935e-14,-1.620469e-16,6.513593e-15
std,0.01549005,0.492855,0.492855,0.01370381,0.01527627,0.4928386,0.4928386,0.09717438,0.2480868,0.07943421,0.01851623,0.2992895
min,-0.01651218,-0.4157668,-0.4157668,-0.01791363,-0.007106135,-0.4156706,-0.4156706,-0.7698996,-0.5027405,-0.1280801,-0.03158298,-0.7187015
25%,-0.008647111,-0.4157668,-0.4157668,-0.004721022,-0.007106135,-0.4156706,-0.4156706,0.03010037,-0.2122701,-0.05072395,-0.007942314,-0.294459
50%,-0.002797545,-0.4157668,-0.4157668,-0.0007632384,-0.007106135,-0.4156706,-0.4156706,0.03010037,-0.04797573,-0.01716917,-0.0008501154,0.1297834
75%,0.006968188,0.5842332,0.5842332,0.003194545,0.002872033,0.5843294,0.5843294,0.03010037,0.2627601,0.0339512,0.006242083,0.2812985
max,0.9834878,0.5842332,0.5842332,0.9820864,0.9928939,0.5843294,0.5843294,0.2301004,0.4972595,0.8719199,0.968417,0.2812985


In [9]:
# Per-column maximum of the scaled table (including the re-attached categorical columns).
df_scaled.max()

QUAL                                      0.983488
AC                                        0.584233
AF                                        0.584233
DP                                        0.982086
FS                                        0.992894
MLEAC                                     0.584329
MLEAF                                     0.584329
MQ                                          0.2301
QD                                        0.497259
SOR                                        0.87192
sample1.DP                                0.968417
sample1.GQ                                0.281299
POSITIVE_TRAIN_SITE                              1
DB                                               1
FILTER                 VQSRTrancheSNP99.90to100.00
dtype: object

In [10]:
# Write the NaN-free, scaled table as a tab-separated file.
out_file = ''.join(['../../data/processed/chrom11/', filename, '.noNAN.scaled.table'])
df_scaled.to_csv(out_file, sep='\t')

## Append Ground Truth data to the VQSR-filtered table

In [11]:
# Build dictionary of ground truth variants: POS -> (REF, ALT)
gt_file = '../../data/processed/chrom11/ground_truth_chrom_11.txt'
gt_df = pd.read_csv(gt_file, sep='\t')

# Zip the three columns directly instead of walking the frame with iterrows(),
# which builds a Series object for every row. The resulting mapping is the same.
# NOTE(review): the key is POS alone, so if the ground-truth file lists several
# records at one position only the last one is kept (this matches the previous
# loop's behaviour) -- confirm that is acceptable for multi-allelic sites.
gnd_truth_dict = dict(zip(gt_df['POS'], zip(gt_df['REF'], gt_df['ALT'])))

In [12]:
# (rows, columns) of the ground-truth table.
gt_df.shape

(213022, 11)

### Note that there are more ground truth variant calls than variant calls in the non-ground-truth dataset. This reflects the fact that the sequencing pipeline doesn't discover all of the true variants. 

In [13]:
# Initialize the ground truth column to all 0's
df_scaled['GROUND_TRUTH'] = 0

# Lookup each variant in the VCF data and collect the index labels of those whose
# (REF, ALT) pair equals the ground-truth entry at the same POS. dict.get returns
# None for an unknown position, which never equals a tuple, so such rows stay 0.
matched_labels = [
    idx
    for idx, pos, ref, alt in zip(df_orig.index, df_orig['POS'], df_orig['REF'], df_orig['ALT'])
    if gnd_truth_dict.get(pos) == (ref, alt)
]

# One label-based assignment replaces the per-row DataFrame.set_value calls
# (set_value is deprecated and absent from pandas >= 1.0).
df_scaled.loc[matched_labels, 'GROUND_TRUTH'] = 1


In [14]:
# Extend the base filename once; the extended name is reused when the
# train/test tables are saved further down.
label_suffix = '.noNAN.scaled.withGndTruthLabels'
# Guard the append so that re-running this cell does not stack the suffix twice.
if not filename.endswith(label_suffix):
    filename = filename + label_suffix
out_file_with_labels = '../../data/processed/chrom11/' + filename + '.table'
df_scaled.to_csv(out_file_with_labels, sep='\t')

In [15]:
# Fraction of rows in df_scaled (i.e. of called variants) whose GROUND_TRUTH label is 1.
num_true_variants = df_scaled['GROUND_TRUTH'].sum()
true_pos_unfiltered = 1.0*num_true_variants/df_scaled.shape[0]
# Label and value are joined into one string so the single-argument print(...)
# emits the same line on Python 2 and is also valid on Python 3.
print(' '.join(['Sensitivity (True Pos Rate) Before Any Filtering: ', str(true_pos_unfiltered)]))

Sensitivity (True Pos Rate) Before Any Filtering:  0.934581984811


### Important! 93.5% of the variants called in the chromosome 11 VCF file are true variants according to the ground truth. (Because the denominator here is the number of called variants, this figure is the precision of the unfiltered call set.) So any nontrivial algorithm should have a specificity less than 6%. If we assume that our VCF file contains all of the true positive variants, then our algorithm should attain a sensitivity > 93.5%.

## Split data into training (70%) and test (30%) sets 

In [16]:
# Split the row positions 0..num_samples-1 into train/test index lists,
# holding out 30% for testing; random_state is fixed for repeatability.
num_samples = df_scaled.shape[0]
sample_positions = range(num_samples)
idx_train_test = train_test_split(
    sample_positions,
    sample_positions,
    test_size=0.3,
    random_state=0,
)

In [17]:
# The first two entries of idx_train_test are the train and test index lists.
idx_train = idx_train_test[0]
idx_test = idx_train_test[1]
# No row may appear in both subsets.
assert set(idx_train).isdisjoint(idx_test)

# Select the rows of each subset by index label.
df_train = df_scaled.loc[idx_train,:]
df_test = df_scaled.loc[idx_test,:]

num_train = df_train.shape[0]
num_test = df_test.shape[0]

# Every row must land in exactly one of the two subsets.
assert num_train + num_test == num_samples

# Single-argument print(...) gives the same output on Python 2 and Python 3.
print('Training set size: ' + str(num_train))
print('Test set size: ' + str(num_test))

Training set size: 131069
Test set size: 56173


## Save the training and test sets

In [18]:
# Write both subsets next to the labelled table, one tab-separated file each.
processed_dir = '../../data/processed/chrom11/'
for subset, suffix in ((df_train, '.train.table'), (df_test, '.test.table')):
    subset.to_csv(processed_dir + filename + suffix, sep='\t')