# Logistic Regression Filter

### This serves as a strong baseline, achieving a 92% ROC AUC score

In [45]:
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc

In [36]:
def read_vcf_data(filename, drop_chrom_pos=True, drop_alt=True, drop_ref=True,
                  drop_nonnumeric_data=True, drop_VQSR_data=True):
    data = pd.read_csv(filename, sep='\t')
    data = data.drop(['#CHROM'], axis=1)
    data = data.drop(['ID'], axis=1)
    data = data.drop(['FORMAT'], axis=1)
    data = data.drop(['sample1'], axis=1)
    if drop_chrom_pos:
        data = data.drop(['POS'], axis=1)
    if drop_ref:
        data = data.drop(['REF'], axis=1)
    if drop_alt:
        data = data.drop(['ALT'], axis=1)
    if drop_nonnumeric_data:
        data = data.drop([], axis=1)
    if drop_VQSR_data:
        data = data.drop(['PASSED_VQSR', 'VQSLOD'], axis=1)
    return data    

In [37]:
df_train_full = read_vcf_data('data/preprocessed/vcf_features_test.txt')

## Split into train and dev

In [40]:
num_train = df_train_full.shape[0]
idx_train_dev = train_test_split(range(num_train), range(num_train), test_size = 0.3)
idx_train = idx_train_dev[0]
idx_dev = idx_train_dev[1]

df_train = df_train_full.loc[idx_train,:]
df_dev = df_train_full.loc[idx_dev,:]

df_train_Y = df_train_full['GROUND_TRUTH']
df_train_X = df_train_full.drop(['GROUND_TRUTH'], axis=1)

df_dev_Y = df_dev['GROUND_TRUTH']
df_dev_X = df_dev.drop(['GROUND_TRUTH'], axis=1)

## Fit Logistic Regression Model

In [41]:
num_folds = 3   # number of folds to use for cross-validation
loss_function = 'l2'  # Loss function to use. Must be either 'l1' or 'l2'
logreg_cv_L2 = linear_model.LogisticRegressionCV(cv=num_folds, penalty=loss_function)
logreg_cv_L2.fit(df_train_X, df_train_Y)

LogisticRegressionCV(Cs=10, class_weight=None, cv=3, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

## Make and evaluate predictions

In [52]:
predictions = logreg_cv_L2.predict(df_dev_X)
#conf_lr_cv = logreg_cv.decision_function(x_te)
prob_scores = logreg_cv_L2.predict_proba(df_dev_X)
prob_of_pos = prob_scores[:, 1]

score = roc_auc_score(df_dev_Y, prob_of_pos)

In [53]:
score

0.92312185206098241