In [36]:
import random
from itertools import islice

import numpy as np
import sklearn
import sklearn.metrics
import vcf
from sklearn.linear_model import LogisticRegression
# Seed both generators: `random` backs random_boolean, while the train/test
# split draws from NumPy's global generator, which random.seed() does not affect.
random.seed(1)
np.random.seed(1)

In [15]:
# VCF INFO flags used as boolean model inputs. The order of this list fixes the
# column order of the feature matrix and therefore of the model coefficients.
# NOTE(review): the short meanings below follow the dbSNP VCF header
# conventions -- confirm against the header of the file actually used.
features = [
    "NSF",  # frameshift
    "NSM",  # missense
    "NSN",  # nonsense
    "REF",  # coding, allele identical to reference
    "SYN",  # synonymous
    "U3",   # 3' UTR
    "U5",   # 5' UTR
    "ASS",  # acceptor splice site
    "DSS",  # donor splice site
    "INT",  # intron
    "R3",   # 3' gene region
    "R5",   # 5' gene region
]

In [16]:
# Human-readable labels for clinical-significance codes. Keys are kept as
# strings; prepare_data only retains codes 2 and 5.
mapping = {'0': 'unknown',
           '1': 'untested',
           '2': 'non-pathogenic',
           '3': 'probable non-pathogenic',
           '4': 'probable pathogenic',
           '5': 'pathogenic',
           '6': 'drug response',
           '7': 'histocompatibility',
           '255': 'other'}

In [17]:
def random_boolean(prob=0.5):
    """Return True when a fresh uniform draw from [0, 1] falls below ``prob``.

    Uses the standard-library ``random`` module, so results follow whatever
    seed was given to ``random.seed``.
    """
    draw = random.uniform(0, 1)
    is_below = draw < prob
    return is_below

In [29]:
def prepare_data(filename="clinvar.vcf.gz", n=None):
    """Build a boolean feature matrix and a label vector from a VCF file.

    A record is kept only when its ``CLNSIG`` INFO entry holds exactly one
    value, that value parses as an integer, and the integer is 2 or 5
    (labelled 'non-pathogenic' and 'pathogenic' in ``mapping``). Each kept
    record yields one row whose columns state whether the corresponding name
    in ``features`` is present in the record's INFO field.

    Parameters
    ----------
    filename : str
        Path of the compressed VCF file to read.
    n : int or None
        If given, only the first ``n`` records of the file are examined.

    Returns
    -------
    tuple of numpy.ndarray
        ``(variants, clin_sig)``: a bool array of shape
        ``(kept, len(features))`` and an integer array of shape ``(kept,)``
        containing the codes 2 or 5.
    """
    reader = vcf.Reader(filename=filename,
                        encoding='utf-8',
                        compressed=True)

    # islice stops after n records instead of iterating the whole file.
    records = reader if n is None else islice(reader, n)

    variants = []
    clin_sig = []

    for v in records:
        # Records without a CLNSIG annotation are skipped rather than
        # aborting the whole load with a KeyError.
        sig = v.INFO.get('CLNSIG')
        if sig is None or len(sig) != 1:
            continue
        try:
            # Compound values such as '2|5', missing values and non-numeric
            # labels all fail int() and are skipped.
            code = int(sig[0])
        except (TypeError, ValueError):
            continue
        if code in (2, 5):
            clin_sig.append(code)
            variants.append([f in v.INFO for f in features])

    # Explicit dtype and reshape keep the (rows, len(features)) layout even
    # when no record qualifies, so downstream column indexing still works.
    data = np.array(variants, dtype=bool).reshape(-1, len(features))
    return data, np.array(clin_sig, dtype=int)

In [30]:
def split_train_test(data, labels, prob=0.8, rng=None):
    """Randomly partition rows into a training subset and a test subset.

    Every row is assigned independently to the training subset with
    probability ``prob``; all remaining rows form the test subset, so the
    subset sizes vary from call to call.

    Parameters
    ----------
    data : numpy.ndarray
        Array whose first axis indexes samples.
    labels : numpy.ndarray
        Array with the same first-axis length as ``data``.
    prob : float
        Probability that a row lands in the training subset.
    rng : None, int, or object with ``uniform(size=...)``
        Source of randomness. ``None`` (default) uses NumPy's global
        generator, an int seeds a fresh ``numpy.random.RandomState``, and any
        other object is used directly (e.g. a ``RandomState`` instance).

    Returns
    -------
    tuple
        ``(data_train, data_test, labels_train, labels_test)``.

    Raises
    ------
    AssertionError
        If ``data`` and ``labels`` differ in first-axis length.
    """
    assert data.shape[0] == labels.shape[0]
    if rng is None:
        # The numpy.random module itself exposes uniform(), backed by the
        # global state, which keeps np.random.seed() effective.
        rng = np.random
    elif isinstance(rng, int):
        rng = np.random.RandomState(rng)
    train = rng.uniform(size=data.shape[0]) < prob
    test = ~train
    return data[train], data[test], labels[train], labels[test]

In [31]:
# Load with the defaults: every record of "clinvar.vcf.gz" is examined (n=None).
data, labels = prepare_data()

In [140]:
def exclude_feature(data_train, labels_train, feature=0):
    """Drop every sample for which the given feature column is set.

    Parameters
    ----------
    data_train : numpy.ndarray
        2-D array of samples by features; the selected column is interpreted
        as truthy/falsy.
    labels_train : numpy.ndarray
        Labels aligned with the rows of ``data_train``.
    feature : int
        Index of the column to test.

    Returns
    -------
    tuple of numpy.ndarray
        The rows of ``data_train`` and the entries of ``labels_train`` where
        the selected column is falsy.
    """
    # Cast to bool first: on an integer 0/1 column a bare invert is a bitwise
    # NOT that yields negative integers, which would then be used as fancy
    # indices and silently pick the wrong rows.
    keep = ~np.asarray(data_train[:, feature], dtype=bool)
    return data_train[keep], labels_train[keep]

In [151]:
data_train, data_test, labels_train, labels_test = split_train_test(data, labels)

# Bias the training set on purpose: remove every training variant carrying the
# "NSM" flag (column 1 of the feature matrix). The test set is left untouched,
# so it still contains such variants.
excluded_feature = features.index("NSM")
data_train_biased, labels_train_biased = exclude_feature(
    data_train, labels_train, excluded_feature)

In [152]:
# Baseline: logistic regression with scikit-learn's default settings, fitted on
# the biased training set (rows with feature 1 set were removed).
model_biased = LogisticRegression().fit(data_train_biased, labels_train_biased)

In [153]:
# Training accuracy: the model is scored on the same biased data it was fitted on.
pr_train_biased = model_biased.predict(data_train_biased)
bias_train_accuracy = sklearn.metrics.accuracy_score(labels_train_biased, pr_train_biased)
bias_train_accuracy

0.94059276667972325

In [154]:
# One coefficient per entry of `features`, in the same order. Column 1 is
# always False in the biased training set, which is consistent with the 0
# coefficient at index 1 in the recorded output.
model_biased.coef_

array([[ 3.76326193,  0.        ,  4.75489905,  2.30891384, -4.92705175,
        -1.67820222, -1.35374587,  3.67738094,  4.53773854, -2.12268436,
         0.30787301, -0.02104479]])

In [155]:
# Test accuracy of the baseline model on the unfiltered test split, which still
# contains variants with the feature that was removed from training.
pr_test = model_biased.predict(data_test)
bias_test_accuracy = sklearn.metrics.accuracy_score(labels_test, pr_test)
bias_test_accuracy

0.88543347513569015

In [156]:
# Same biased training data, but with C=0.001. In scikit-learn C is the inverse
# of the regularization strength, so a small C means a stronger penalty.
model_regularized = LogisticRegression(C=0.001).fit(data_train_biased, labels_train_biased)

In [157]:
# Training accuracy of the regularized model on the biased training set.
pr_train_regularized = model_regularized.predict(data_train_biased)
regularized_train_accuracy = sklearn.metrics.accuracy_score(labels_train_biased, pr_train_regularized)
regularized_train_accuracy

0.91663402532967753

In [158]:
# Test accuracy of the regularized model on the unfiltered test split.
# NOTE(review): `pr_test` is reused here and overwrites the baseline model's
# test predictions from the earlier cell.
pr_test = model_regularized.predict(data_test)
regularized_test_accuracy = sklearn.metrics.accuracy_score(labels_test, pr_test)
regularized_test_accuracy

0.87589848907143908

In [159]:
# Coefficients of the regularized model, ordered like `features`; compare their
# magnitudes with model_biased.coef_ above.
model_regularized.coef_

array([[ 0.82754514,  0.        ,  0.67805755,  0.52803158, -0.96629316,
        -0.04992243, -0.03072477,  0.18139155,  0.24689715, -0.54249605,
         0.00142071, -0.03500577]])

In [160]:
# Did the stronger regularization improve accuracy on the unfiltered test set?
# (The recorded run says it did not.)
regularized_test_accuracy > bias_test_accuracy

False