In [36]:
import random
from itertools import islice

import numpy as np
import sklearn
import sklearn.metrics
import vcf
from sklearn.linear_model import LogisticRegression
random.seed(1)

In [15]:
# INFO keys used as model inputs. prepare_data() emits one boolean column per
# entry, in this order, recording whether the key is present on a record.
features = "NSF NSM NSN REF SYN U3 U5 ASS DSS INT R3 R5".split()

In [16]:
# Human-readable labels for numeric significance codes, keyed by the code as a
# string. prepare_data() keeps only CLNSIG codes 2 and 5, which this table
# labels 'non-pathogenic' and 'pathogenic'.
# NOTE(review): presumably this mirrors the CLNSIG code table declared in the
# VCF header -- confirm against the file in use.
mapping = {'0':'unknown',
           '1':'untested',
           '2':'non-pathogenic',
           '3':'probable non-pathogenic',
           '4':'probable pathogenic',
           '5':'pathogenic',
           '6':'drug response',
           '7':'histocompatibility',
           '255':'other'}

In [17]:
def random_boolean(prob=0.5):
    """Return True when a uniform draw from the ``random`` module is below ``prob``."""
    draw = random.random()
    return draw < prob

In [29]:
def prepare_data(filename="clinvar.vcf.gz", n=None):
    """Build a boolean feature matrix and a label vector from a VCF file.

    A record is kept only when its ``CLNSIG`` INFO field holds exactly one
    entry, that entry contains no ``'|'``, and it parses to the integer 2 or 5.
    Records without ``CLNSIG`` or with an unparsable entry are skipped
    instead of raising.

    Parameters
    ----------
    filename : str
        Path handed to ``vcf.Reader`` (opened as compressed, utf-8).
    n : int or None
        When given, only the first ``n`` records of the file are examined.

    Returns
    -------
    (variants, clin_sig) : tuple of numpy arrays
        ``variants`` is a boolean array of shape ``(kept, len(features))``;
        column ``j`` is True when ``features[j]`` is a key of the record's
        INFO mapping. ``clin_sig`` is an integer array holding the matching
        code (2 or 5) for each kept record. With nothing kept, the shapes are
        ``(0, len(features))`` and ``(0,)``.
    """
    reader = vcf.Reader(filename=filename,
                        encoding='utf-8',
                        compressed=True)

    # Optionally restrict the scan to the first n records.
    records = reader if n is None else islice(reader, n)

    kept_codes = (2, 5)
    variants = []
    clin_sig = []

    for v in records:
        sig = v.INFO.get('CLNSIG')
        # No significance recorded, or several entries: not usable as a label.
        if sig is None or len(sig) != 1:
            continue

        entry = sig[0]
        # A '|' marks a compound entry, which is skipped as well.
        if entry is None or '|' in entry:
            continue

        try:
            code = int(entry)
        except (TypeError, ValueError):
            # Non-numeric significance value: skip rather than abort the parse.
            continue

        if code in kept_codes:
            clin_sig.append(code)
            variants.append([f in v.INFO for f in features])

    if variants:
        data = np.array(variants, dtype=bool)
    else:
        # Keep the result 2-D so downstream column indexing still works.
        data = np.zeros((0, len(features)), dtype=bool)

    return data, np.array(clin_sig, dtype=int)

In [30]:
def split_train_test(data, labels, prob=0.8, seed=None):
    """Randomly partition the rows of ``data``/``labels`` into train and test.

    Each row is assigned independently: it goes to the training set when a
    uniform draw is below ``prob`` and to the test set otherwise, so the
    split sizes are only approximately ``prob`` / ``1 - prob``.

    Parameters
    ----------
    data, labels : numpy arrays
        Must have the same length along axis 0.
    prob : float
        Threshold for a row to land in the training set.
    seed : int or None
        ``None`` draws from NumPy's global generator (controlled by
        ``np.random.seed``; Python's ``random.seed`` has no effect on it).
        Any other value seeds a private ``RandomState`` so the split is
        repeatable and the global generator is left untouched.

    Returns
    -------
    data_train, data_test, labels_train, labels_test

    Raises
    ------
    AssertionError
        If ``data`` and ``labels`` differ in their number of rows.
    """
    assert data.shape[0] == labels.shape[0], \
        "data and labels must have the same number of rows"
    rng = np.random if seed is None else np.random.RandomState(seed)
    train = rng.uniform(size=data.shape[0]) < prob
    test = ~train
    return data[train], data[test], labels[train], labels[test]

In [31]:
# Parse the default file ("clinvar.vcf.gz") in full (n=None) into the boolean
# feature matrix `data` and the label vector `labels`.
data, labels = prepare_data()

In [188]:
def exclude_feature(data_train, labels_train, feature=0):
    """Drop every training row in which column ``feature`` is set.

    Parameters
    ----------
    data_train : 2-D numpy array
        Feature matrix; boolean or 0/1 numeric values are both accepted.
    labels_train : numpy array
        Labels aligned with the rows of ``data_train``.
    feature : int
        Index of the column whose positive rows are removed.

    Returns
    -------
    (data, labels) : the rows of both inputs where the column is falsy.
    """
    # logical_not always yields a boolean mask. np.invert is a bitwise NOT,
    # which on integer data produces negative integers that would then be
    # used as (wrong) row indices.
    feature_negative = np.logical_not(data_train[:, feature])
    return data_train[feature_negative], labels_train[feature_negative]

In [199]:
# split_train_test draws from NumPy's global generator, which the
# random.seed(1) call in the first cell does not touch. Seed it here so the
# split, and every number derived from it, is the same on each run of this cell.
np.random.seed(1)
data_train, data_test, labels_train, labels_test = split_train_test(data, labels)

# Bias the training set: remove every row where feature 0 (features[0], "NSF")
# is present. The test set is left unfiltered.
data_train_biased, labels_train_biased = exclude_feature(data_train, labels_train, 0)

In [200]:
# Baseline: logistic regression with default settings, fitted on the filtered
# training rows only.
model_biased = LogisticRegression().fit(
    X=data_train_biased,
    y=labels_train_biased)

In [201]:
# Accuracy of the baseline model on the same filtered rows it was fitted on.
pr_train_biased = model_biased.predict(X=data_train_biased)
bias_train_accuracy = sklearn.metrics.accuracy_score(
    y_true=labels_train_biased,
    y_pred=pr_train_biased)
bias_train_accuracy

0.86023257862743374

In [202]:
# Learned weights of the baseline model, displayed for inspection.
# NOTE(review): presumably one weight per entry of `features`, in order. The
# first weight would then belong to feature 0, which exclude_feature removed
# from the training rows -- confirm.
model_biased.coef_

array([[ 0.        ,  1.36996751,  4.9366988 ,  0.84714885, -2.71102135,
        -0.70572509, -0.50052068,  3.87750944,  4.91005792, -1.07942626,
         0.08312256, -0.10693305]])

In [203]:
# Accuracy of the baseline model on the unfiltered test rows.
pr_test = model_biased.predict(X=data_test)
bias_test_accuracy = sklearn.metrics.accuracy_score(
    y_true=labels_test,
    y_pred=pr_test)
bias_test_accuracy

0.84616541353383457

In [204]:
# Same filtered training rows as the baseline, but with C=0.01 (a smaller C
# means stronger regularization in scikit-learn's LogisticRegression).
model_regularized = LogisticRegression(C=0.01).fit(
    X=data_train_biased,
    y=labels_train_biased)

In [205]:
# Accuracy of the regularized model on the filtered rows it was fitted on.
pr_train_regularized = model_regularized.predict(X=data_train_biased)
regularized_train_accuracy = sklearn.metrics.accuracy_score(
    y_true=labels_train_biased,
    y_pred=pr_train_regularized)
regularized_train_accuracy

0.85974803982028014

In [206]:
# Accuracy of the regularized model on the unfiltered test rows.
# NOTE(review): this reuses the name `pr_test`, overwriting the baseline
# model's test predictions from the earlier evaluation cell.
pr_test = model_regularized.predict(X=data_test)
regularized_test_accuracy = sklearn.metrics.accuracy_score(
    y_true=labels_test,
    y_pred=pr_test)
regularized_test_accuracy

0.85744360902255634

In [207]:
# Learned weights of the regularized (C=0.01) model, displayed for comparison
# with the baseline model's weights.
model_regularized.coef_

array([[ 0.        ,  0.99520228,  2.17250937,  0.75629343, -2.24797113,
        -0.31534148, -0.31479169,  1.13443461,  1.4126049 , -1.03676898,
         0.0165588 , -0.12199476]])

In [208]:
# Hypothesis check: both models were fitted on the same filtered training
# rows, and the regularized one (C=0.01) is expected to score higher on the
# unfiltered test set than the default-settings baseline.
assert regularized_test_accuracy > bias_test_accuracy, (
    "regularized model did not beat the baseline on the test set: "
    "%r <= %r" % (regularized_test_accuracy, bias_test_accuracy)
)