In [3]:
import random
from itertools import islice

import numpy as np
import sklearn
import sklearn.metrics
import vcf
from sklearn.linear_model import LogisticRegression
# Seed both random number generators used in this notebook: the stdlib one
# (used by random_boolean) and NumPy's global one (used by split_train_test
# through np.random.uniform). Seeding only `random` leaves the train/test
# split different on every run.
random.seed(1)
np.random.seed(1)

In [4]:
# INFO keys looked up on every VCF record by prepare_data: the presence or
# absence of each key becomes one boolean feature column, in this order.
# NOTE(review): presumably dbSNP-style functional-class flags (coding
# consequence, UTR, splice site, intron, gene region) - confirm against the
# header of the VCF file.
features = ["NSF", "NSM", "NSN", "REF", "SYN", 
            "U3", "U5", "ASS", "DSS", "INT", "R3", "R5"]

In [5]:
# Human-readable labels for clinical-significance codes, keyed by the code
# as a string.
# NOTE(review): presumably these are the CLNSIG values read in prepare_data,
# which keeps only codes 2-5 - confirm against the VCF header.
mapping = {'0':'unknown',
           '1':'untested',
           '2':'non-pathogenic',
           '3':'probable non-pathogenic',
           '4':'probable pathogenic',
           '5':'pathogenic',
           '6':'drug response',
           '7':'histocompatibility',
           '255':'other'}

In [6]:
def random_boolean(prob=0.5):
    """Return True with probability ``prob``, using the stdlib ``random`` RNG.

    One uniform sample is drawn and compared against ``prob``, so the result
    is never True for ``prob <= 0`` and always True for ``prob >= 1``.
    """
    sample = random.uniform(0, 1)
    return sample < prob

In [63]:
def prepare_data(filename="clinvar.vcf.gz", n=None):
    """Build a boolean feature matrix and a label vector from a VCF file.

    Records are read with ``vcf.Reader``. A record is kept only when its
    ``CLNSIG`` INFO field holds exactly one value that parses as an integer
    in the range 2..5. Records with a missing field, several values, a
    ``|``-joined value, or a non-numeric value are skipped instead of
    raising.

    Parameters
    ----------
    filename : str
        Path to the compressed VCF file.
    n : int or None
        Maximum number of records to *read* (not to keep); ``None`` reads
        the whole file.

    Returns
    -------
    (variants, clin_sig)
        ``variants`` is a boolean array of shape ``(kept, len(features))``
        where column ``j`` tells whether ``features[j]`` is present in the
        record's INFO; ``clin_sig`` is the matching 1-D integer array of
        significance codes. Both stay correctly shaped when nothing is kept.
    """
    reader = vcf.Reader(filename=filename,
                        encoding='utf-8',
                        compressed=True)

    variants = []
    clin_sig = []

    # islice(reader, None) iterates over everything, so no branch is needed.
    for record in islice(reader, n):
        values = record.INFO.get('CLNSIG')
        if not values or len(values) != 1:
            continue

        # int() rejects None (TypeError) as well as '|'-joined or textual
        # values (ValueError); all of these are treated as "not usable".
        try:
            code = int(values[0])
        except (TypeError, ValueError):
            continue

        if 2 <= code <= 5:
            clin_sig.append(code)
            variants.append([name in record.INFO for name in features])

    # Explicit dtype and shape keep the result 2-D even when it is empty,
    # so that column indexing such as data[:, 0] keeps working.
    variant_matrix = np.array(variants, dtype=bool).reshape(len(variants),
                                                            len(features))
    return variant_matrix, np.array(clin_sig, dtype=int)

In [8]:
def split_train_test(data, labels, prob=0.8, seed=None):
    """Randomly partition rows into a training and a test subset.

    Every row is assigned to the training set independently with
    probability ``prob``, so the subset sizes are random rather than exact.

    Parameters
    ----------
    data, labels : array-like
        Must have the same length along the first axis.
    prob : float
        Probability that a row ends up in the training set.
    seed : int or None
        When given, a private ``np.random.RandomState(seed)`` is used, which
        makes the split reproducible and leaves NumPy's global RNG state
        untouched. ``None`` (the default) draws from the global ``np.random``.

    Returns
    -------
    data_train, data_test, labels_train, labels_test
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    assert data.shape[0] == labels.shape[0]

    rng = np.random if seed is None else np.random.RandomState(seed)
    train = rng.uniform(size=data.shape[0]) < prob
    test = ~train
    return data[train], data[test], labels[train], labels[test]

In [64]:
# Read at most the first 10000 records of the default file; only the records
# that pass the filter in prepare_data end up in `data` (feature matrix) and
# `labels` (matching significance codes).
data, labels = prepare_data(n=10000)

In [10]:
def exclude_feature(data_train, labels_train, feature=0):
    """Drop every training row in which the given feature column is set.

    ``data_train`` is expected to be a 2-D boolean NumPy array and
    ``labels_train`` an array with one entry per row. The column at index
    ``feature`` is inverted and used as a row mask, so only rows where that
    feature is False survive.

    Returns the filtered ``(data, labels)`` pair.
    """
    keep = ~data_train[:, feature]
    kept_data = data_train[keep]
    kept_labels = labels_train[keep]
    return kept_data, kept_labels

In [65]:
# Random train/test split with the default training probability (0.8).
data_train, data_test, labels_train, labels_test = split_train_test(data, labels)

# Biased training set: drop every training row in which feature 0 ("NSF", the
# first entry of `features`) is present. The test set is left untouched.
data_train_biased, labels_train_biased = exclude_feature(data_train, labels_train, 0)

In [98]:
# Baseline: logistic regression fitted on the full (unbiased) training set.
# C is scikit-learn's inverse regularization strength, so 0.1 regularizes
# more strongly than the default of 1.0.
logistic = LogisticRegression(C=0.1).fit(data_train, labels_train)

# Accuracy on the same rows the model was fitted on.
pr_train = logistic.predict(data_train)
sklearn.metrics.accuracy_score(labels_train, pr_train)

0.68962722852512159

In [99]:
# Accuracy of the logistic model on the held-out test split.
pr_test = logistic.predict(data_test)
test_accuracy = sklearn.metrics.accuracy_score(labels_test, pr_test)
test_accuracy

0.68502202643171806

In [87]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [101]:
# Decision tree fitted on the *biased* training set, i.e. without any row in
# which feature 0 is present.
model = DecisionTreeClassifier().fit(data_train_biased, labels_train_biased)

In [102]:
# Predict on the unmodified test split, which still contains rows with
# feature 0 present.
pr = model.predict(data_test)

In [103]:
# Test accuracy of the predictions currently held in `pr`.
sklearn.metrics.accuracy_score(labels_test, pr)

0.70154185022026427

In [95]:
# Importance of each feature column for `model`, in the order of `features`.
# NOTE(review): this cell's execution count (95) is lower than that of the
# cell fitting the decision tree (101), so the stored output may belong to
# whatever `model` was bound to earlier - re-run to confirm.
model.feature_importances_

array([ 0.10982887,  0.07874834,  0.08669554,  0.13483167,  0.37637308,
        0.01144786,  0.02688415,  0.02047467,  0.02662459,  0.0951942 ,
        0.01331497,  0.01958205])

In [91]:
# Random forest with 100 trees fitted on the full (unbiased) training set.
# The result is bound to `forest`, the name the following cells read, so a
# fresh Restart & Run All no longer fails with a NameError and the decision
# tree held in `model` is not overwritten.
forest = RandomForestClassifier(n_estimators=100).fit(data_train, labels_train)

In [92]:
# Predict on the test split with `forest` (expected to be the fitted random
# forest); this replaces the decision-tree predictions previously in `pr`.
pr = forest.predict(data_test)

In [93]:
# Test accuracy of the predictions currently held in `pr`.
sklearn.metrics.accuracy_score(labels_test, pr)

0.69823788546255505

In [94]:
# Importance of each feature column for `forest`, in the order of `features`.
forest.feature_importances_

array([ 0.13200448,  0.07973197,  0.10685761,  0.12877877,  0.34086688,
        0.01155334,  0.02839986,  0.01912826,  0.02813924,  0.09281263,
        0.01436989,  0.01735707])