In [37]:
import pandas as pd
from pyquery import PyQuery as pq
import numpy as np
import re
import requests

from pattern.en import parse
from pattern.en import pprint
from pattern.vector import stem, PORTER, LEMMA
from sklearn.feature_extraction import text 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from gensim.models.ldamodel import LdaModel

import os
import json

# Classifiers
from sklearn.naive_bayes import MultinomialNB
import sklearn.linear_model as lm
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Text Extraction

In [2]:
###########
### Helper for text scrapping
###########

# Extract from the raw wikipedia page the text content
# of the paragraphs and the table of content as list of words
def get_text(data):
    # Extracting text content in paragraphs
    d_ = pq(data)
    d_p = pq(d_('#mw-content-text p'))
    paragraph = ''
    for r in d_p:
        paragraph += ' '+pq(r).text()

    # Extracing Table of Contents
    toc = []
    d_table = pq(d_('#toc li'))
    for c in d_table:
        t = pq(c).text()
        toc.append(' '.join(t.split(' ')[1:]))

    return paragraph, toc

# Return the title of the article, will be used to name the disease
def get_title(data):
    return pq(data)('#firstHeading').text()

# Check if string contains digit
def contains_digits(d):
    _digits = re.compile('\d')
    return bool(_digits.search(d))

# Return the common nouns in text (preprocessed) as a concatenated string
def get_words(thetext):
    stopwords=text.ENGLISH_STOP_WORDS
    punctuation = list('.,;:!?()[]{}`''\"@#$^&*+-|=~_')
    nouns = ''
    descriptives = ''
    proper = ''
    sentences = parse(thetext, tokenize=True, lemmata=True).split()
    for s in sentences:
        for token in s:
            if len(token[4]) > 1 and token[4] not in stopwords and token[4][0] not in punctuation and not contains_digits(token[4]):
                if token[1] in ['JJ', 'JJR', 'JJS']:
                    descriptives += ' '+ token[4]
                elif token[1] in ['NN', 'NNS']:
                    nouns += ' '+ token[4]
                elif token[1] in ['NNP', 'NNPS']:
                    proper += ' '+ token[4]
    return nouns, descriptives, proper


# Script to retrieve and process the training text from the raw data

In [None]:
%%time

# Script to retrieve the information from the training set
for class_ in ['positive', 'negative']:
    path = 'training/{}'.format(class_)
    # List of considered files name
    files = [f for f in os.listdir("./"+path)]
    # File not to be considered
    bug = 'all-urls.txt'
    if bug in files:
        files.remove(bug)

    # Initializiation
    nouns_data = []
    descriptives_data = []
    proper_data = []
    toc_data = []
    titles = []

    # Time indicator: 
    #     19.3s for 100 files in positive
    #     4.43s for 100 files in negative
    for i, filename in enumerate(files):
        if i%100 == 0:
            print '{} iterations executed'.format(i)
            print '{} elements in the list'.format(len(nouns_data))
        with open(path + '/' + filename) as f:
            data = f.read()
        # Titles
        t = get_title(data)
        titles.append(t)
        # Content
        paragraph, toc = get_text(data)
        n, d, p = get_words(paragraph)
        nouns_data.append(n)
        descriptives_data.append(d)
        proper_data.append(p)
        toc_data.append(toc)

    # Saving the outputs
    with open('data/filesname_{}.json'.format(class_), 'w') as f:
        json.dump(files, f)
    with open('data/noun_list_{}.json'.format(class_), 'w') as f:
        json.dump(nouns_data, f)
    with open('data/descriptives_list_{}.json'.format(class_), 'w') as f:
        json.dump(descriptives_data, f)
    with open('data/proper_list_{}.json'.format(class_), 'w') as f:
        json.dump(proper_data, f)
    with open('data/toc_{}.json'.format(class_), 'w') as f:
        json.dump(toc_data, f)
    with open('data/title_list_{}.json'.format(class_), 'w') as f:
        json.dump(titles, f)

0 iterations executed
0 elements in the list
100 iterations executed
100 elements in the list
200 iterations executed
200 elements in the list
300 iterations executed
300 elements in the list
400 iterations executed
400 elements in the list
500 iterations executed
500 elements in the list
600 iterations executed

# Classification

### Exploratory analysis of different classifiers

In [46]:
###########
### Helper for feature extraction and model learning
###########

# Train the lda model to extract feature in the list of nouns in noun list.
# Return the trained model (tuple (lda_ge, count_vect)) and the document_topics matrix from the list of nouns.
def train_lda_model(noun_list, num_topics):
    count_vect = CountVectorizer()
    X_counts = count_vect.fit_transform(noun_list)
    id2word = {k: v for v, k in count_vect.vocabulary_.iteritems()}
    corpus = build_corpus(X_counts)
    lda_ge = LdaModel(corpus, num_topics=num_topics, id2word=id2word)

    # Building the topics distribution for each document
    document_topics = np.zeros((len(corpus), num_topics))
    for i, bow in enumerate(corpus):
        dt = lda_ge.get_document_topics(bow)
        for t in dt:
            document_topics[i, t[0]] = t[1]

    return document_topics, lda_ge, count_vect


# Method to build the lda features for the document in the list of word
# counts vector X_counts based on a trained lda model.
def get_lda_features(X_counts, lda_ge):
    corpus = build_corpus(X_counts)
    infer = lda_ge.inference(corpus)[0]
    # Need to normalize the gamma to have the topic distribution
    document_topics = infer / np.sum(infer)

    return document_topics


# Build the corpus matrix in the required format to apply the lda
def build_corpus(X_counts):
    index = np.arange(X_counts.shape[1]).reshape((1, X_counts.shape[1]))
    corpus = []
    for i in xrange(X_counts.shape[0]):
        if (X_counts[i, :] > 0).sum():
            corpus.append([(w, c) for w, c in zip(index[np.where(
                X_counts[i, :].toarray())],
                np.array(X_counts[i, :][X_counts[i, :] > 0])[0])])
        else:
            corpus.append([])
    return corpus


# Function to build the occurence features and the pre-processing models from the list of nouns
def get_features(noun_list):
    # Building features
    count_vect = CountVectorizer()
    X_counts = count_vect.fit_transform(noun_list)
    tf_transformer = TfidfTransformer(use_idf=False).fit(X_counts)
    X_tf = tf_transformer.transform(X_counts)
    
    return X_tf, count_vect, tf_transformer
    

In [24]:
# Loading the pre_processed training data

# Nouns
with open('data/noun_list_negative.json', 'r') as f:
    list_negative1 = json.load(f)
with open('data/noun_list_positive.json', 'r') as f:
    list_positive1 = json.load(f)

# Descriptives
with open('data/descriptives_list_negative.json', 'r') as f:
    list_negative2 = json.load(f)
with open('data/descriptives_list_positive.json', 'r') as f:
    list_positive2 = json.load(f)

# Filesname
with open('data/filesname_positive.json', 'r') as f:
    files_positive = json.load(f)
with open('data/filesname_negative.json', 'r') as f:
    files_negatives = json.load(f)
    
num_negative = len(files_negatives)
num_positive = len(files_positive)

list_negative = [ u + v for u,v in zip(list_negative1, list_negative2)]
list_positive = [ u + v for u,v in zip(list_positive1, list_positive2)]

noun_list = list_negative + list_positive
filesname = files_negatives + files_positive

In [8]:
# Building the mask, ie splitting among train and test set

prop_train = 0.7

# Negative
mask_negative = np.zeros(num_negative, dtype=bool)
mask_negative[:int(prop_train * num_negative)] = True
np.random.shuffle(mask_negative)

# Positive
mask_positive = np.zeros(num_positive, dtype=bool)
mask_positive[:int(prop_train * (num_positive))] = True
np.random.shuffle(mask_positive)

mask = np.concatenate((mask_negative, mask_positive))
print(sum(mask))
print(int(prop_train * (num_negative + num_positive)))

9583
9583


In [10]:
%%time
# Stacked regression model

# Variables:
# noun_list, num_topics, mask
num_topics = 60

# Building the output column
Y = np.concatenate((np.zeros(num_negative), np.ones(num_positive)))

# Baseline
baseline_acc_train = accuracy_score(Y[np.where(mask)], np.zeros(sum(mask)), normalize=True, sample_weight=None)
baseline_acc_test = accuracy_score(Y[np.where(~mask)], np.zeros(sum(~mask)), normalize=True, sample_weight=None)
print('Baseline (All negative): score on train {}'.format(baseline_acc_train))
print('Baseline (All negative): score on test {}'.format(baseline_acc_test))

# LR on Frequency matrix
X_tf, count_vect_descriptive, tf_transformer_descriptive = get_features(noun_list)

lr_occ = lm.LogisticRegression(penalty='l2')
lr_occ.fit(X_tf[np.where(mask)], Y[np.where(mask)])
print 'LR occ: score on train {}'.format(lr_occ.score(X_tf[np.where(mask)], Y[np.where(mask)]))
print 'LR occ: score on test {}'.format(lr_occ.score(X_tf[np.where(~mask)], Y[np.where(~mask)]))

Y_pred_occ = np.zeros((len(Y), 1))
Y_pred_occ[mask, 0] = lr_occ.predict_proba(X_tf[np.where(mask)])[:, 0]
Y_pred_occ[~mask, 0] = lr_occ.predict_proba(X_tf[np.where(~mask)])[:, 0]

# LR on LDA features
document_topics, lda_ge, count_vect = train_lda_model(noun_list, num_topics)

lr_lda = lm.LogisticRegression(penalty='l2')
lr_lda.fit(document_topics[np.where(mask)], Y[np.where(mask)])
print 'LR lda: score on train {}'.format(lr_lda.score(document_topics[np.where(mask)], Y[np.where(mask)]))
print 'LR lda: score on test {}'.format(lr_lda.score(document_topics[np.where(~mask)], Y[np.where(~mask)]))

Y_pred_lda = np.zeros((len(Y), 1))
Y_pred_lda[mask, 0] = lr_lda.predict_proba(document_topics[np.where(mask)])[:, 0]
Y_pred_lda[~mask, 0] = lr_lda.predict_proba(document_topics[np.where(~mask)])[:, 0]

# LR on the result

X_lr = np.concatenate((Y_pred_occ, Y_pred_lda), axis=1)

est = lm.LogisticRegression(penalty='l2')
est.fit(X_lr[mask], Y[mask])
print 'stacked regression: score on train {}'.format(est.score(X_lr[np.where(mask)], Y[np.where(mask)]))
print 'stacked regression: score on test {}'.format(est.score(X_lr[np.where(~mask)], Y[np.where(~mask)]))




LR: score on train 0.989669205885
LR: score on test 0.984907497566
LR: score on train 0.972242512783
LR: score on test 0.97541382668
stacked regression: score on train 0.990190963164
stacked regression: score on test 0.985637779942
CPU times: user 1min 13s, sys: 1.85 s, total: 1min 15s
Wall time: 1min 18s


In [26]:
%%time
# Stacked SVM model

# Variables:
# noun_list, num_topics, mask
num_topics = 50

# Building the output column
Y = np.concatenate((np.zeros(num_negative), np.ones(num_positive)))

# Baseline
baseline_acc_train = accuracy_score(Y[np.where(mask)], np.zeros(sum(mask)), normalize=True, sample_weight=None)
baseline_acc_test = accuracy_score(Y[np.where(~mask)], np.zeros(sum(~mask)), normalize=True, sample_weight=None)
print('Baseline (All negative): score on train {}'.format(baseline_acc_train))
print('Baseline (All negative): score on test {}'.format(baseline_acc_test))

# SVM on Frequency matrix
X_tf, count_vect, tf_transformer = get_features(noun_list)

svm_occ = SVC(kernel = 'linear', probability=True)
svm_occ.fit(X_tf[np.where(mask)], Y[np.where(mask)])
print 'SVM occ: score on train {}'.format(svm_occ.score(X_tf[np.where(mask)], Y[np.where(mask)]))
print 'SVM occ: score on test {}'.format(svm_occ.score(X_tf[np.where(~mask)], Y[np.where(~mask)]))

Y_pred_occ = np.zeros((len(Y), 1))
Y_pred_occ[mask, 0] = svm_occ.predict_proba(X_tf[np.where(mask)])[:, 0]
Y_pred_occ[~mask, 0] = svm_occ.predict_proba(X_tf[np.where(~mask)])[:, 0]

# SVM on LDA features
document_topics, lda_ge, count_vect = train_lda_model(noun_list, num_topics)

svm_lda = SVC(kernel = 'linear', probability=True)
svm_lda.fit(document_topics[np.where(mask)], Y[np.where(mask)])
print 'SVM lda: score on train {}'.format(svm_lda.score(document_topics[np.where(mask)], Y[np.where(mask)]))
print 'SVM lda: score on test {}'.format(svm_lda.score(document_topics[np.where(~mask)], Y[np.where(~mask)]))

Y_pred_lda = np.zeros((len(Y), 1))
Y_pred_lda[mask, 0] = svm_lda.predict_proba(document_topics[np.where(mask)])[:, 0]
Y_pred_lda[~mask, 0] = svm_lda.predict_proba(document_topics[np.where(~mask)])[:, 0]

# SVM on the result

X_lr = np.concatenate((Y_pred_occ, Y_pred_lda), axis=1)

svm_stacked = SVC(kernel = 'linear', probability=True)
svm_stacked.fit(X_lr[mask], Y[mask])
print 'stacked svm: score on train {}'.format(svm_stacked.score(X_lr[np.where(mask)], Y[np.where(mask)]))
print 'stacked svm: score on test {}'.format(svm_stacked.score(X_lr[np.where(~mask)], Y[np.where(~mask)]))

SVM occ: score on train 0.998330376709
SVM occ: score on test 0.990506329114




SVM lda: score on train 0.976833976834
SVM lda: score on test 0.979795520935
stacked svm: score on train 0.998643431076
stacked svm: score on test 0.989776046738
CPU times: user 2min 11s, sys: 2.11 s, total: 2min 13s
Wall time: 2min 15s


In [614]:
# Set of classifiers tested

# Baseline
baseline_acc_train = accuracy_score(Y_train[np.where(mask)], np.zeros(sum(mask)), normalize=True, sample_weight=None)
baseline_acc_test = accuracy_score(Y_train[np.where(~mask)], np.zeros(sum(~mask)), normalize=True, sample_weight=None)
print('Baseline (All negative): score on train {}'.format(baseline_acc_train))
print('Baseline (All negative): score on test {}'.format(baseline_acc_test))

# Naive Bayes
nb = MultinomialNB()
nb.fit(X_train_tf[np.where(mask)], Y_train[np.where(mask)])
print 'NB: score on train {}'.format(nb.score(X_tf[np.where(mask)], Y_train[np.where(mask)]))
print 'NB: score on test {}'.format(nb.score(X_tf[np.where(~mask)], Y_train[np.where(~mask)]))

# Logistic Regression
lr = lm.LogisticRegression(penalty='l2')
lr.fit(X_train_tf[np.where(mask)], Y_train[np.where(mask)])
print 'LR: score on train {}'.format(lr.score(X_tf[np.where(mask)], Y_train[np.where(mask)]))
print 'LR: score on test {}'.format(lr.score(X_tf[np.where(~mask)], Y_train[np.where(~mask)]))

# SVM
svc = SVC(kernel = 'linear')
svc.fit(X_train_tf[np.where(mask)], Y_train[np.where(mask)])
print 'svm: score on train {}'.format(svc.score(X_tf[np.where(mask)], Y_train[np.where(mask)]))
print 'svm: score on test {}'.format(svc.score(X_tf[np.where(~mask)], Y_train[np.where(~mask)]))

Baseline (All negative): score on train 0.73046018992
Baseline (All negative): score on test 0.730282375852
NB: score on train 0.98737347386
NB: score on test 0.985637779942
LR: score on train 0.988730042784
LR: score on test 0.985150925024
svm: score on train 0.996556401962
svm: score on test 0.99123661149


In [33]:
# number of topics tuning
acc_test = []
# LR on LDA features
for num_topics in [20, 30, 40, 50, 60, 70]:
    document_topics, lda_ge, count_vect = train_lda_model(noun_list, num_topics)

    lr_lda_eval = SVC(kernel = 'linear')
    lr_lda_eval.fit(document_topics[np.where(mask)], Y[np.where(mask)])
    print 'LR: score on train {}'.format(lr_lda_eval.score(document_topics[np.where(mask)], Y[np.where(mask)]))
    sc = lr_lda_eval.score(document_topics[np.where(~mask)], Y[np.where(~mask)])
    print 'LR: score on test {} with {}'.format(sc, num_topics)
    acc_test.append(sc)



LR: score on train 0.951476573098
LR: score on test 0.95959104187 with 20




LR: score on train 0.970572889492
LR: score on test 0.972492697176 with 30




LR: score on train 0.97182510696
LR: score on test 0.968111002921 with 40




LR: score on train 0.966085776897
LR: score on test 0.974683544304 with 50




LR: score on train 0.97036418658
LR: score on test 0.974926971762 with 60




LR: score on train 0.969946780758
LR: score on test 0.967867575463 with 70


### Pipeline to build the selected classifier (called v1)

In [243]:
%%time
# Loading the pre_processed training data

# Nouns
with open('data/noun_list_negative.json', 'r') as f:
    list_negative1 = json.load(f)
with open('data/noun_list_positive.json', 'r') as f:
    list_positive1 = json.load(f)

# Descriptives
with open('data/descriptives_list_negative.json', 'r') as f:
    list_negative2 = json.load(f)
with open('data/descriptives_list_positive.json', 'r') as f:
    list_positive2 = json.load(f)

# Filesname
with open('data/filesname_positive.json', 'r') as f:
    files_positive = json.load(f)
with open('data/filesname_negative.json', 'r') as f:
    files_negatives = json.load(f)
    
num_negative = len(files_negatives)
num_positive = len(files_positive)

list_negative = [ u + v for u,v in zip(list_negative1, list_negative2)]
list_positive = [ u + v for u,v in zip(list_positive1, list_positive2)]

noun_list = list_negative + list_positive
filesname = files_negatives + files_positive

# Occurence SVM model training (on the whole dataset)

# Variables:
# noun_list, num_topics, mask
num_topics = 50

# Building the output column
Y = np.concatenate((np.zeros(num_negative), np.ones(num_positive)))

# Baseline
baseline_acc_train = accuracy_score(Y, np.zeros(Y.shape[0]), normalize=True, sample_weight=None)
print('Baseline (All negative): score on train {}'.format(baseline_acc_train))

# SVM on Frequency matrix
X_tf, count_vect, tf_transformer = get_features(noun_list)

svm_occ = SVC(kernel = 'linear', probability=True)
svm_occ.fit(X_tf, Y)
print 'SVM occ: score on train {}'.format(svm_occ.score(X_tf, Y))


Baseline (All negative): score on train 0.730406836608
SVM occ: score on train 0.997516616756
CPU times: user 1min 17s, sys: 894 ms, total: 1min 18s
Wall time: 1min 22s


In [244]:
# Saving the pre-trained occurence svm model
from sklearn.externals import joblib

# Saving models
joblib.dump(svm_occ, 'model_v1/svm_occ.pkl')
joblib.dump(count_vect, 'model_v1/count_vect.pkl')
joblib.dump(tf_transformer, 'model_v1/tf_transformer.pkl')

['model_v1/tf_transformer.pkl']

# Improving the classifier for drugs: expanding the training set

### Scrapping the training set expansion

In [166]:
# Loading the urls of drug pages
drugs_url = []

# List of drugs 1
source = 'https://en.wikipedia.org/wiki/Anesthetic'
t = requests.get(source)
d = pq(t.text)('#mw-content-text')

lis = d('.navbox-list li').remove('.new')
for e in lis:
    href = pq(e)('a').attr('href')
    if href is not None:
        drugs_url.append(href)

# List of drugs 2
source = 'https://en.wikipedia.org/wiki/List_of_antiviral_drugs'
t = requests.get(source)
d = pq(t.text)('#mw-content-text')

lis = d.remove('div#toc')('li').remove('.new')
for e in lis:
    href = pq(e)('a').attr('href')
    if href is not None:
        drugs_url.append(href)

# List of drugs 3
source = 'https://en.wikipedia.org/wiki/List_of_largest_selling_pharmaceutical_products'
t = requests.get(source)
d = pq(t.text)('#mw-content-text')
lis = d('table')('tr')

for tr in lis:
    href = pq(tr).children().eq(2)('a').attr('href')
    if href is not None:
        drugs_url.append(href)

# List of drugs 4
source = 'https://en.wikipedia.org/wiki/List_of_therapeutic_monoclonal_antibodies'
t = requests.get(source)
d = pq(t.text)('#mw-content-text')
lis = d('table')('tr')

for tr in lis:
    href = pq(tr).children().eq(0).remove('.new').remove('.reference')('a').attr('href')
    if href is not None:
        drugs_url.append(href)

In [225]:
# Removing the duplicates
files = list(set(drugs_url))

# Script to extract noun list from drug pages
wiki_url = 'https://en.wikipedia.org'
class_ = 'drugs'

# Initializiation
nouns_data = []
descriptives_data = []
proper_data = []
toc_data = []
titles = []

# Time indicator: 
#     19.3s for 100 files in positive
#     4.43s for 100 files in negative
for i, url in enumerate(files):
    if i%100 == 0:
        print '{} iterations executed'.format(i)
        print '{} elements in the list'.format(len(nouns_data))
    url_complete = wiki_url + url
    t = requests.get(url_complete)
    data = t.text
    # Titles
    title = get_title(data)
    titles.append(title)
    # Content
    paragraph, toc = get_text(data)
    n, d, p = get_words(paragraph)
    nouns_data.append(n)
    descriptives_data.append(d)
    proper_data.append(p)
    toc_data.append(toc)

# Saving the outputs
with open('data/filesname_{}.json'.format(class_), 'w') as f:
    json.dump(files, f)
with open('data/noun_list_{}.json'.format(class_), 'w') as f:
    json.dump(nouns_data, f)
with open('data/descriptives_list_{}.json'.format(class_), 'w') as f:
    json.dump(descriptives_data, f)
with open('data/proper_list_{}.json'.format(class_), 'w') as f:
    json.dump(proper_data, f)
with open('data/toc_{}.json'.format(class_), 'w') as f:
    json.dump(toc_data, f)
with open('data/title_list_{}.json'.format(class_), 'w') as f:
    json.dump(titles, f)


0 iterations executed
0 elements in the list
100 iterations executed
100 elements in the list
200 iterations executed
200 elements in the list
300 iterations executed
300 elements in the list
400 iterations executed
400 elements in the list
500 iterations executed
500 elements in the list
600 iterations executed
600 elements in the list
700 iterations executed
700 elements in the list


### Building the new model

#### VALIDATION

In [237]:
%%time

### Training a new model with negative training set = negative + drugs 
# with a test set (to validate the accuracy)

# Loading the pre_processed training data
# Nouns
with open('data/noun_list_negative.json', 'r') as f:
    list_negative11 = json.load(f)
with open('data/noun_list_drugs.json', 'r') as f:
    list_negative12 = json.load(f)
list_negative1 = list_negative11 + list_negative12
with open('data/noun_list_positive.json', 'r') as f:
    list_positive1 = json.load(f)

# Descriptives
with open('data/descriptives_list_negative.json', 'r') as f:
    list_negative21 = json.load(f)
with open('data/descriptives_list_drugs.json', 'r') as f:
    list_negative22 = json.load(f)
list_negative2 = list_negative21 + list_negative22
with open('data/descriptives_list_positive.json', 'r') as f:
    list_positive2 = json.load(f)

# Filesname
with open('data/filesname_negative.json', 'r') as f:
    files_negatives11 = json.load(f)
with open('data/filesname_drugs.json', 'r') as f:
    files_negatives12 = json.load(f)
files_negatives1 = files_negatives11 + files_negatives12
with open('data/filesname_positive.json', 'r') as f:
    files_positive = json.load(f)

    
num_negative = len(files_negatives1)
num_positive = len(files_positive)

list_negative = [ u + v for u,v in zip(list_negative1, list_negative2)]
list_positive = [ u + v for u,v in zip(list_positive1, list_positive2)]

noun_list = list_negative + list_positive
filesname = files_negatives + files_positive

# Building the mask, ie splitting among train and test set
prop_train = 0.7

# Negative
mask_negative = np.zeros(num_negative, dtype=bool)
mask_negative[:int(prop_train * num_negative)] = True
np.random.shuffle(mask_negative)

# Positive
mask_positive = np.zeros(num_positive, dtype=bool)
mask_positive[:int(prop_train * (num_positive))] = True
np.random.shuffle(mask_positive)

mask = np.concatenate((mask_negative, mask_positive))
print(sum(mask))
print(int(prop_train * (num_negative)) + int(prop_train * (num_positive)))

# SVM model

# Variables:
# noun_list, num_topics, mask
num_topics = 50

# Building the output column
Y = np.concatenate((np.zeros(num_negative), np.ones(num_positive)))

# Baseline
baseline_acc_train = accuracy_score(Y[np.where(mask)], np.zeros(sum(mask)), normalize=True, sample_weight=None)
baseline_acc_test = accuracy_score(Y[np.where(~mask)], np.zeros(sum(~mask)), normalize=True, sample_weight=None)
print('Baseline (All negative): score on train {}'.format(baseline_acc_train))
print('Baseline (All negative): score on test {}'.format(baseline_acc_test))

# SVM on Frequency matrix
X_tf, count_vect2, tf_transformer2 = get_features(noun_list)

svm_occ2 = SVC(kernel = 'linear', probability=True)
svm_occ2.fit(X_tf[np.where(mask)], Y[np.where(mask)])
print 'SVM occ: score on train {}'.format(svm_occ2.score(X_tf[np.where(mask)], Y[np.where(mask)]))
print 'SVM occ: score on test {}'.format(svm_occ2.score(X_tf[np.where(~mask)], Y[np.where(~mask)]))

10079
10079
Baseline (All negative): score on train 0.743724575851
Baseline (All negative): score on test 0.743577875492
SVM occ: score on train 0.99573370374
SVM occ: score on test 0.987040037028
CPU times: user 1min 13s, sys: 1.28 s, total: 1min 15s
Wall time: 1min 19s


In [236]:
# Check that the drug tested are not in the test set
for u,n in zip(urls, names):
    X_c = get_counts(u, count_vect2)
    print 'Prediction on {}: {}'.format(n, predict_occ(X_c, tf_transformer2, svm_occ2))

False

#### Building a final classifier (called v2)

In [238]:
%%time

### Training a new model with negative training set = negative + drugs 
# with a test set (to validate the accuracy)

# Loading the pre_processed training data
# Nouns
with open('data/noun_list_negative.json', 'r') as f:
    list_negative11 = json.load(f)
with open('data/noun_list_drugs.json', 'r') as f:
    list_negative12 = json.load(f)
list_negative1 = list_negative11 + list_negative12
with open('data/noun_list_positive.json', 'r') as f:
    list_positive1 = json.load(f)

# Descriptives
with open('data/descriptives_list_negative.json', 'r') as f:
    list_negative21 = json.load(f)
with open('data/descriptives_list_drugs.json', 'r') as f:
    list_negative22 = json.load(f)
list_negative2 = list_negative21 + list_negative22
with open('data/descriptives_list_positive.json', 'r') as f:
    list_positive2 = json.load(f)

# Filesname
with open('data/filesname_negative.json', 'r') as f:
    files_negatives11 = json.load(f)
with open('data/filesname_drugs.json', 'r') as f:
    files_negatives12 = json.load(f)
files_negatives1 = files_negatives11 + files_negatives12
with open('data/filesname_positive.json', 'r') as f:
    files_positive = json.load(f)

    
num_negative = len(files_negatives1)
num_positive = len(files_positive)

list_negative = [ u + v for u,v in zip(list_negative1, list_negative2)]
list_positive = [ u + v for u,v in zip(list_positive1, list_positive2)]

noun_list = list_negative + list_positive
filesname = files_negatives + files_positive

# SVM model

# Variables:
# noun_list, num_topics, mask
num_topics = 50

# Building the output column
Y = np.concatenate((np.zeros(num_negative), np.ones(num_positive)))

# Baseline
baseline_acc_train = accuracy_score(Y, np.zeros(Y.shape[0]), normalize=True, sample_weight=None)
print('Baseline (All negative): score on train {}'.format(baseline_acc_train))

# SVM on Frequency matrix
X_tf, count_vect2, tf_transformer2 = get_features(noun_list)

svm_occ2 = SVC(kernel = 'linear', probability=True)
svm_occ2.fit(X_tf, Y)
print 'SVM occ: score on train {}'.format(svm_occ2.score(X_tf, Y))

Baseline (All negative): score on train 0.743680555556
SVM occ: score on train 0.995972222222
CPU times: user 1min 54s, sys: 1.14 s, total: 1min 55s
Wall time: 1min 59s


In [242]:
# Saving the pre-trained occurence svm model
from sklearn.externals import joblib

# Saving models
joblib.dump(svm_occ2, 'model_v2/svm_occ.pkl')
joblib.dump(count_vect2, 'model_v2/count_vect.pkl')
joblib.dump(tf_transformer2, 'model_v2/tf_transformer.pkl')

['model_v2/tf_transformer.pkl']

# Prediction on new data

In [241]:
###########
### Helper for prediction
###########

# Predict probability response of the occurrences model
def predict_occ(X_counts, tf_transformer, svc):
    # Feature Extraction
    X_train_tf = tf_transformer.transform(X_counts)
    # Prediction
    y = svc.predict_proba(X_train_tf)[0][0]
    return y

# Predict probability response of the lda model
def predict_lda(X_counts, lda_ge, svc_lda):
    dt_new = get_lda_features(X_counts, lda_ge)
    y1 = svc_lda.predict_proba(dt_new)[0][0]
    
    return y1

# Retrieve the count matrix with the learned count_vect from the url
def get_counts(url, count_vect):
    t = requests.get(url)
    paragraph, toc = get_text(t.text)
    n, d, p = get_words(paragraph)
    X_counts = count_vect.transform([n + d])
    return X_counts

# Prediction with the stacked regression on new data from url
def predict_url_stacked(url, count_vect, tf_transformer, svc_occ, lda_ge, svc_lda, svc_stacked):
    X_counts = get_counts(url, count_vect)
    # Occurrence model
    y_occ = predict_occ(X_counts, tf_transformer, svc_occ)
    # Lda model
    y_lda = predict_lda(X_counts, lda_ge, svc_lda)
    # Stacked model
    y = svc_stacked.predict(np.array([[y_occ, y_lda]]))
    
    return y

# Edges cases

### Drug missclassification

In [28]:
# Defining the drugs
urls = ['https://en.wikipedia.org/wiki/Penicillin', 'https://en.wikipedia.org/wiki/Paracetamol',
        'https://en.wikipedia.org/wiki/L-DOPA', 'https://en.wikipedia.org/wiki/Erythromycin']
names = [u.split('/')[-1] for u in urls]

In [232]:
# Comparison of our 3 models on the drug classification task

for u,n in zip(urls, names):
    X_c = get_counts(u, count_vect)
    print 'Prediction on {}: {}'.format(n, predict_lda(X_c, lda_ge, svm_lda))
    print 'Prediction on {}: {}'.format(n, predict_occ(X_c, tf_transformer, svm_occ))
    y = predict_url_stacked(u, count_vect, tf_transformer, svm_occ, lda_ge, svm_lda, svm_stacked)
    print 'Prediction on {}: {}'.format(n, y)

Prediction on Penicillin: 0.110800922733
Prediction on Penicillin: 0.974746035987
Prediction on Penicillin: [ 0.]
Prediction on Paracetamol: 0.268033040857
Prediction on Paracetamol: 0.949935650229
Prediction on Paracetamol: [ 0.]
Prediction on L-DOPA: 0.134786096126
Prediction on L-DOPA: 0.968285484337
Prediction on L-DOPA: [ 0.]
Prediction on Erythromycin: 0.108924121663
Prediction on Erythromycin: 0.995330926366
Prediction on Erythromycin: [ 0.]


### Generic class of disease

In [577]:
with open('noun_list_negative.json', 'r') as f:
    nouns_list_negative = json.load(f)
with open('noun_list_positive2.json', 'r') as f:
    nouns_list_positive = json.load(f)

with open('filesname_positive.json', 'r') as f:
    files_positive = json.load(f)
with open('filesname_negative.json', 'r') as f:
    files_negatives = json.load(f)


In [597]:
generic_words = ['virus', 'infection', 'bacteria', 'mutation', 'genetic', 'disorders']
for i,f in enumerate(files_positive):
    for w in generic_words:
        if w in f:
            files_positive.remove(f)
            nouns_list_positive.remove(nouns_list_positive[i])
            break

In [605]:
num_negative = len(files_negatives)
num_positive = len(files_positive)

noun_list = nouns_list_negative + nouns_list_positive
filesname = files_negatives + files_positive

# Building the output column
Y_generic = np.concatenate((np.zeros(num_negative), np.ones(num_positive)))

X_tf_generic, count_vect_generic, tf_transformer_generic = get_features(noun_list, filesname)

In [606]:
# Splitting among train and test set
# First choice of 70 - 30
prop_train = 0.7

# Negative
mask_negative = np.zeros(num_negative, dtype=bool)
mask_negative[:int(prop_train * num_negative)] = True
np.random.shuffle(mask_negative)

# Positive
mask_positive = np.zeros(num_positive, dtype=bool)
mask_positive[:int(prop_train * (num_positive))] = True
np.random.shuffle(mask_positive)

mask = np.concatenate((mask_negative, mask_positive))
print(sum(mask))
print(int(prop_train * (num_negative + num_positive)))

9552
9552


In [615]:
# Comparaison of the full classifier and the one with page removed
y_generic = predict_url('https://en.wikipedia.org/wiki/Cancer', count_vect_generic, tf_transformer_generic, svc_generic)
y = predict_url('https://en.wikipedia.org/wiki/Cancer', count_vect, tf_transformer, svc)
print('Prediction with broad articles ', y)
print('Prediction without broad articles ', y_generic)

('Prediction with broad articles ', True)
('Prediction without broad articles ', True)


# Results

In [13]:
# Building a dataframe with the article name (not storing the features because in sparse format)

# Building the prediction vector

Y_pred = np.zeros(len(Y))
Y_pred[mask] = svm_stacked.predict(X_lr[np.where(mask)])
Y_pred[~mask] = svm_stacked.predict(X_lr[np.where(~mask)])

# Processing the titles
with open('title_listnegative.json', 'r') as f:
    titles_negatives = json.load(f)
with open('title_listpositive.json', 'r') as f:
    titles_positive = json.load(f)

titles = titles_negatives + titles_positive

df = pd.DataFrame(data={'True': Y, 'Prediction': Y_pred, 'Train':mask, 'Filename':filesname, 'Titles':titles})
df.Filename = df.Filename.astype(unicode)

In [14]:
# Missclassification: This is to try to understand the pages we missclassified.
# It seems that many negative ones look like desease one so the confusion is not surprising.
df[(df.Prediction != df.True)]

Unnamed: 0,Filename,Prediction,Titles,Train,True
172,1152,1,Rapid eye movement behavior disorder,False,0
391,135,1,LIN28,True,0
687,1616,1,Pre-B-cell leukemia homeobox,True,0
889,1799,1,Geriatric trauma,True,0
1094,1983,1,Occult pneumonia,False,0
1133,2017,1,Tranilast,False,0
1880,2690,1,Osteomyelitis of the jaws,True,0
1895,2703,1,List of aquarium diseases,True,0
2305,3072,1,Protozoan infection,True,0
2631,3366,1,Aphonia,True,0


In [259]:
###########
### Helper for information extraction
###########


def get_description_from_file(filename, path, kw_list):
    # identify if the class is negative or positive
    only_digit = re.compile('^[0-9]*$')
    if bool(only_digit.search(filename)):
        class_ = 'negative'
    else:
        class_ = 'positive'

    with open(path + class_ + '/' + filename) as f:
        data = f.read()
    return get_information(t.text, kw_list)


def get_description_from_url(url, kw_list):
    t = requests.get(url)
    title = get_title(t.text)
    return title, get_information(t.text, kw_list)
   

# Return sentences containing specific key-words
def get_information(data, kw_list):
    # Extracting Content
    paragraph, toc = get_text(data)
    # Result
    kw_to_info = {kw: u'' for kw in kw_list}

    # Removing the possible link
    sp_clean = [t for t in paragraph.split() if t not in ['[', ']'] and not contains_digits(t)]
    paragraph_clean = ' '.join(sp_clean)

    sentences = paragraph_clean.split('. ')
    for s in sentences:
        # Adding the content to corresponding kw if present
        # (by default the first one is chosen)
        for kw in kw_list:
            if kw in s:
                kw_to_info[kw] += s + u'. '
                break
    return kw_to_info


# Final function to classify a new entry given a pre-trained model and extract its information
def classify_from_url(url, count_vect, tf_transformer, svm_occ, kw_list):
    # Classification
    X_counts = get_counts(url, count_vect)
    y = predict_occ(X_counts, tf_transformer, svm_occ)
    res = bool(y<0.5)
    # Extracting information if needed
    if res:
        title, description = get_description_from_url(url, kw_list)
        return res, title, description
    else:
        return res, None, None


# Print in a pretty way the dictionary of disease description
def pretty_print_description(description):
    print u'Description of the disease: \n'
    for k, v in description.iteritems():
        if v:
            print(k.upper())
            print v + u' \n'

In [29]:
%%time
# Script to extract information for the prediction on the whole training set (for the entries predicted as a disease)
kw_list = ['symptom', 'cause', 'prognosis', 'prevention', 'treatment', 'drug', 'susceptibility', 'feature', 'frequent']
path = 'training/'

disease_filename = df[(df['Prediction'] == 1) & df['True']].Filename

# Extract disease name
df['disease'] = ''
df['disease'][(df.Prediction == True) | (df.True == True)] = df['Titles'][(df.Prediction == True) | (df.True == True)]

# Extract disease description
df['description'] = u''
df['description'][(df.Prediction == True) | (df.True == True)] = (df[(df.Prediction == True) | (df.True == True)]).apply(lambda row: get_description_from_file(row.Filename, path, kw_list), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


CPU times: user 38.3 s, sys: 1.38 s, total: 39.7 s
Wall time: 44.6 s


In [42]:
# Illustration on 5 well classified entries
df[(df.Prediction == True) & (df.True == True)].head(5)

Unnamed: 0,Filename,Prediction,Titles,Train,True,disease,description
10000,Aagenaes_syndrome,1,Aagenaes syndrome,False,1,Aagenaes syndrome,Aagenaes syndrome is a syndrome characterised...
10001,Aarskog%E2%80%93Ose%E2%80%93Pande_syndrome,1,SHORT syndrome,True,1,SHORT syndrome,
10002,Aarskog_syndrome,1,Aarskog–Scott syndrome,True,1,Aarskog–Scott syndrome,The syndrome is caused by mutation in a gene c...
10003,Aase%E2%80%93Smith_syndrome,1,Aase syndrome,True,1,Aase syndrome,The anemia is caused by underdevelopment of th...
10004,Aase_syndrome,1,Aase syndrome,True,1,Aase syndrome,The anemia is caused by underdevelopment of th...


In [264]:
%%time 
kw_list = ['symptom', 'cause', 'prognosis', 'prevention',
               'treatment', 'drug', 'susceptibility', 'diagnosis']
class_pred = {True: 'Disease', False: 'Non Disease'}

urls_test = ['https://en.wikipedia.org/wiki/Paracetamol', 'https://en.wikipedia.org/wiki/Aagenaes_syndrome',
        'https://en.wikipedia.org/wiki/Cancer', 'https://en.wikipedia.org/wiki/Diabetes_mellitus']
names_test = [u.split('/')[-1] for u in urls_test]

for n, url in zip(names_test, urls_test):
    y, title, description = classify_from_url(url, count_vect2, tf_transformer2, svm_occ2, kw_list)
    print u'Prediction on {}: {} \n'.format(n, class_pred[y])
    if title is not None:
        print u'Name of the disease: {}\n'.format(title)
        pretty_print_description(description)

Prediction on Paracetamol: Non Disease 

Prediction on Aagenaes_syndrome: Disease 

Name of the disease: Aagenaes syndrome

Description of the disease: 

CAUSE
Aagenaes syndrome is a syndrome characterised by congenital hypoplasia of lymph vessels , which causes lymphedema of the legs and recurrent cholestasis in infancy, and slow progress to hepatic cirrhosis and giant cell hepatitis with fibrosis of the portal tracts . The genetic cause is unknown, but it is autosomal recessively inherited and the gene is located to chromosome .  

Prediction on Cancer: Disease 

Name of the disease: Cancer

Description of the disease: 

DRUG
The term encompasses any of a large variety of different anticancer drugs, which are divided into broad categories such as alkylating agents and antimetabolites . However, radiation and radioactive drugs are normally avoided during pregnancy, especially if the fetal dose might exceed cGy. Also, when chemotherapy is being given after birth, many of the drugs pass