In [14]:
from __future__ import print_function

import logging
from optparse import OptionParser
import sys
from time import time

import requests, json
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

Retrieve Remote Abstracts using EuropePMC Rest Services 
Disease Name - Asthma Result Type - Core (to get metadata information containing abstracts,title,etc) Result Format - JSON



In [39]:
abstract_list = list()
title_list = list()
for x in range(1, 10):
   
    europePMC_url = "http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=diabetes&format=json&resulttype=core&pageSize=1000&page=%s"%x
    print(europePMC_url)
    r = requests.get(europePMC_url)
    data = json.loads(r.content)
    for result in data['resultList']['result']:
    
        abstract = result.get('abstractText',None)
        title = result.get('title',None)
        if abstract is not None:
            abstract_list.append(abstract)
            #if title is not None :
            title_list.append(title)



http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=diabetes&format=json&resulttype=core&pageSize=1000&page=1
http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=diabetes&format=json&resulttype=core&pageSize=1000&page=2
http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=diabetes&format=json&resulttype=core&pageSize=1000&page=3
http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=diabetes&format=json&resulttype=core&pageSize=1000&page=4
http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=diabetes&format=json&resulttype=core&pageSize=1000&page=5
http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=diabetes&format=json&resulttype=core&pageSize=1000&page=6
http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=diabetes&format=json&resulttype=core&pageSize=1000&page=7
http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=diabetes&format=json&resulttype=core&pageSize=1000&page=8
http://www.ebi.ac.uk/europepmc/webservices/rest/

In [40]:
      
print("%d abstracts" % len(abstract_list))
print("%d titles" % len(title_list))

8991 abstracts
8991 titles


Preprocess/Clean the abstract data using spaCy NLP
1) Tokenize the abstract data
2) Lemmatize the tokens
3) Clean the tokens by applying stoplists

In [41]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from spacy import en
import string
from spacy.en import English

In [42]:
parser = English()

# A custom stoplist
STOPLIST = set()
STOPLIST.update(en.STOPWORDS)
STOPLIST.update(["n't", "'s", "'m", "ca","p"])
STOPLIST.update(list(ENGLISH_STOP_WORDS))
# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...",  "'ve"] 
def tokenize_text(sample):

        # get the tokens using spaCy
        tokens = parser(sample)

        # lemmatize
        lemmas = []
        for tok in tokens:
            lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
            tokens = lemmas

        # stoplist the tokens
        tokens = [tok for tok in tokens if tok not in STOPLIST]

        # stoplist symbols
        tokens = [tok for tok in tokens if tok not in SYMBOLS]

        # remove large strings of whitespace
        while "" in tokens:
            tokens.remove("")
        while " " in tokens:
            tokens.remove(" ")
        while "\n" in tokens:
            tokens.remove("\n")
        while "\n\n" in tokens:
            tokens.remove("\n\n")
        #print tokens
        return tokens

Convert above training corpus to sparse vectors using a TF-IDF sparse vectorizer and extract relevant features
1) No. of features = 100000
2) Remove Ascii Characters from Training Corpus 

In [43]:
t0 = time()
vectorizer = TfidfVectorizer(max_df=0.5, max_features=100000,
                                 min_df=2,tokenizer=tokenize_text, stop_words='english',
                                 use_idf=True,strip_accents =  'ascii')
X = vectorizer.fit_transform(abstract_list)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)

done in 150.545696s
n_samples: 8991, n_features: 21369


Do the Actual Clustering using Scikit-Learn, KMeans Algo

In [44]:
km = KMeans(n_clusters=5, init='k-means++', max_iter=100, n_init=1,
                verbose=False)
print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

Clustering sparse data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=5, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=False)
done in 1.747s



Print the top terms per cluster

In [45]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(5):
    print("Cluster %d:" % i, end='')
    print("Label %d:" %km.labels_[i],end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Cluster 0:Label 1: insulin glucose hba1c 2 treatment hypoglycemia control therapy level group
Cluster 1:Label 2: diabetic cell rat mouse expression induce effect protein group increase
Cluster 2:Label 0: diabetic disease 2 group risk control level high mellitus associate
Cluster 3:Label 4: 95 ci risk woman gdm year pregnancy association associate high
Cluster 4:Label 4: care self health intervention management use control cost 2 group


In [46]:
import pandas as pd
clusters = km.labels_.tolist()
print("%d abstracts" % len(abstract_list))
abstract_data = { 'title': title_list, 'abstracts': abstract_list, 'cluster': clusters }

frame = pd.DataFrame(abstract_data, index = [clusters] , columns = [ 'title', 'cluster'])
frame['cluster'].value_counts() #number of terms per cluster (clusters from 0 to 4)

order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

for i in range(5):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :10]: #replace 6 with n words per cluster
         print(' %s' % terms[ind], end='')
    print() #add whitespace
    print() #add whitespace
    
    
    print("Cluster %d titles:" % i, end='')
    for title in frame.ix[i]['title'].values.tolist():
        print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace

8991 abstracts
Cluster 0 words: insulin glucose hba1c 2 treatment hypoglycemia control therapy level group

Cluster 0 titles: The global intellectual property ecosystem for insulin and its public health implications: an observational study., Diabetes, glycaemia, and cognition-a secondary analysis of the Finnish Diabetes Prevention Study., Insulin Use in Pregnancy: An Update., Clinical effects of liraglutide are possibly influenced by hypertriglyceridemia and remaining pancreatic β-cell function in subjects with type 2 diabetes mellitus., Medication-induced diabetes mellitus., Heterogeneity of Pre-diabetes and Type 2 Diabetes: Implications for Prediction, Prevention and Treatment Responsiveness., Should There be Concern About Autoimmune Diabetes in Adults? Current Evidence and Controversies., Hypoglycaemia in adults with insulin-treated diabetes in the UK: self-reported frequency and effects., The role of Insulin Pump Therapy in Type 2 Diabetes Mellitus., Sleep Apnea in Type 2 Diabetes.

Clustering using Mini Batch KMeans

In [47]:
km = MiniBatchKMeans(n_clusters=5, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000, verbose=False)
print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

Clustering sparse data with MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',
        init_size=1000, max_iter=100, max_no_improvement=10, n_clusters=5,
        n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=False)
done in 0.162s



In [48]:

for i in range(5):
    print("Cluster %d:" % i, end='')
    j=0
   
    for ind in order_centroids[i, :10]:
        if j==0:
            print("Label %s:" % terms[ind],end='')
        j = j+1
        print(' %s' % terms[ind], end='')
    print()

Cluster 0:Label insulin: insulin glucose hba1c 2 treatment hypoglycemia control therapy level group
Cluster 1:Label diabetic: diabetic cell rat mouse expression induce effect protein group increase
Cluster 2:Label diabetic: diabetic disease 2 group risk control level high mellitus associate
Cluster 3:Label 95: 95 ci risk woman gdm year pregnancy association associate high
Cluster 4:Label care: care self health intervention management use control cost 2 group


In [49]:
import pandas as pd
clusters = km.labels_.tolist()
abstract_data = { 'title': title_list, 'abstracts': abstract_list, 'cluster': clusters }

frame = pd.DataFrame(abstract_data, index = [clusters] , columns = [ 'title', 'cluster'])
frame['cluster'].value_counts() #number of terms per cluster (clusters from 0 to 4)

order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

for i in range(5):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :10]: #replace 6 with n words per cluster
         print(' %s' % terms[ind], end='')
    print() #add whitespace
    print() #add whitespace
    
    
    print("Cluster %d titles:" % i, end='')
    for title in frame.ix[i]['title'].values.tolist():
        print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace


Cluster 0 words: insulin care self management health use 2 intervention treatment glucose

Cluster 0 titles: The global intellectual property ecosystem for insulin and its public health implications: an observational study., Economic contributions of pharmaceutical interventions by pharmacists: a retrospective report in Japan., Assessment of current prescribing practices using World Health Organization core drug use and complementary indicators in selected rural community pharmacies in Southern India., Implementation of Preconception Care for Women With Diabetes., Diabetes Care and Research: What Should Be the Next Frontier?, Treatment urgency: The importance of getting people with type 2 diabetes to target promptly., Effect of Sleep Disturbances on Quality of Life, Diabetes Self-Care Behavior, and Patient-Reported Outcomes., The Challenges of Providing Diabetes Education in Resource-Limited Settings to Women With Diabetes in Pregnancy: Perspectives of an Educator., Insulin Use in Preg

Bi-gram clustering???

In [None]:
t0 = time()
vectorizer = TfidfVectorizer(max_df=0.5, max_features=100000,
                                 min_df=2,tokenizer=tokenize_text, stop_words='english',
                                 use_idf=True,strip_accents =  'ascii',ngram_range=(1,2))
X = vectorizer.fit_transform(abstract_list)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
km = KMeans(n_clusters=5, init='k-means++', max_iter=100, n_init=1,
                verbose=False)
print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(5):
    print("Cluster %d:" % i, end='')
    print("Label %d:" %km.labels_[i],end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

In [None]:
import pandas as pd
clusters = km.labels_.tolist()
abstract_data = { 'title': title_list, 'abstracts': abstract_list, 'cluster': clusters }

frame = pd.DataFrame(abstract_data, index = [clusters] , columns = [ 'title', 'cluster'])
frame['cluster'].value_counts() #number of terms per cluster (clusters from 0 to 4)

order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

for i in range(5):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :10]: 
         print(' %s' % terms[ind], end='')
    print() #add whitespace
    print() #add whitespace
    
    
    print("Cluster %d titles:" % i, end='')
    for title in frame.ix[i]['title'].values.tolist():
        print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace

Calculate Relevance Score Of Documents using Euclidean distances

In [67]:
from sklearn.metrics.pairwise import euclidean_distances
import numpy as np

t0 = time()
vectorizer = TfidfVectorizer(max_df=0.5, max_features=100000,
                                 min_df=2,tokenizer=tokenize_text, stop_words='english',
                                 use_idf=True,strip_accents =  'ascii',ngram_range=(1,2))
X = vectorizer.fit_transform(abstract_list)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
km = KMeans(n_clusters=5, init='k-means++', max_iter=100, n_init=1,
                verbose=False)
print("Clustering sparse data with %s" % km)
t0 = time()
clusters = km.fit_predict(X)
print(clusters.shape)
print("done in %0.3fs" % (time() - t0))
print()


# get all documents in cluster 0
cluster_0 = np.where(clusters==0) 

# cluster_0 now contains all indices of the documents in this cluster
X_cluster_0 = X[cluster_0]

D = euclidean_distances(X_cluster_0.getrow(0), km.cluster_centers_[0]) 
print(D)
D = euclidean_distances(X_cluster_0.getrow(1), km.cluster_centers_[0]) 
print(D)
D = euclidean_distances(X_cluster_0.getrow(2), km.cluster_centers_[0]) 
print(D)
D = euclidean_distances(term, km.cluster_centers_[0]) 

done in 143.384277s
n_samples: 7460, n_features: 100000
Clustering sparse data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=5, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=False)
(7460,)
done in 1.354s

[[ 0.99212746]]
[[ 0.98951548]]
[[ 0.99121788]]


