In [10]:
from __future__ import print_function

import logging
from optparse import OptionParser
import sys
from time import time

import requests, json
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

from IPython.core.display import display, HTML

Retrieve Remote Abstracts using EuropePMC Rest Services 
Disease Name - Diabetes
Result Type - Core (to get metadata information containing abstracts,title,etc)
Result Format - JSON



In [20]:
%%time

abstract_list = list()
for x in range(1, 10):
   
    europePMC_url = "http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=diabetes&format=json&resulttype=core&pageSize=1000&page=%s"%x
    print(europePMC_url)
    r = requests.get(europePMC_url)
    data = json.loads(r.content)
    for result in data['resultList']['result']:
    
        abstract = result.get('abstractText',None)
    
        if abstract is not None:
            abstract_list.append(abstract)
      


http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=diabetes&format=json&resulttype=core&pageSize=1000&page=1
http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=diabetes&format=json&resulttype=core&pageSize=1000&page=2
http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=diabetes&format=json&resulttype=core&pageSize=1000&page=3
http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=diabetes&format=json&resulttype=core&pageSize=1000&page=4
http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=diabetes&format=json&resulttype=core&pageSize=1000&page=5
http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=diabetes&format=json&resulttype=core&pageSize=1000&page=6
http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=diabetes&format=json&resulttype=core&pageSize=1000&page=7
http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=diabetes&format=json&resulttype=core&pageSize=1000&page=8
http://www.ebi.ac.uk/europepmc/webservices/rest/

Print Total number of abstracts retrieved

In [21]:
print("%d abstracts" % len(abstract_list))

7458 abstracts


Preprocess/Clean the abstract data using spaCy NLP
1) Tokenize the abstract data
2) Lemmatize the tokens
3) Clean the tokens by applying stoplists

In [22]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from spacy import en
import string
from spacy.en import English

In [23]:
parser = English()

# A custom stoplist
STOPLIST = set()
STOPLIST.update(en.STOPWORDS)
STOPLIST.update(["n't", "'s", "'m", "ca","p"])
STOPLIST.update(list(ENGLISH_STOP_WORDS))
# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...",  "'ve"] 
def tokenize_text(sample):

        # get the tokens using spaCy
        tokens = parser(sample)

        # lemmatize
        lemmas = []
        for tok in tokens:
            lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
            tokens = lemmas

        # stoplist the tokens
        tokens = [tok for tok in tokens if tok not in STOPLIST]

        # stoplist symbols
        tokens = [tok for tok in tokens if tok not in SYMBOLS]

        # remove large strings of whitespace
        while "" in tokens:
            tokens.remove("")
        while " " in tokens:
            tokens.remove(" ")
        while "\n" in tokens:
            tokens.remove("\n")
        while "\n\n" in tokens:
            tokens.remove("\n\n")
        #print tokens
        return tokens

Convert above training corpus to sparse vectors using a TF-IDF sparse vectorizer and extract relevant features
1) No. of features = 100000
2) Remove Ascii Characters from Training Corpus 

In [24]:
%%time

vectorizer = TfidfVectorizer(max_df=0.5, max_features=100000,
                                 min_df=2,tokenizer=tokenize_text, stop_words='english',
                                 use_idf=True,strip_accents =  'ascii')
X = vectorizer.fit_transform(abstract_list)

print("n_samples: %d, n_features: %d" % X.shape)

n_samples: 7458, n_features: 19369
CPU times: user 2min 12s, sys: 301 ms, total: 2min 13s
Wall time: 2min 13s


Do the Actual Clustering using Scikit-Learn, KMeans Algo

In [25]:
%%time

km = KMeans(n_clusters=5, init='k-means++', max_iter=100, n_init=1,
                verbose=False)
print("Clustering sparse data with %s" % km)
km.fit(X)
print()

Clustering sparse data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=5, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=False)

CPU times: user 1.35 s, sys: 23.6 ms, total: 1.37 s
Wall time: 1.37 s


Print the top terms per cluster

In [26]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(5):
    print("Cluster %d:" % i, end='')
    print("Label %d:" %km.labels_[i],end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Cluster 0:Label 3: diabetic rat group retinopathy induce level complication dr control increase
Cluster 1:Label 4: risk 95 ci woman gdm year high age prevalence associate
Cluster 2:Label 3: care self health intervention management use cost control adherence 2
Cluster 3:Label 3: insulin glucose 2 treatment level control hba1c t2dm disease group
Cluster 4:Label 2: cell mouse insulin islet expression -cell beta protein glucose induce


Clustering using Mini Batch KMeans

In [28]:
%%time

km = MiniBatchKMeans(n_clusters=5, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000, verbose=False)
print("Clustering sparse data with %s" % km)
km.fit(X)
print()

Clustering sparse data with MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',
        init_size=1000, max_iter=100, max_no_improvement=10, n_clusters=5,
        n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=False)


  0, n_samples - 1, init_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_sample


CPU times: user 226 ms, sys: 5.04 ms, total: 231 ms
Wall time: 228 ms


  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)


In [29]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(5):
    print("Cluster %d:" % i, end='')
    print("Label %d:" %km.labels_[i],end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Cluster 0:Label 2: metformin hba1c treatment insulin placebo trial inhibitor therapy 2 drug
Cluster 1:Label 2: diabetic cell mouse level rat group expression induce glucose increase
Cluster 2:Label 2: insulin care glucose gdm use 2 health control 1 management
Cluster 3:Label 2: risk 95 ci year age high prevalence associate factor 2
Cluster 4:Label 2: sleep quality osa apnea duration life 2 associate poor risk


Bi-gram clustering???

In [None]:
%%time

vectorizer = TfidfVectorizer(max_df=0.5, max_features=100000,
                                 min_df=2,tokenizer=tokenize_text, stop_words='english',
                                 use_idf=True,strip_accents =  'ascii',ngram_range=(1,2))
X = vectorizer.fit_transform(abstract_list)

print("n_samples: %d, n_features: %d" % X.shape)
km = KMeans(n_clusters=5, init='k-means++', max_iter=100, n_init=1,
                verbose=False)
print("Clustering sparse data with %s" % km)
km.fit(X)
print()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(5):
    print("Cluster %d:" % i, end='')
    print("Label %d:" %km.labels_[i],end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

In [11]:
dataJSON = [{ 'label': "Care", 'groups': [{ 'label':"self" },{ 'label':"health" },{ 'label':"intervention" },{ 'label':"adherence" }], 'weight': 1.0 },
            { 'label': "Ci", 'groups': [{ 'label':"95" },{ 'label':"risk" },{ 'label':"gdm" },{ 'label':"woman" },{ 'label':"gestational" },{ 'label':"pregnancy" },{ 'label':"hr" },{ 'label':"associate" }], 'weight': 1.0 },
            { 'label': "Cell", 'groups': [{ 'label':"mouse" },{ 'label':"rat" },{ 'label':"insulin" },{ 'label':"diabetic" },{ 'label':"expression" },{ 'label':"induce" },{ 'label':"protein" },{ 'label':"glucose" },{ 'label':"islet" }], 'weight': 1.0 },
            { 'label': "Risk", 'groups': [{ 'label':"diesase" },{ 'label':"diabetic" },{ 'label':"high" },{ 'label':"age" },{ 'label':"associate" },{ 'label':"factor" }],'weight': 1.0 },
            { 'label': "Insulin", 'groups': [{ 'label':"glucose" },{ 'label':"group" },{ 'label':"control" },{ 'label':"level" },{ 'label':"weight" },{ 'label':"treatment" }],'weight': 1.0 }
           ]
            

In [12]:
display(HTML("""
<!DOCTYPE html>
<html>
  <head>
    <title>FoamTree Quick Start</title>
    <meta charset="utf-8" />
  </head>

  <body>
    <div id="visualization" style="width: 950px; height: 600px"></div>

    <script src="carrotsearch.foamtree.js"></script>
    <script>
      function init() {
        var foamtree = new CarrotSearchFoamTree({
          id: "visualization",
          dataObject: {
            groups:"""
             + 
             json.dumps(dataJSON)
             +
             """
          }
        });
      }
        
    init();
    </script>
  </body>
 """))
