In [41]:
from __future__ import print_function

import logging
from optparse import OptionParser
import sys
from time import time

import requests, json
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

from IPython.core.display import display, HTML

Retrieve Remote Abstracts using EuropePMC Rest Services (This is a one-time activity and can be commented after abstracts are stored in local DB)
Disease Name - Diabetes
Result Type - Core (to get metadata information containing abstracts,title,etc)
Result Format - JSON



In [43]:
%%time

temp_abstract_list = list()
for x in range(1, 8):
   
    europePMC_url = "http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=asthma&format=json&resulttype=core&pageSize=1000&page=%s"%x
    print(europePMC_url)
    r = requests.get(europePMC_url)
    data = json.loads(r.content)
    for result in data['resultList']['result']:
    
        abstract = result.get('abstractText',None)
    
        if abstract is not None:
            temp_abstract_list.append(abstract)
      


http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=asthma&format=json&resulttype=core&pageSize=1000&page=1
http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=asthma&format=json&resulttype=core&pageSize=1000&page=2
http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=asthma&format=json&resulttype=core&pageSize=1000&page=3
http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=asthma&format=json&resulttype=core&pageSize=1000&page=4
http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=asthma&format=json&resulttype=core&pageSize=1000&page=5
http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=asthma&format=json&resulttype=core&pageSize=1000&page=6
http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=asthma&format=json&resulttype=core&pageSize=1000&page=7
CPU times: user 2.23 s, sys: 504 ms, total: 2.73 s
Wall time: 2min 34s


Print Total number of abstracts retrieved

In [93]:
print("%d abstracts" % len(temp_abstract_list))

6995 abstracts


Store the abstracts using sqlite3 (lightweight disk-based database) - ONE TIME ACTIVITY

In [94]:
import sqlite3
conn = sqlite3.connect('abstract.db')
c = conn.cursor()
disease_name = "asthma"

# Uncomment below to Create table
c.execute('''CREATE TABLE abstracts
             (abstract text, disease text)''')
# Uncomment below to insert abstracts specific to a disease to local db table
for abstract in temp_abstract_list:
     c.execute("INSERT INTO abstracts VALUES (?,?)",(abstract,disease_name))
# Save (commit) the changes
conn.commit()




Retrieve abstracts from local DB

In [95]:
disease_name = ("asthma",)

c.execute("SELECT abstract FROM abstracts WHERE disease= ?", disease_name)
abstract_list = c.fetchall()
# db returns a list of tuples, convert to list of strings as shown below
abstract_list = [i for (i,) in abstract_list]
print("%d abstracts from local db" % len(abstract_list))

6995 abstracts from local db


Preprocess/Clean the abstract data using spaCy NLP
1) Tokenize the abstract data
2) Lemmatize the tokens
3) Clean the tokens by applying stoplists

In [96]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from spacy import en
import string
from spacy.en import English

In [97]:
parser = English()

# A custom stoplist
STOPLIST = set()
STOPLIST.update(en.STOPWORDS)
STOPLIST.update(["n't", "'s", "'m", "ca","p", "t"])
STOPLIST.update(list(ENGLISH_STOP_WORDS))
# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...",  "'ve"] 
def tokenize_text(sample):

        # get the tokens using spaCy
        tokens = parser(sample)

        # lemmatize
        lemmas = []
        for tok in tokens:
            lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
            tokens = lemmas

        # stoplist the tokens
        tokens = [tok for tok in tokens if tok not in STOPLIST]

        # stoplist symbols
        tokens = [tok for tok in tokens if tok not in SYMBOLS]

        # remove large strings of whitespace
        while "" in tokens:
            tokens.remove("")
        while " " in tokens:
            tokens.remove(" ")
        while "\n" in tokens:
            tokens.remove("\n")
        while "\n\n" in tokens:
            tokens.remove("\n\n")
        #print tokens
        return tokens

Convert above training corpus to sparse vectors using a TF-IDF sparse vectorizer and extract relevant features
1) No. of features = 100000
2) Remove Ascii Characters from Training Corpus 

In [98]:
%%time

vectorizer = TfidfVectorizer(max_df=0.5, max_features=100000,
                                 min_df=2,tokenizer=tokenize_text, stop_words='english',
                                 use_idf=True,strip_accents =  'ascii')
X = vectorizer.fit_transform(abstract_list)

print("n_samples: %d, n_features: %d" % X.shape)

n_samples: 6995, n_features: 19534
CPU times: user 2min 9s, sys: 246 ms, total: 2min 10s
Wall time: 2min 10s


Do the Actual Clustering using Scikit-Learn, KMeans Algo

In [99]:
%%time

km = KMeans(n_clusters=5, init='k-means++', max_iter=100, n_init=1,
                verbose=False)
print("Clustering sparse data with %s" % km)
km.fit(X)
print()

Clustering sparse data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=5, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=False)

CPU times: user 3.03 s, sys: 63.5 ms, total: 3.1 s
Wall time: 844 ms


Persist the training model using joblib.dump 'pickling'

In [100]:
#from sklearn.externals import joblib
# save the model
#joblib.dump(km,  'diabetes_doc_cluster.pkl')

# loading from the pickle
#km = joblib.load('diabetes_doc_cluster.pkl')

Print the top terms per cluster

In [101]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(5):
    j=0
    print("Cluster %d:" % i, end='')
    
    for ind in order_centroids[i, :10]:
        if j==0:
            print("Label %s:" % terms[ind],end='')
        j=j+1
        print(' %s' % terms[ind], end=',')
    print()

Cluster 0:Label patient: patient, disease, group, treatment, control, clinical, case, symptom, level, high,
Cluster 1:Label copd: copd, patient, acos, disease, exacerbation, obstructive, pulmonary, chronic, lung, smoker,
Cluster 2:Label cell: cell, mouse, airway, expression, inflammation, lung, inflammatory, induce, t, response,
Cluster 3:Label child: child, 95, ci, risk, age, year, associate, association, exposure, prevalence,
Cluster 4:Label health: health, care, patient, intervention, cost, child, adherence, condition, management, disease,


Clustering using Mini Batch KMeans

In [105]:
%%time
import warnings
km = MiniBatchKMeans(n_clusters=5, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000, verbose=False)
print("Clustering sparse data with %s" % km)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    km.fit(X)
print()

Clustering sparse data with MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',
        init_size=1000, max_iter=100, max_no_improvement=10, n_clusters=5,
        n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=False)

CPU times: user 168 ms, sys: 2.06 ms, total: 170 ms
Wall time: 169 ms


FoamTree Display of clusters

In [106]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

cluster_data_JSON = []
for i in range(5):
    j=0
    print("Cluster %d:" % i, end='')
    cluster_dict = {}
    cluster_dict["label"] = str(km.labels_[i])
    groups_list = []
    for ind in order_centroids[i, :10]:
        if j==0:
            print("Label %s:" % terms[ind],end='')
            cluster_dict["label"] = terms[ind]
        j = j+1
        print(' %s' % terms[ind], end=',')
        group_dict = {}
        group_dict["label"] = str(terms[ind])
        groups_list.append(group_dict)
    cluster_dict["groups"] = groups_list  
    cluster_data_JSON.append(cluster_dict)
    print()

Cluster 0:Label cell: cell, airway, mouse, expression, inflammation, lung, t, inflammatory, induce, cytokine,
Cluster 1:Label child: child, health, care, intervention, year, age, patient, ed, visit, school,
Cluster 2:Label allergic: allergic, allergy, allergen, exposure, child, rhinitis, ige, food, risk, 95,
Cluster 3:Label disease: disease, patient, group, treatment, control, clinical, effect, level, case, gene,
Cluster 4:Label patient: patient, copd, ci, 95, disease, control, group, year, treatment, risk,


Javascript code for displaying cluster data(Mini-Batch KMeans ) in foamtree format. 

In [104]:
display(HTML("""
<!DOCTYPE html>
<html>
  <head>
    <title>FoamTree Quick Start</title>
    <meta charset="utf-8" />
  </head>

  <body>
    <div id="visualization" style="width: 950px; height: 600px"></div>

    <script src="carrotsearch.foamtree.js"></script>
    <script>
      function init() {
        var foamtree = new CarrotSearchFoamTree({
          id: "visualization",
          dataObject: {
            groups:"""
             + 
             json.dumps(cluster_data_JSON)
             +
             """
          }
        });
      }
        
    init();
    </script>
  </body>
 """))


Bi-gram clustering???

In [89]:
%%time

vectorizer = TfidfVectorizer(max_df=0.5, max_features=100000,
                                 min_df=2,tokenizer=tokenize_text, stop_words='english',
                                 use_idf=True,strip_accents =  'ascii',ngram_range=(1,2))
X = vectorizer.fit_transform(abstract_list)

print("n_samples: %d, n_features: %d" % X.shape)
km = KMeans(n_clusters=5, init='k-means++', max_iter=100, n_init=1,
                verbose=False)
print("Clustering sparse data with %s" % km)
km.fit(X)
print()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(5):
    j=0
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        if j==0:
            print("Label %s:" % terms[ind],end='')
        j = j+1
        print(' %s' % terms[ind], end=',')
    print()

n_samples: 13990, n_features: 100000
Clustering sparse data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=5, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=False)

Cluster 0:Label health: health, care, child, patient, intervention, year, cost, age, disease, condition,
Cluster 1:Label child: child, ci, 95, 95 ci, allergy, allergic, risk, food, exposure, association,
Cluster 2:Label patient: patient, group, treatment, disease, control, clinical, case, level, high, symptom,
Cluster 3:Label copd: copd, patient, acos, disease, asthma copd, obstructive, exacerbation, pulmonary, chronic, chronic obstructive,
Cluster 4:Label cell: cell, mouse, airway, expression, inflammation, lung, induce, inflammatory, t, response,
CPU times: user 4min 43s, sys: 917 ms, total: 4min 44s
Wall time: 4min 39s


In [None]:
# We can also close the connection if we are done with it.
# Just be sure any changes have been committed or they will be lost.
conn.close()