In [1]:
from __future__ import print_function

import logging
from optparse import OptionParser
import sys
from time import time

import requests, json
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

from IPython.core.display import display, HTML

### Retrieve Remote Abstracts using EuropePMC Rest Services
*(This is a one-time activity and can be commented after abstracts are stored in local DB)*

* Disease Name - IBD (Inflammatory Bowel Disease)
* Result Type - Core (to get metadata information containing abstracts,title,etc)
* Result Format - JSON

Store the abstracts using **sqlite3** (lightweight disk-based database).


In [3]:
%%time
import sqlite3
conn = sqlite3.connect('abstract.db')
c = conn.cursor()
disease_name = "ibd"
temp_abstract_list = list()
# # Uncomment below to Create table
#c.execute('''CREATE TABLE abstract_data
#            (abstract text, disease text, title text, pmid text)''')

for x in range(1, 2):
   
    europePMC_url = "http://www.ebi.ac.uk/europepmc/webservices/rest/search?query={}&format=json&resulttype=core&pageSize=500&page={}".format(disease_name,x)
    print(europePMC_url)
    r = requests.get(europePMC_url)
    data = json.loads(r.content)
    for result in data['resultList']['result']:
    
        abstract = result.get('abstractText',None)
        title = result.get('title',None)
        pmid = result.get('pmid',None)
    
        if abstract is not None:
            temp_abstract_list.append(abstract)
            
            #c.execute("INSERT INTO abstract_data VALUES (?,?,?,?)",(abstract,disease_name,title,pmid))  
    #conn.commit()


http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=ibd&format=json&resulttype=core&pageSize=500&page=1
CPU times: user 126 ms, sys: 45.2 ms, total: 171 ms
Wall time: 28.6 s


Retrieve abstracts from local DB

In [4]:
disease_name = ("ibd",)

c.execute("SELECT abstract FROM abstract_data WHERE disease= ?", disease_name)
abstract_list = c.fetchall()
# db returns a list of tuples, convert to list of strings as shown below
abstract_list = [i for (i,) in abstract_list]
print("%d abstracts from local db" % len(abstract_list))


18255 abstracts from local db


Preprocess/Clean the abstract data using **spaCy NLP**
1. Tokenize the abstract data
2. Lemmatize the tokens
3. Clean the tokens by applying stoplists
4. *Deal with **target/disease synonymns** here???*

In [5]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from spacy import en
import string
from spacy.en import English

In [6]:
parser = English()

# A custom stoplist
STOPLIST = set()
STOPLIST.update(en.STOPWORDS)
STOPLIST.update(["n't", "'s", "'m", "ca","p", "t"])
STOPLIST.update(list(ENGLISH_STOP_WORDS))
# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...",  "'ve"] 
def tokenize_text(sample):

        # get the tokens using spaCy
        tokens = parser(sample)

        # lemmatize
        lemmas = []
        for tok in tokens:
            lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
            tokens = lemmas

        # stoplist the tokens
        tokens = [tok for tok in tokens if tok not in STOPLIST]

        # stoplist symbols
        tokens = [tok for tok in tokens if tok not in SYMBOLS]

        # remove large strings of whitespace
        while "" in tokens:
            tokens.remove("")
        while " " in tokens:
            tokens.remove(" ")
        while "\n" in tokens:
            tokens.remove("\n")
        while "\n\n" in tokens:
            tokens.remove("\n\n")
        #print tokens
        return tokens

Convert above training corpus to *sparse vectors* using a **TF-IDF** sparse vectorizer and extract relevant features

 Normalizing and weighting with diminishing importance tokens that occur in the majority of samples / documents.
1. No. of features = 100000
2. Remove Ascii Characters from Training Corpus 

In [11]:
%%time

vectorizer = TfidfVectorizer(max_df=0.5, max_features=800000,
                                 min_df=2,tokenizer=tokenize_text, stop_words='english',
                                 use_idf=True,strip_accents =  'ascii')
X = vectorizer.fit_transform(abstract_list)

print("n_samples: %d, n_features: %d" % X.shape)

n_samples: 18255, n_features: 29905
CPU times: user 5min 7s, sys: 815 ms, total: 5min 7s
Wall time: 5min 8s


Do the Actual **Clustering** using **Scikit-Learn, KMeans Algo**

In [12]:
%%time

km = KMeans(n_clusters=5, init='k-means++', max_iter=100, n_init=1,
                verbose=False)
print("Clustering sparse data with %s" % km)
km.fit(X)
print()

Clustering sparse data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=5, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=False)

CPU times: user 7.02 s, sys: 268 ms, total: 7.29 s
Wall time: 2.13 s


~~Persist the training model using joblib.dump 'pickling'~~

In [13]:
#from sklearn.externals import joblib
# save the model
#joblib.dump(km,  'diabetes_doc_cluster.pkl')

# loading from the pickle
#km = joblib.load('diabetes_doc_cluster.pkl')

Print the top terms per cluster and display in foamtree format

In [14]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

cluster_data_JSON = []
for i in range(5):
    j=0
    print("Cluster %d:" % i, end='')
    cluster_dict = {}
    cluster_dict["label"] = str(km.labels_[i])
    groups_list = []
    for ind in order_centroids[i, :10]:
        if j==0:
            print("Label %s:" % terms[ind],end='')
            cluster_dict["label"] = terms[ind]
        j = j+1
        print(' %s' % terms[ind], end=',')
        group_dict = {}
        group_dict["label"] = str(terms[ind])
        groups_list.append(group_dict)
    cluster_dict["groups"] = groups_list  
    cluster_data_JSON.append(cluster_dict)
    print()

Cluster 0:Label risk: risk, treatment, use, therapy, clinical, ibd patient, group, year, diagnosis, case,
Cluster 1:Label cell: cell, mouse, expression, colitis, induce, intestinal, inflammation, cytokine, colon, ds,
Cluster 2:Label microbiota: microbiota, immune, gut, intestinal, cell, role, host, inflammation, human, response,
Cluster 3:Label genetic: genetic, population, gene, snp, locus, genome, variant, association, analysis, region,
Cluster 4:Label cd: cd, uc, control, disease cd, cd patient, colitis uc, crohn, crohn disease, uc patient, colitis,


In [15]:
display(HTML("""
<!DOCTYPE html>
<html>
  <head>
    <title>FoamTree Quick Start</title>
    <meta charset="utf-8" />
  </head>

  <body>
    <div id="visualization_kmeans" style="width: 950px; height: 600px"></div>

    <script src="carrotsearch.foamtree.js"></script>
    <script>
      function init() {
        var foamtree = new CarrotSearchFoamTree({
          id: "visualization_kmeans",
          dataObject: {
            groups:"""
             + 
             json.dumps(cluster_data_JSON)
             +
             """
          }
        });
      }
        
    init();
    </script>
  </body>
 """))


Clustering using **Mini Batch KMeans**

In [21]:
%%time
import warnings
km = MiniBatchKMeans(n_clusters=5, init='k-means++', n_init=1,
                         init_size=1000, batch_size=10000, verbose=False)
print("Clustering sparse data with %s" % km)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    km.fit(X)
print()

Clustering sparse data with MiniBatchKMeans(batch_size=10000, compute_labels=True, init='k-means++',
        init_size=1000, max_iter=100, max_no_improvement=10, n_clusters=5,
        n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=False)

CPU times: user 1.83 s, sys: 214 ms, total: 2.05 s
Wall time: 2.08 s


**FoamTree Display** of clusters

In [22]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

cluster_data_JSON = []
for i in range(5):
    j=0
    print("Cluster %d:" % i, end='')
    cluster_dict = {}
    cluster_dict["label"] = str(km.labels_[i])
    groups_list = []
    for ind in order_centroids[i, :10]:
        if j==0:
            print("Label %s:" % terms[ind],end='')
            cluster_dict["label"] = terms[ind]
        j = j+1
        print(' %s' % terms[ind], end=',')
        group_dict = {}
        group_dict["label"] = str(terms[ind])
        groups_list.append(group_dict)
    cluster_dict["groups"] = groups_list  
    cluster_data_JSON.append(cluster_dict)
    print()

Cluster 0:Label cell: cell, mouse, intestinal, expression, induce, colitis, inflammation, immune, cytokine, epithelial,
Cluster 1:Label child: child, care, use, ibs, symptom, quality, group, ibd patient, iron, health,
Cluster 2:Label genetic: genetic, gene, population, variant, genome, locus, snp, association, analysis, region,
Cluster 3:Label therapy: therapy, treatment, drug, anti, clinical, microbiota, use, review, cancer, gut,
Cluster 4:Label cd: cd, uc, 95, ci, risk, 95 ci, control, year, crohn, colitis,


Javascript code for displaying cluster data(Mini-Batch KMeans ) in foamtree format. 

In [23]:
display(HTML("""
<!DOCTYPE html>
<html>
  <head>
    <title>FoamTree Quick Start</title>
    <meta charset="utf-8" />
  </head>

  <body>
    <div id="visualization" style="width: 950px; height: 600px"></div>

    <script src="carrotsearch.foamtree.js"></script>
    <script>
      function init() {
        var foamtree = new CarrotSearchFoamTree({
          id: "visualization",
          dataObject: {
            groups:"""
             + 
             json.dumps(cluster_data_JSON)
             +
             """
          }
        });
      }
        
    init();
    </script>
  </body>
 """))


### Bi-gram clustering

In text classification, Unigrams are single words, **Bigrams are two related words (appear frequently next to each       other in text)**, and Trigram is just the next extension of that concept.Often considering Bigrams in a                 classification algorithm tends to really **boost performance**, since the increased long-tail specificity of the word     means that the classifier can easily determine which class has a higher probability, leading to better                 classifications. **Trigrams do not have offer the same boost as Bigrams**, but they are worth considering and could be     essential for certain types of classifiers. You could also go beyond Trigrams if you felt that the classification     problem requires it.The important thing to remember here is to apply the same logic for eliminating low quality       bigrams and trigrams as you would with unigrams.

In [11]:
%%time

vectorizer = TfidfVectorizer(max_df=0.5, max_features=800000,
                                 min_df=2,tokenizer=tokenize_text, stop_words='english',
                                 use_idf=True,strip_accents =  'ascii',ngram_range=(1,2))
X = vectorizer.fit_transform(abstract_list)

print("n_samples: %d, n_features: %d" % X.shape)
km = KMeans(n_clusters=5, init='k-means++', max_iter=100, n_init=1,
                verbose=False)
print("Clustering sparse data with %s" % km)
km.fit(X)
print()


n_samples: 18255, n_features: 261649
Clustering sparse data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=5, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=False)

CPU times: user 6min 14s, sys: 5.8 s, total: 6min 20s
Wall time: 6min 8s


In [26]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

bigram_cluster_data_JSON = []
for i in range(5):
    j=0
    print("Cluster %d:" % i, end='')
    cluster_dict = {}
    cluster_dict["label"] = str(km.labels_[i])
    groups_list = []
    for ind in order_centroids[i, :10]:
        if j==0:
            print("Label %s:" % terms[ind],end='')
            cluster_dict["label"] = terms[ind]
        j = j+1
        print(' %s' % terms[ind], end=',')
        group_dict = {}
        group_dict["label"] = str(terms[ind])
        groups_list.append(group_dict)
    cluster_dict["groups"] = groups_list  
    bigram_cluster_data_JSON.append(cluster_dict)
    print()

Cluster 0:Label cell: cell, mouse, intestinal, expression, immune, inflammation, induce, colitis, cytokine, response,
Cluster 1:Label genetic: genetic, population, gene, snp, locus, genome, variant, association, analysis, region,
Cluster 2:Label therapy: therapy, tnf, anti, anti tnf, treatment, infliximab, remission, drug, clinical, pregnancy,
Cluster 3:Label cd: cd, uc, 95, ci, 95 ci, risk, control, year, crohn, crohn disease,
Cluster 4:Label use: use, treatment, clinical, risk, group, review, colitis, cancer, disease ibd, diagnosis,


In [49]:
display(HTML("""
<!DOCTYPE html>
<html>
  

  <body>
    <div id="visualization1" style="width: 950px; height: 600px"></div>

    <script src="carrotsearch.foamtree.js"></script>
    <script>
      function init() {
        var foamtree1 = new CarrotSearchFoamTree({
          id: "visualization1",
          dataObject: {
            groups:"""
             + 
             json.dumps(bigram_cluster_data_JSON)
             +
             """
          }
        });
      }
        
    init();
    </script>
  </body>
 """))


### Trigram Clustering example

In [7]:
%%time

vectorizer = TfidfVectorizer(max_df=0.5, max_features=800000,
                                 min_df=2,tokenizer=tokenize_text, stop_words='english',
                                 use_idf=True,strip_accents =  'ascii',ngram_range=(1,3))
X = vectorizer.fit_transform(abstract_list)

print("n_samples: %d, n_features: %d" % X.shape)
km = KMeans(n_clusters=5, init='k-means++', max_iter=100, n_init=1,
                verbose=False)
print("Clustering sparse data with %s" % km)
km.fit(X)
print()



n_samples: 18255, n_features: 395064
Clustering sparse data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=5, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=False)

CPU times: user 6min 31s, sys: 4.79 s, total: 6min 36s
Wall time: 6min 25s


In [9]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

trigram_cluster_data_JSON = []
for i in range(5):
    j=0
    print("Cluster %d:" % i, end='')
    cluster_dict = {}
    cluster_dict["label"] = str(km.labels_[i])
    groups_list = []
    for ind in order_centroids[i, :10]:
        if j==0:
            print("Label %s:" % terms[ind],end='')
            cluster_dict["label"] = terms[ind]
        j = j+1
        print(' %s' % terms[ind], end=',')
        group_dict = {}
        group_dict["label"] = str(terms[ind])
        groups_list.append(group_dict)
    cluster_dict["groups"] = groups_list  
    trigram_cluster_data_JSON.append(cluster_dict)
    print()


Cluster 0:Label cd: cd, uc, 95, ci, 95 ci, risk, control, year, crohn, crohn disease,
Cluster 1:Label cell: cell, mouse, intestinal, expression, inflammation, induce, immune, colitis, cytokine, response,
Cluster 2:Label use: use, treatment, clinical, risk, group, microbiota, review, diagnosis, disease ibd, cancer,
Cluster 3:Label therapy: therapy, tnf, anti, anti tnf, infliximab, treatment, remission, drug, clinical, iron,
Cluster 4:Label genetic: genetic, population, gene, snp, locus, genome, variant, association, analysis, region,


In [10]:
display(HTML("""
<!DOCTYPE html>
<html>
  

  <body>
    <div id="visualization2" style="width: 950px; height: 600px"></div>

    <script src="carrotsearch.foamtree.js"></script>
    <script>
      function init() {
        var foamtree1 = new CarrotSearchFoamTree({
          id: "visualization2",
          dataObject: {
            groups:"""
             + 
             json.dumps(trigram_cluster_data_JSON)
             +
             """
          }
        });
      }
        
    init();
    </script>
  </body>
 """))


In [None]:
# We can also close the connection if we are done with it.
# Just be sure any changes have been committed or they will be lost.
conn.close()