In [1]:
from __future__ import print_function

import logging
import sys
from time import time

import requests, json
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer

from IPython.core.display import display, HTML

### Retrieve Remote Abstracts using EuropePMC Rest Services
*(This is a one-time activity and can be commented after abstracts are stored in local DB)*

* Disease Name - Diabetes
* Result Type - Core (to get metadata including abstracts, titles, etc.)
* Result Format - JSON


In [19]:
%%time

# Parallel lists: position i in each list refers to the same article.
abstract_list = list()
titles_list = list()
pmid_list = list()
for x in range(1, 2):  # range(1, 2) requests page 1 only

    europePMC_url = "http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=diabetes&format=json&resulttype=core&pageSize=100&page=%s"%x
    print(europePMC_url)
    r = requests.get(europePMC_url)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    r.raise_for_status()
    data = r.json()
    for result in data['resultList']['result']:

        abstract = result.get('abstractText', None)
        title = result.get('title', None)
        pmid = result.get('pmid', None)
        # Records without an abstract are skipped entirely so the three
        # lists stay aligned.
        if abstract is not None:
            abstract_list.append(abstract)
            # Store the title here (the abstract was previously appended by mistake).
            titles_list.append(title)
            pmid_list.append(pmid)

http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=diabetes&format=json&resulttype=core&pageSize=100&page=1
CPU times: user 127 ms, sys: 6.93 ms, total: 134 ms
Wall time: 2.18 s


In [20]:
# Report how many records were kept; the three counts should match because
# the lists are filled together.
count_lines = (
    "%d abstracts" % len(abstract_list),
    "%d titles" % len(titles_list),
    "%d pmids" % len(pmid_list),
)
print("\n".join(count_lines))

74 abstracts
74 titles
74 pmids


Preprocess/Clean the abstract data using **spaCy NLP**
1. Tokenize the abstract data
2. Lemmatize the tokens
3. Clean the tokens by applying stoplists

In [21]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from spacy import en
import string
from spacy.en import English

In [22]:
# spaCy English pipeline; tokenize_text calls it to split text into tokens.
parser = English()

# Custom stoplist: spaCy's stop words, scikit-learn's English stop words,
# plus a handful of short fragments.
STOPLIST = set(en.STOPWORDS)
STOPLIST |= {"n't", "'s", "'m", "ca", "p", "t"}
STOPLIST |= set(ENGLISH_STOP_WORDS)

# Symbols we don't care about: each punctuation character on its own,
# followed by a few multi-character tokens.
SYMBOLS = list(string.punctuation) + ["-----", "---", "...", "'ve"]
def tokenize_text(sample):
    """Tokenize, lemmatize and clean one piece of text.

    The text is parsed with the module-level spaCy ``parser``. Each token is
    replaced by its lower-cased, stripped lemma, except tokens whose lemma is
    the ``"-PRON-"`` placeholder, which keep their lower-cased surface form.
    Tokens found in ``STOPLIST`` or ``SYMBOLS``, and empty / whitespace-only
    tokens, are dropped.

    Args:
        sample: The text to tokenize.

    Returns:
        A list of cleaned token strings, in their original order.
    """
    # Lemmatize in one pass over the parsed tokens.
    lemmas = [
        tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_
        for tok in parser(sample)
    ]

    # Leftover whitespace tokens to discard.
    blanks = ("", " ", "\n", "\n\n")

    # Apply the stoplist, symbol and whitespace filters in a single pass
    # rather than repeatedly scanning and mutating the list.
    return [
        tok for tok in lemmas
        if tok not in STOPLIST and tok not in SYMBOLS and tok not in blanks
    ]

Convert the abstract corpus above to *sparse vectors* using a **TF-IDF** vectorizer and extract the relevant features
1. Maximum no. of features = 100000
2. Strip accents from the corpus by converting characters to their ASCII equivalents

In [23]:
%%time

# Build the TF-IDF vectorizer around the custom spaCy tokenizer.
vectorizer = TfidfVectorizer(
    tokenizer=tokenize_text,   # custom tokenizer/lemmatizer defined above
    stop_words='english',
    strip_accents='ascii',
    use_idf=True,
    min_df=2,
    max_df=0.5,
    max_features=100000,
)

# Learn the vocabulary from the abstracts and produce the document-term matrix.
X = vectorizer.fit_transform(abstract_list)

print("n_samples: %d, n_features: %d" % X.shape)


n_samples: 74, n_features: 965
CPU times: user 1.53 s, sys: 6.3 ms, total: 1.53 s
Wall time: 1.53 s


terms - **vocabulary**, a list of the features used in the *tf-idf matrix*.

In [24]:
# Feature names (the vocabulary) of the fitted vectorizer.
terms = vectorizer.get_feature_names()

dist : **1 - the cosine similarity between each pair of documents.**

  Cosine similarity is measured against the tf-idf matrix and can be used to generate a measure of similarity between each document and the other documents in the corpus. Subtracting it from 1 gives the cosine distance, which will be used for plotting on a Euclidean (2-dimensional) plane.
  Note that with dist it is possible to evaluate the similarity of any two or more abstracts.

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine distance between the rows of X: 1 - cosine similarity.
dist = 1 - cosine_similarity(X)

<function print>

## Hierarchical Clustering using the Ward Clustering Algorithm

  Ward clustering is an **agglomerative clustering method**, meaning that at each stage, the pair of clusters with the minimum between-cluster distance is merged.
  The precomputed cosine distance matrix, viz. dist, is used to calculate a **linkage_matrix**.
  
  >In case you encounter the following error after installing the matplotlib package:
  >RuntimeError: Python is not installed as a framework....
  
  >Solution: 
   After installing matplotlib with pip, there is a directory in your home folder called ~/.matplotlib.
   Create a file ~/.matplotlib/matplotlibrc there and add the following line: backend: TkAgg

In [26]:
from scipy.cluster.hierarchy import ward, dendrogram
from scipy.spatial.distance import squareform
import matplotlib.pyplot as plt


# `ward` treats a 2-D array as raw observation vectors, so the square distance
# matrix is converted to condensed (1-D) form to be used as precomputed
# distances. checks=False skips the symmetry / zero-diagonal validation, which
# floating-point noise in `1 - cosine_similarity` could otherwise trip.
# NOTE(review): Ward linkage is formally defined for Euclidean distances;
# confirm that cosine distance is acceptable for this analysis.
linkage_matrix = ward(squareform(dist, checks=False))

fig, ax = plt.subplots(figsize=(15, 20)) # set size
# Draw on the axes created above; one leaf per abstract, labelled by PMID.
dendrogram(linkage_matrix, orientation="right", labels=pmid_list, ax=ax);

fig.tight_layout() #show plot with tight layout
#plt.show()

# Save the figure to disk; comment out the line below to skip saving.
fig.savefig('ward_clusters.png', dpi=200) #save figure as ward_clusters

<function print>

In [27]:
# Close the current figure now that it has been saved.
plt.close()