In [1]:
from Bio import Entrez
from Bio.Entrez import efetch
# must install biopython package
# pip install biopython

import xml.etree.cElementTree as ET
import codecs

import os
import codecs
import sys  
import string
import unicodedata
import xml.etree.cElementTree as ET
import codecs
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from Bio import Entrez
from Bio.Entrez import efetch


#from AutoQuery import GetQuery
#from GetAbstract import ProccessQuery
#from FeatureExtraction import TokenStemmer

In [2]:
# returns a list of citation IDs in ['IdList] using esearch utility
def search(query):
    Entrez.email = 'your.email@example.com'
    handle = Entrez.esearch(db='pubmed', 
                            sort='relevance', 
                            retmax='1000', # current limit is 200, could download more if done in blocks
                            retmode='xml', 
                            term=query)
    results = Entrez.read(handle)
    handle.close()
    return results



In [3]:
# uses the id_list to query pubmed using efetch utility
def fetch_abstract(id_list):
    ids = ','.join(id_list)
    handle = efetch(db='pubmed', id=ids, retmode='xml') ### this step returns xml object
    #results = Entrez.read(handle) #this step read xml to string
    #return results
    return handle

In [4]:
def GetQuery(term):
    #arbitrary search term for now (incorporate into front end?)
    results = search(term)
    id_list = results['IdList']
    papers = fetch_abstract(id_list)

    return papers ##returns a xml object, which is taken by the parser as input



In [5]:
##read file to the tree
def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()

##extract ArticleTitle, Abstract,Author information into a list of dictionary
def get_abstracts(root):
    collection = []
    for article in root.findall("./PubmedArticle/MedlineCitation/Article"):
        data = {
                "ArticleTitle": None,
                "Abstract": None, 
                "Author":None,
        } 
        
        
        ##check the exsitence of those tags for each item, pass if it's None
        check= article.find('./ArticleTitle')==None or article.findall('./Abstract/AbstractText')==None or article.find('./AuthorList/Author/LastName')==None or article.find('./AuthorList/Author/ForeName')==None            
        if check==True:
            pass
        else:
            
            
            ##extract title and author as string
            data["ArticleTitle"] = article.find('./ArticleTitle').text
            data["Author"] = article.find('./AuthorList/Author/LastName').text +' '+ article.find('./AuthorList/Author/ForeName').text
                      
            
            ##deal with multiple segments of AbstractText
            abstract=''
            abtext=article.findall('./Abstract/AbstractText')
            
            
            
            for text in abtext:
                if text.text:
                    abstract=abstract+' '+text.text
            data["Abstract"]=abstract
            
            #add the current entry to collection
            collection.append(data)
    
    return collection

    

##call the helper function, returns data as list of dictionary
def ProccessQuery(filename):
    root = get_root(filename)
    data = get_abstracts(root)
    return data
    

In [6]:
##create two stopword lists
def swansonStoplist():
    with codecs.open('test-query-to-feature/stopwords_swanson.txt','r', encoding='utf-8') as f:
        swanson = f.readlines()
    swanson = [x.strip() for x in swanson] 
    return swanson

def pubmedStoplist():    
    with codecs.open('test-query-to-feature/stopwords_pubmed.txt','r', encoding='utf-8') as f:
        pubmed = f.readlines()
    pubmed = [x.strip() for x in pubmed]     
    return pubmed





##tokenize and stem unicode string, returns a clean unicode string
def TokenStemmer(f):
                
    new_content=word_tokenize(f)
    
    #stemmer and stopwords
    stemmer = SnowballStemmer("english",ignore_stopwords=False)    
    swansonstopWords=stopwords.words('english')+swansonStoplist()
    pubmedstopWords=stopwords.words('english')+pubmedStoplist()
        

    ##iterate through content word list, skip pubmed stop words and do stemming  
    words=''
    for i in new_content:
        if i not in pubmedstopWords:
            new=stemmer.stem(i)
            words=words+' '+new
    return words

In [7]:
def fileparser(term):
    
    xml_object = GetQuery(term)
    sample=ProccessQuery(xml_object)

    feature_list=[]     
    for i in sample:
        if i['Abstract']!=None: #make sure not include missing/bad data point
            content=i['Abstract']
            #translate to string type for punctuation removal
            text_string = content.encode('utf-8').translate(None, string.punctuation)
            #back to unicode
            Utext_string=unicode(text_string, "utf-8")
            #call the TokenStemmer() from FeatureExtraction.py
            text = TokenStemmer(Utext_string)
            #change again back to ascii string type and split into word list
            #text=unicodedata.normalize('NFKD', text).encode('ascii','ignore')
            #change again back to ascii string type and split into word list
            features=text.encode('ascii','ignore')
            
            ##update value of 'Abstract' to the feature list
            i['Abstract']=features
            feature_list.append(i)
    return feature_list 
            
            
if __name__ == '__main__':
    sample_query1='((tumor) AND "Cell"[Journal])'
    sample_query2='immunotherapy'
    sample_query3='(HDAC) AND immunotherapy' 
    ablist=fileparser(sample_query1)

In [8]:
ablist

[{'Abstract': ' intratumor heterogen foster tumor evolut key challeng cancer medicin here review data technolog reveal intratumor heterogen cancer type dynam constraint conting inher tumor evolut we emphas macroevolutionari leap involv largescal chromosom alter drive tumor evolut metastasi consid role tumor microenviron engend heterogen drug resist we suggest bold approach drug develop har adapt properti immunemicroenviron limit tumor combin advanc clinic trialdesign improv patient outcom',
  'ArticleTitle': 'Clonal Heterogeneity and Tumor Evolution: Past, Present, and the Future.',
  'Author': 'McGranahan Nicholas'},
 {'Abstract': ' failur t cell protect cancer thought result lack antigen recognit chronic activ andor suppress cell use mous sarcoma model glucos consumpt tumor metabol restrict t cell lead dampen mtor activ glycolyt capac ifn product allow tumor progress we enhanc glycolysi antigen regressor tumor suffici overrid protect abil t cell control tumor growth we checkpoint blo

In [9]:
#https://stackoverflow.com/questions/42002859/creating-a-tf-idf-matrix-python-3-6

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [ab['Abstract'] for ab in ablist]


In [15]:
from sklearn.metrics.pairwise import cosine_similarity

def get_similarity_matrix(content_as_str):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(content_as_str) #fit the vectorizer to synopses
    similarity_matrix = cosine_similarity(tfidf_matrix)
    return (similarity_matrix, tfidf_matrix)

In [16]:
(similarity_matrix, tfidf_matrix) = get_similarity_matrix(corpus)


In [30]:

from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 8.19 s, sys: 186 ms, total: 8.38 s
Wall time: 4.49 s


In [31]:
from sklearn.externals import joblib

#uncomment the below to save your model 
#since I've already run my model I am loading from the pickle

joblib.dump(km,  'doc_cluster.pkl')

km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [52]:
cluster_list=[]
for a in range(len(clusters)):
    papers={}
    papers['title']=ablist[a]['ArticleTitle']
    papers['author']=ablist[a]['Author']
    papers['cluster']=clusters[a]
    cluster_list.append(papers)

In [53]:
cluster_list

[{'author': 'McGranahan Nicholas',
  'cluster': 0,
  'title': 'Clonal Heterogeneity and Tumor Evolution: Past, Present, and the Future.'},
 {'author': 'Chang Chih-Hao',
  'cluster': 4,
  'title': 'Metabolic Competition in the Tumor Microenvironment Is a Driver of Cancer Progression.'},
 {'author': 'Sturm Dominik',
  'cluster': 4,
  'title': 'New Brain Tumor Entities Emerge from Molecular Classification of CNS-PNETs.'},
 {'author': 'DiLillo David J',
  'cluster': 4,
  'title': 'Differential Fc-Receptor Engagement Drives an Anti-tumor Vaccinal Effect.'},
 {'author': 'Overacre-Delgoffe Abigail E',
  'cluster': 4,
  'title': u'Interferon-\u03b3 Drives T'},
 {'author': 'Zelenay Santiago',
  'cluster': 4,
  'title': 'Cyclooxygenase-Dependent Tumor Growth through Evasion of Immunity.'},
 {'author': 'Grabocka Elda',
  'cluster': 1,
  'title': 'Mutant KRAS Enhances Tumor Cell Fitness by Upregulating Stress Granules.'},
 {'author': 'Boice Michael',
  'cluster': 1,
  'title': 'Loss of the HVEM Tu

In [55]:
frame=pd.DataFrame(cluster_list)

In [56]:
frame['cluster'].value_counts() 

1    426
4    269
2    118
0     99
3     75
Name: cluster, dtype: int64

In [66]:
from __future__ import print_function

print("Top terms per cluster:")

#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 


for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print() #add whitespace
    print() #add whitespace
    
    print("Cluster %d titles:" % i, end='')
    for title in frame.ix[i]['title'].values.tolist():
        print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace
    
print()
print()


Top terms per cluster:
Cluster 0 words:

NameError: name 'vocab_frame' is not defined

In [35]:
len(ablist)

987

In [36]:
len(clusters)

987

In [25]:

from sklearn.manifold import MDS
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
import numpy as np


def get_cluster_kmeans(tfidf_matrix, num_clusters):
    km = KMeans(n_clusters = num_clusters)
    km.fit(tfidf_matrix)
    cluster_list = km.labels_.tolist()
    return cluster_list

def pca_reduction(similarity_matrix, n_components):
    one_min_sim = 1 - similarity_matrix
    pca = PCA(n_components=10)
    pos = pca.fit_transform(one_min_sim)
    x_pos, y_pos = pos[:, 0], pos[:, 1]
    return (x_pos, y_pos)
km_clusters = get_cluster_kmeans(tfidf_matrix, 5)  # KMeans
x_pos, y_pos = pca_reduction(similarity_matrix, 10)

In [28]:
km_clusters

[3,
 4,
 3,
 2,
 4,
 3,
 0,
 2,
 4,
 2,
 3,
 4,
 4,
 3,
 2,
 4,
 2,
 4,
 4,
 4,
 3,
 3,
 3,
 3,
 4,
 1,
 2,
 3,
 2,
 1,
 4,
 3,
 3,
 4,
 4,
 3,
 4,
 4,
 4,
 1,
 4,
 3,
 0,
 2,
 2,
 4,
 2,
 1,
 3,
 3,
 4,
 4,
 2,
 3,
 4,
 4,
 4,
 0,
 3,
 2,
 2,
 3,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 2,
 4,
 2,
 4,
 4,
 3,
 2,
 2,
 3,
 3,
 4,
 3,
 1,
 3,
 4,
 4,
 2,
 2,
 4,
 4,
 3,
 3,
 4,
 4,
 0,
 3,
 1,
 3,
 3,
 4,
 3,
 0,
 3,
 4,
 3,
 2,
 2,
 4,
 2,
 4,
 4,
 4,
 2,
 4,
 2,
 3,
 2,
 3,
 4,
 4,
 3,
 4,
 3,
 4,
 4,
 3,
 3,
 4,
 2,
 2,
 1,
 4,
 4,
 3,
 4,
 4,
 4,
 3,
 3,
 3,
 4,
 2,
 2,
 4,
 2,
 2,
 0,
 2,
 4,
 3,
 2,
 4,
 3,
 2,
 2,
 3,
 3,
 3,
 3,
 2,
 4,
 4,
 3,
 2,
 2,
 3,
 2,
 0,
 2,
 1,
 2,
 3,
 4,
 1,
 3,
 4,
 3,
 3,
 4,
 4,
 4,
 2,
 1,
 4,
 2,
 2,
 4,
 4,
 4,
 4,
 4,
 0,
 3,
 2,
 4,
 2,
 0,
 2,
 3,
 3,
 4,
 1,
 3,
 2,
 2,
 3,
 3,
 3,
 3,
 0,
 2,
 3,
 4,
 3,
 2,
 3,
 3,
 3,
 2,
 2,
 0,
 3,
 4,
 1,
 4,
 3,
 3,
 2,
 3,
 2,
 3,
 4,
 2,
 4,
 4,
 3,
 3,
 3,
 3,
 4,
 4,
 2,
 4,
 3,
 4,
 4,
 4,
 3,
 4,
 4,
 3,


In [26]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from scipy.cluster.hierarchy import ward, dendrogram


def scatter_clusters(x_pos, y_pos, clusters, titles):
    cluster_colors = {0: '#cc0000',
                      1: '#006600',
                      2: '#002699',
                      3: '#ffff33',
                      4: '#ffa64d',
                      5: '#000000'}
    # As many as items
    cluster_names = {0: '',
                 1: '',  
                 2: '', 
                 3: '',
                 4: '',
                 5: ''}
                 
    df = pd.DataFrame(dict(x= x_pos, y= y_pos, label= clusters, title= titles)) 
    groups = df.groupby('label')
    fig, ax = plt.subplots(figsize=(17, 9))  # Set size
    ax.set_axis_bgcolor('#e6f7ff')
    # Iterate through groups to layer the plot
    for name, group in groups:
        ax.plot(group.x, group.y, marker='D', linestyle='solid', ms=15, 
                label=cluster_names[name], color=cluster_colors[name], mec='black')
        ax.set_aspect('auto')
        ax.tick_params(axis= 'x', which='both', labelbottom='off')
        ax.tick_params(axis= 'y', which='both', labelleft='off')
    ax.legend(numpoints=1)

    for i in range(len(df)):
        ax.text(df.ix[i]['x'], df.ix[i]['y'], df.ix[i]['title'], size= 15)  
    plt.show() # Show the plot


In [27]:
scatter_clusters(x_pos, y_pos, km_clusters, authors) # Scatter K-means with PCA

NameError: name 'authors' is not defined