In [1]:
import pandas as pd

from bs4 import BeautifulSoup
import requests

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from string import ascii_lowercase

In [141]:
# for a particular disease
disease = 'asthma'
begin_date = '2018-01-22'
end_date = '2018-06-22'

url = 'https://wsearch.nlm.nih.gov/ws/query?db=healthTopics&term='+ disease+\
    '&rettype=brief'
url2 = 'https://www.ncbi.nlm.nih.gov/pmc/oai/' +\
      'oai.cgi?verb=ListRecords&from={}&until={}'\
      .format(begin_date, end_date) +\
      '&set=bmcbioc&metadataPrefix=pmc'
print (url)
print (url2)

https://wsearch.nlm.nih.gov/ws/query?db=healthTopics&term=asthma&rettype=brief
https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=ListRecords&from=2018-01-22&until=2018-06-22&set=bmcbioc&metadataPrefix=pmc


In [142]:
#do we have a good return status from the site
r = requests.get(url)
r2 = requests.get(url2)

print(r.status_code)
print(r2.status_code)

200
200


In [143]:
print(r.headers['Content-Type'])
print(r2.headers['Content-Type'])

text/xml; charset=UTF-8
text/xml


In [147]:
soup1 = BeautifulSoup(r.content, "xml")
soup2 = BeautifulSoup(r2.content, "lxml")

In [170]:
medline = soup.find_all("content", {"name" : "FullSummary"})
pmcrawdata = soup2.find_all("record")

In [171]:
pmc = []
for th in pmcrawdata:
    pmc.extend(th.find_all("abstract"))

In [172]:
def textCleaner(summaries):
    """ Takes in a list of bs4.element.tag datatype object are returns a pandas dataframe object """
    for i in xrange(len(summaries)):
        summaries[i] = summaries[i].text
        summaries[i] = BeautifulSoup(summaries[i],"lxml").text
        
    return pd.DataFrame(data=summaries,columns=['text'])

In [173]:
cleanmedline = textCleaner(medline)

In [174]:
cleanmedline.head()

Unnamed: 0,text
0,Asthma is a chronic disease that affects your ...
1,Asthma is a chronic disease that affects your ...
2,An allergy is a reaction by your immune system...
3,"Each spring, summer, and fall, trees, weeds, a..."
4,Molds are fungi that can be found both outdoor...


In [175]:
cleanedPMC = summaryCleaner(pmc)

In [181]:
cleaned_summaries = cleanedPMC
cleaned_summaries.append(cleanmedline,ignore_index='True')

Unnamed: 0,list,text
0,"[background\nenzym, display, high, reactiv, se...",background\nenzymes display high reactivity an...
1,"[background\nan, import, step, human, immunode...",background\nan important step in human immunod...
2,"[background\ncorynebacterium, urealyticum, pat...",background\ncorynebacterium urealyticum a pat...
3,,Asthma is a chronic disease that affects your ...
4,,Asthma is a chronic disease that affects your ...
5,,An allergy is a reaction by your immune system...
6,,"Each spring, summer, and fall, trees, weeds, a..."
7,,Molds are fungi that can be found both outdoor...
8,,Occupational health problems occur at work or ...
9,,"Ozone is a gas. It can be good or bad, dependi..."


In [182]:
#Stemmer
stemmer = SnowballStemmer('english')
def stemsList(l):
    """ Input : list ; Output : list of stemmed words """
    return list(map(lambda x: stemmer.stem(x), l))


#stopwords
stop = stopwords.words('english')
otherstop = set(['','\n'])
stop = set(stemsList(stop))
stop = stop | otherstop
def removeStopwords(s):
    return [w for w in s.split(' ') if w not in stop]

#cleantext
cleaned_summaries['text'].replace('[!"#%\'()*+,./:;<=>?@\[\]^_`{|}~’”“′‘\\\]',' ',inplace=True,regex=True) # remove characters
cleaned_summaries['text'].replace('\n',' ',inplace=True,regex=True) # remove characters
cleaned_summaries['text'] = cleaned_summaries['text'].map(lambda x: x.lower()) # lower case
cleaned_summaries['list'] = cleaned_summaries['text'].map(lambda s: stemsList(removeStopwords(s)))

In [183]:
cleaned_summaries.text[0]

u'background enzymes display high reactivity and selectivity under natural conditions  but may suffer from decreased efficiency in industrial applications  a strategy to address this limitation is to immobilize the enzyme  mesoporous silica materials offer unique properties as an immobilization support  such as high surface area and tunable pore size    results the performance of a commercially available feruloyl esterase  e-faeru  immobilized on mesoporous silica by physical adsorption was evaluated for its transesterification ability  we optimized the immobilization conditions by varying the support pore size  the immobilization buffer and its ph  maximum loading and maximum activity were achieved at different phs  4 0 and 6 0 respectively   selectivity  shown by the transesterification hydrolysis products molar ratio  varied more than 3-fold depending on the reaction buffer used and its ph  under all conditions studied  hydrolysis was the dominant activity of the enzyme  ph and wate

In [184]:
print(cleaned_summaries.list[0])

[u'background', u'enzym', u'display', u'high', u'reactiv', u'select', u'natur', u'condit', u'may', u'suffer', u'decreas', u'effici', u'industri', u'applic', u'strategi', u'address', u'limit', u'immobil', u'enzym', u'mesopor', u'silica', u'materi', u'offer', u'uniqu', u'properti', u'immobil', u'support', u'high', u'surfac', u'area', u'tunabl', u'pore', u'size', u'result', u'perform', u'commerci', u'avail', u'feruloyl', u'esteras', u'e-faeru', u'immobil', u'mesopor', u'silica', u'physic', u'adsorpt', u'evalu', u'it', u'transesterif', u'abil', u'optim', u'immobil', u'condit', u'vari', u'support', u'pore', u'size', u'immobil', u'buffer', u'it', u'ph', u'maximum', u'load', u'maximum', u'activ', u'achiev', u'differ', u'phs', u'4', u'0', u'6', u'0', u'respect', u'select', u'shown', u'transesterif', u'hydrolysi', u'product', u'molar', u'ratio', u'vari', u'3-fold', u'depend', u'reaction', u'buffer', u'use', u'it', u'ph', u'condit', u'studi', u'hydrolysi', u'domin', u'activ', u'enzym', u'ph', u'