In [1]:
import pandas as pd

from bs4 import BeautifulSoup
import requests

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from string import ascii_lowercase

In [2]:
# for a particular disease
disease = 'asthma'
begin_date = '2018-01-22'
end_date = '2018-06-22'

url = 'https://wsearch.nlm.nih.gov/ws/query?db=healthTopics&term='+ disease+\
    '&rettype=brief'
url2 = 'https://www.ncbi.nlm.nih.gov/pmc/oai/' +\
      'oai.cgi?verb=ListRecords&from={}&until={}'\
      .format(begin_date, end_date) +\
      '&set=bmcbioc&metadataPrefix=pmc'
print (url)
print (url2)

https://wsearch.nlm.nih.gov/ws/query?db=healthTopics&term=asthma&rettype=brief
https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=ListRecords&from=2018-01-22&until=2018-06-22&set=bmcbioc&metadataPrefix=pmc


In [3]:
#do we have a good return status from the site
r = requests.get(url)
r2 = requests.get(url2)

print(r.status_code)
print(r2.status_code)

200
200


In [4]:
print(r.headers['Content-Type'])
print(r2.headers['Content-Type'])

text/xml; charset=UTF-8
text/xml


In [33]:
soup = BeautifulSoup(r.content, "xml")
soup2 = BeautifulSoup(r2.content, "lxml")

In [34]:
medline = soup.find_all("content", {"name" : "FullSummary"})
pmcrawdata = soup2.find_all("record")

In [35]:
pmc = []
for th in pmcrawdata:
    pmc.extend(th.find_all("abstract"))

In [36]:
def textCleaner(summaries):
    """ Takes in a list of bs4.element.tag datatype object are returns a pandas dataframe object """
    for i in xrange(len(summaries)):
        summaries[i] = summaries[i].text
        summaries[i] = BeautifulSoup(summaries[i],"lxml").text
        
    return pd.DataFrame(data=summaries,columns=['text'])

In [37]:
cleanmedline = textCleaner(medline)

In [38]:
cleanmedline.head()

Unnamed: 0,text
0,Asthma is a chronic disease that affects your ...
1,Asthma is a chronic disease that affects your ...
2,An allergy is a reaction by your immune system...
3,"Each spring, summer, and fall, trees, weeds, a..."
4,Molds are fungi that can be found both outdoor...


In [39]:
cleanedPMC = textCleaner(pmc)

In [40]:
#Stemmer
stemmer = SnowballStemmer('english')
def stemsList(l):
    """ Input : list ; Output : list of stemmed words """
    return list(map(lambda x: stemmer.stem(x), l))


#stopwords
stop = stopwords.words('english')
otherstop = set(['','\n'])
stop = set(stemsList(stop))
stop = stop | otherstop
def removeStopwords(s):
    return [w for w in s.split(' ') if w not in stop]

#cleantext
cleanedPMC['text'].replace('[!"#%\'()*+,./:;<=>?@\[\]^_`{|}~’”“′‘\\\]',' ',inplace=True,regex=True) # remove characters
cleanedPMC['text'].replace('\n',' ',inplace=True,regex=True) # remove characters
cleanedPMC['text'] = cleanedPMC['text'].map(lambda x: x.lower()) # lower case
cleanedPMC['list'] = cleanedPMC['text'].map(lambda s: stemsList(removeStopwords(s)))

#cleantext
cleanmedline['text'].replace('[!"#%\'()*+,./:;<=>?@\[\]^_`{|}~’”“′‘\\\]',' ',inplace=True,regex=True) # remove characters
cleanmedline['text'].replace('\n',' ',inplace=True,regex=True) # remove characters
cleanmedline['text'] = cleanmedline['text'].map(lambda x: x.lower()) # lower case
cleanmedline['list'] = cleanmedline['text'].map(lambda s: stemsList(removeStopwords(s)))

In [41]:
cleanedPMC.text[0]

u'background enzymes display high reactivity and selectivity under natural conditions  but may suffer from decreased efficiency in industrial applications  a strategy to address this limitation is to immobilize the enzyme  mesoporous silica materials offer unique properties as an immobilization support  such as high surface area and tunable pore size    results the performance of a commercially available feruloyl esterase  e-faeru  immobilized on mesoporous silica by physical adsorption was evaluated for its transesterification ability  we optimized the immobilization conditions by varying the support pore size  the immobilization buffer and its ph  maximum loading and maximum activity were achieved at different phs  4 0 and 6 0 respectively   selectivity  shown by the transesterification hydrolysis products molar ratio  varied more than 3-fold depending on the reaction buffer used and its ph  under all conditions studied  hydrolysis was the dominant activity of the enzyme  ph and wate

In [42]:
print(cleanedPMC.list[0])

[u'background', u'enzym', u'display', u'high', u'reactiv', u'select', u'natur', u'condit', u'may', u'suffer', u'decreas', u'effici', u'industri', u'applic', u'strategi', u'address', u'limit', u'immobil', u'enzym', u'mesopor', u'silica', u'materi', u'offer', u'uniqu', u'properti', u'immobil', u'support', u'high', u'surfac', u'area', u'tunabl', u'pore', u'size', u'result', u'perform', u'commerci', u'avail', u'feruloyl', u'esteras', u'e-faeru', u'immobil', u'mesopor', u'silica', u'physic', u'adsorpt', u'evalu', u'it', u'transesterif', u'abil', u'optim', u'immobil', u'condit', u'vari', u'support', u'pore', u'size', u'immobil', u'buffer', u'it', u'ph', u'maximum', u'load', u'maximum', u'activ', u'achiev', u'differ', u'phs', u'4', u'0', u'6', u'0', u'respect', u'select', u'shown', u'transesterif', u'hydrolysi', u'product', u'molar', u'ratio', u'vari', u'3-fold', u'depend', u'reaction', u'buffer', u'use', u'it', u'ph', u'condit', u'studi', u'hydrolysi', u'domin', u'activ', u'enzym', u'ph', u'

In [43]:
cleanmedline.text[0]

u'asthma is a chronic disease that affects your airways  your airways are tubes that carry air in and out of your lungs  if you have asthma  the inside walls of your airways become sore and swollen  that makes them very sensitive  and they may react strongly to things that you are allergic to or find irritating  when your airways react  they get narrower and your lungs get less air symptoms of asthma includewheezingcoughing  especially early in the morning or at nightchest tightnessshortness of breathnot all people who have asthma have these symptoms  having these symptoms doesn t always mean that you have asthma  your doctor will diagnose asthma based on lung function tests  your medical history  and a physical exam  you may also have allergy tests when your asthma symptoms become worse than usual  it s called an asthma attack  severe asthma attacks may require emergency care  and they can be fatal asthma is treated with two kinds of medicines  quick-relief medicines to stop asthma sy

In [45]:
print(cleanmedline.list[0])

[u'asthma', u'chronic', u'diseas', u'affect', u'airway', u'airway', u'tube', u'carri', u'air', u'lung', u'asthma', u'insid', u'wall', u'airway', u'becom', u'sore', u'swollen', u'make', u'veri', u'sensit', u'may', u'react', u'strong', u'thing', u'allerg', u'find', u'irrit', u'airway', u'react', u'get', u'narrow', u'lung', u'get', u'less', u'air', u'symptom', u'asthma', u'includewheezingcough', u'especi', u'earli', u'morn', u'nightchest', u'tightnessshort', u'breathnot', u'peopl', u'asthma', u'symptom', u'have', u'symptom', u'alway', u'mean', u'asthma', u'doctor', u'diagnos', u'asthma', u'base', u'lung', u'function', u'test', u'medic', u'histori', u'physic', u'exam', u'may', u'also', u'allergi', u'test', u'asthma', u'symptom', u'becom', u'wors', u'usual', u'call', u'asthma', u'attack', u'sever', u'asthma', u'attack', u'may', u'requir', u'emerg', u'care', u'fatal', u'asthma', u'treat', u'two', u'kind', u'medicin', u'quick-relief', u'medicin', u'stop', u'asthma', u'symptom', u'long-term', 