In [33]:
from pymed import PubMed

import datetime
import json
import pprint

In [2]:
def query_pubmed(search_term='longevity', max_results=2000):
    '''
    Query the PubMed database through the pymed API.

    Parameters
    ----------
    search_term : str
        Query string handed to ``PubMed.query`` unchanged.
    max_results : int
        Upper bound on the number of articles requested. Defaults to 2000,
        the value that used to be hard-coded here.

    Returns
    -------
    list
        One item per returned article, each being the result of calling
        ``toDict()`` on the article object.
    '''
    pubmed = PubMed(tool='MyTool', email='marko@delphikos.com')
    results = pubmed.query(search_term, max_results=max_results)

    # Materialise the results so callers get a plain list they can len()/index.
    return [article.toDict() for article in results]

In [5]:
# Query for dasatinib papers that mention safety-related terms.
# An earlier, narrower query ('dasatinib AND quercetin') was assigned to the
# same name and immediately overwritten, so it never reached PubMed; it is
# kept here only as a note of the alternative.
dq_search_term = 'dasatinib AND (side effect* OR adverse event* OR adverse effect* OR risk*)'
article_list = query_pubmed(dq_search_term)

In [6]:
# Number of article dicts collected by the query cell above.
len(article_list)

1093

In [20]:
def extract_data(article_list):
    '''
    Extract a flat, string-valued summary of each article in the search results.

    Parameters
    ----------
    article_list : list of dict
        Article dictionaries. Any of the keys ``pubmed_id``, ``title``,
        ``authors``, ``publication_date`` and ``keywords`` may be missing or
        set to ``None``; the corresponding output field is then ``''``.

    Returns
    -------
    dict
        ``{'articles': [...]}`` with one dict per input article holding:

        * ``pubmed_id`` - text before the first newline of the raw id
        * ``title`` - title with non-breaking spaces replaced by spaces
        * ``authors`` - ``'<initials>. <lastname>'`` entries joined by ``', '``
        * ``publication_date`` - ``YYYY-MM-DD`` for date objects, otherwise
          the raw value unchanged (``None`` becomes ``''``)
        * ``keywords`` - non-empty keywords joined by ``', '``
    '''
    articles = []
    for article in article_list:
        # `or ''` covers both a missing key and an explicit None value.
        raw_id = article.get('pubmed_id') or ''
        # The raw id may carry several newline-separated ids; keep the first.
        pubmed_id = raw_id.split('\n')[0]

        title = (article.get('title') or '').replace(u'\xa0', u' ')

        author_names = []
        for author in article.get('authors') or []:
            # .get() because an author entry may lack either key.
            name = ''
            if author.get('initials'):
                name += author['initials'] + '. '
            if author.get('lastname'):
                name += author['lastname']
            author_names.append(name)
        authors = ', '.join(author_names)

        publication_date = article.get('publication_date')
        if publication_date is None:
            publication_date = ''
        elif isinstance(publication_date, datetime.date):
            publication_date = publication_date.strftime('%Y-%m-%d')

        # Skip None/empty keywords, which would otherwise break the join.
        keywords = ', '.join(
            keyword for keyword in (article.get('keywords') or []) if keyword
        )

        articles.append({
            'pubmed_id': pubmed_id,
            'title': title,
            'authors': authors,
            'publication_date': publication_date,
            'keywords': keywords
        })

    return {'articles': articles}

In [21]:
# Summarise the fetched PubMed results into a {'articles': [...]} dict.
data = extract_data(article_list)

In [22]:
# Load the article summaries from disk; this rebinds `data`.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('../data/data.json', encoding='utf-8') as f:
    data = json.load(f)

In [35]:
# Per-article records stored under the 'articles' key of the loaded data.
papers = data['articles']

In [36]:
# Pretty-print the first record to inspect which fields it carries.
pprint.pprint(papers[0])

{'authors': 'RR. White, AY. Maslov, M. Lee, SE. Wilner, M. Levy, J. Vijg',
 'keywords': 'DNA damage, DSB repair, FOXO3a, aging, bleomycin, mutations',
 'publication_date': '2020-07-29',
 'pubmed_id': '32720744',
 'title': 'FOXO3a acts to suppress DNA double-strand break-induced mutations.'}


When the full text isn't available, try fetching the PDF through the SciHub API and extract the text from the fetched PDF.

In [40]:
from scihub import SciHub
import PyPDF2

# Client object used below to fetch a paper by its DOI URL.
sh = SciHub()

In [39]:
# Attempt to fetch the paper by DOI URL. In the saved run the logged output
# reports that the fetch failed with a request exception.
result = sh.fetch('https://doi.org/10.1111/acel.13184')

INFO:Sci-Hub:Failed to fetch pdf with identifier https://doi.org/10.1111/acel.13184 (resolved url None) due to request exception.


In [32]:
# Print the extracted text of every page of the locally saved PDF.
# The file is opened in a `with` block so the handle is closed once all pages
# have been read (it was previously left open for the life of the kernel).
# NOTE(review): the reader may pull page data from the stream lazily, so any
# further use of `pdfReader` should happen inside this block - confirm.
with open('../papers/10.1111@acel.13184.pdf', 'rb') as pdfFileObject:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObject)
    count = pdfReader.numPages
    for i in range(count):
        page = pdfReader.getPage(i)
        print(page.extractText())

Aging Cell. 2020;00:e13184.|1 of 6https://doi.org/10.1111/acel.13184wileyonlinelibrary.com/journal/acel|

Extensive evidence supports the notion that somatic genome al
-terations are fundamental to aging, not only giving rise to cancer 
but possibly also causing non-cancer, age-related degeneration and 

disease (Kennedy, Loeb, & Herr,
 2012; Vijg & Suh,
 2013). Indeed, 
one defining characteristic of aging is the accumulation of somatic 
mutations and DNA damaging lesions arising from endogenous or 

environmental agents (Dolle et
 al.,
 1997; Martincorena et
 al.,
 2015; 

Maslov et
 al.,
 2013). Moreover, we have recently shown that DNA 
double-strand breaks (DSBs) are capable of accelerating multi
-ple aging pathologies in otherwise normal, young mice (White 

et
 al.,
 2015).
Certain gene families, such as sirtuins and FoxOs, have been 
linked to longevity in model organisms by regulating multiple 
 



|



|


DOI: 10.1111/acel.13184  





1|

1|

1|

2|

2|


This is an open a