### Import Libraries

In [1]:
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import re
import math

### Fuctions to extract info from CORE webpage

In [34]:
# this function for the getting information from the web page

def get_paperinfo(paper_url):

    #download the page
    session = HTMLSession()
    response=session.get(paper_url)

  # check successful response
    if response.status_code != 200:
        print('Status code:', response.status_code)
        raise Exception('Failed to fetch web page ')

  #parse using beautiful soup
    paper_doc = BeautifulSoup(response.text,'html.parser')

    return paper_doc


def core_attributes(doc):
    
    '''
    recieves a doc file from the get_paperinfo function
    and returns title, ref and abstract from the CORE webpage
    '''
    
    # text contains title and references
    text = doc.main.find_all('h3')
    # ab contains the abstracts
    ab = doc.find_all("div", itemprop="abstract")
    
    papers = []
  
    for i, (t,a) in enumerate(zip(text,ab)):
        title = t.get_text()
        ref = t.a.get('href')
        abstract = a.get_text()
        
        # check if the article does not have an abstract
        if abstract == '':
            print(f'Article{i}: {title} has an empty abstract')
        
        # else append to a list
        else:
            papers.append([title, ref, abstract])

    return papers


### Generation of pandas df with stored paper data

In [45]:
# Getting information from CORE
gene = 'rab39b'
doc = get_paperinfo(f'https://core.ac.uk/search?q={gene}+AND+language%3A"en"&page=1')
n_output = doc.find("div", class_="styles_header__dGlUR").p.get_text()
#n_output = n_output.p.get_text()
number = re.findall('[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?', n_output)
number = float(number[0].replace(',', ''))
round_ = math.ceil(number/10)

#Create empty dataframe
df = pd.DataFrame(columns = ['title', 'ref', 'abstract'])

#Iterate over different pages on the CORE database to obtain titlt, ref and abstract and add to df
for l in tqdm(range(1,round_)):
    doc = get_paperinfo(f'https://core.ac.uk/search?q={gene}+AND+language%3A%22en%22&page={l}')
    papers = core_attributes(doc)
    dft = pd.DataFrame(papers, columns = ['title', 'ref', 'abstract'])
    df = pd.concat([df,dft])  
    
#df.to_csv('paper_info.csv')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for l in tqdm(range(1,round_)):


  0%|          | 0/12 [00:00<?, ?it/s]

Article6: Parkinson’s:A Disease of Aberrant Vesicle Trafficking has an empty abstract
Article3: Characterization of Parkinson's Disease-associated LRRK2 Kinase has an empty abstract
Article3: Genetic Heterogeneity of Autism Spectrum Disorders has an empty abstract
Article7: Pathogenic variants in movement disorders: modifiers, interactors and disease models has an empty abstract
Article0: Putative second hit rare genetic variants in families with seemingly GBA-associated Parkinson's disease has an empty abstract
Article2: The presence of two rare genomic syndromes, 1q21 deletion and Xq28 duplication, segregating independently in a family with intellectual disability has an empty abstract
Article7: Integrated Stress Response Activity Marks Stem Cells in Normal Hematopoiesis and Leukemia has an empty abstract
Article2: Comprehensive knockout analysis of the Rab family GTPases in epithelial cells has an empty abstract
Article4: Molecular and Cellular Mechanisms Affected in ALS has an empt

In [47]:
df.to_csv('paper_info.csv')