### Import Libraries

In [1]:
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import re
import math

### Fuctions to extract info from CORE webpage

In [2]:
# this function for the getting information from the web page

def get_paperinfo(paper_url):

    #download the page
    session = HTMLSession()
    response=session.get(paper_url)

  # check successful response
    if response.status_code != 200:
        print('Status code:', response.status_code)
        raise Exception('Failed to fetch web page ')

  #parse using beautiful soup
    paper_doc = BeautifulSoup(response.text,'html.parser')

    return paper_doc


def core_attributes(doc):
    
    '''
    recieves a doc file from the get_paperinfo function
    and returns title, ref and abstract from the CORE webpage
    '''
    
    # text contains title and references
    text = doc.main.find_all('h3')
    # ab contains the abstracts
    ab = doc.find_all("div", itemprop="abstract")
    
    papers = []
  
    for i, (t,a) in enumerate(zip(text,ab)):
        title = t.get_text()
        ref = t.a.get('href')
        abstract = a.get_text()
        
        # check if the article does not have an abstract
        if abstract == '':
            print(f'Article{i}: {title} has an empty abstract')
        
        # else append to a list
        else:
            papers.append([title, ref, abstract])

    return papers


### Generation of pandas df with stored paper data

In [4]:
# Getting information from CORE
gene = 'rab39b'
doc = get_paperinfo(f'https://core.ac.uk/search?q={gene}+AND+language%3A"en"&page=1')
n_output = doc.find("div", class_="styles_header__dGlUR").p.get_text()
#n_output = n_output.p.get_text()
number = re.findall('[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?', n_output)
number = float(number[0].replace(',', ''))
round_ = math.ceil(number/10)

#Create empty dataframe
df = pd.DataFrame(columns = ['title', 'ref', 'abstract'])

#Iterate over different pages on the CORE database to obtain titlt, ref and abstract and add to df
for l in tqdm(range(1,round_)):
    doc = get_paperinfo(f'https://core.ac.uk/search?q={gene}+AND+language%3A%22en%22&page={l}')
    papers = core_attributes(doc)
    dft = pd.DataFrame(papers, columns = ['title', 'ref', 'abstract'])
    df = pd.concat([df,dft])  
    
#df.to_csv('paper_info.csv')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for l in tqdm(range(1,round_)):


  0%|          | 0/12 [00:00<?, ?it/s]

Article6: Parkinson’s:A Disease of Aberrant Vesicle Trafficking has an empty abstract
Article4: Characterization of Parkinson's Disease-associated LRRK2 Kinase has an empty abstract
Article5: Genetic Heterogeneity of Autism Spectrum Disorders has an empty abstract
Article9: Pathogenic variants in movement disorders: modifiers, interactors and disease models has an empty abstract
Article2: Putative second hit rare genetic variants in families with seemingly GBA-associated Parkinson's disease has an empty abstract
Article4: The presence of two rare genomic syndromes, 1q21 deletion and Xq28 duplication, segregating independently in a family with intellectual disability has an empty abstract
Article9: Integrated Stress Response Activity Marks Stem Cells in Normal Hematopoiesis and Leukemia has an empty abstract
Article4: Comprehensive knockout analysis of the Rab family GTPases in epithelial cells has an empty abstract
Article7: Molecular and Cellular Mechanisms Affected in ALS has an empt

In [5]:
df

Unnamed: 0,title,ref,abstract
0,The late endocytic Rab39a GTPase regulates the...,https://core.ac.uk/works/40912718,"Given their obligate intracellular lifestyle, ..."
1,Twist1 Directly Regulates Genes That Promote C...,https://core.ac.uk/works/4005785,"Twist1, a basic helix-loop-helix transcription..."
2,Rab39a and Rab39b display different intracellu...,https://core.ac.uk/works/41052356,Rab GTPases define the identity and destiny of...
3,Effect of Rab39 on Autophagy,https://core.ac.uk/works/86243525,Rab蛋白属于Ras超家族，是其中最大的一个家族。在哺乳动物细胞中已经发现70种以上的Rab...
4,GM-CSF Signalling Boosts Dramatically IL-1Prod...,https://core.ac.uk/works/3954955,GM-CSF is mostly known for its capacity to pro...
...,...,...,...
4,Analysis of Helicobacter pylori VacA-containin...,https://core.ac.uk/works/62680127,The human pathogen Helicobacter pylori coloniz...
5,Investigating the Nuclear Function of the C9or...,https://core.ac.uk/works/8608392,Amyotrophic lateral sclerosis (ALS) is a termi...
6,The Investigation of Targets for Therapy in Br...,https://core.ac.uk/works/4764206,Gliomas are among the most difficult of tumour...
7,Autism Spectrum Disorders,https://core.ac.uk/works/126151060,Estimated prevalence rates of autism spectrum ...


#### Here comes the difficult part..... how to extract info from the abstracts..
One way to do it is to tokenize the texts

In [None]:
abstracts = df['abstract'].values

incubation_times = []

for a in abstracts:
    for sentence in t.split('. '):
        if 'incubation' in sentence:
            single_day = re.findall(r' \d{1,2} day', sentence)
            if len(single_day)==1:
                num = single_day[0].split(' ')
                incubation_times.append(float(num[1]))

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# initialize
cv = CountVectorizer(stop_words='english') 
cv_matrix = cv.fit_transform(df['abstract'])
# create document term matrix
df_dtm = pd.DataFrame(cv_matrix.toarray(), index=df['title'].values, columns=cv.get_feature_names())



In [None]:
pd.set_option('display.max_columns', 5000)
df_dtm

In [40]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
ps = PorterStemmer()
#proceses = ['lysosome', 'endocytosis', 'secretion', 'lipidation', 'kinase', 'glycosilation', 'heparan sulfate']

abstract_list = []
for txt in df['abstract']:
    text = txt
    tokens = word_tokenize(text)
    stemmed = []
    for token in tokens:
        stemmed_word = ps.stem(token)
        stemmed.append(stemmed_word)
    abstract_list.append(stemmed)
print(abstract_list)

[['given', 'their', 'oblig', 'intracellular', 'lifestyl', ',', 'chlamydia', 'trachomati', 'ensur', 'their', 'access', 'to', 'multipl', 'host', 'sourc', 'of', 'essenti', 'lipid', 'by', 'interf', 'vesicular', 'transport', '.', 'these', 'bacteria', 'hijack', 'rab6-', ',', 'rab11-', 'and', 'rab14-control', 'traffick', 'pathway', 'to', 'acquir', 'sphingomyelin', 'from', 'the', 'golgi', 'apparatu', '.', 'anoth', 'import', 'sourc', 'of', 'sphingolipid', ',', 'phospholipid', 'and', 'cholesterol', 'are', 'multivesicular', 'bodi', '(', 'mvb', ')', '.', 'despit', 'their', 'particip', 'in', 'chlamydi', 'inclus', 'develop', 'and', 'bacteri', 'replic', ',', 'the', 'molecular', 'mechan', 'mediat', 'mvbs-inclus', 'interact', 'remain', 'unknown', '.', 'in', 'the', 'present', 'studi', ',', 'we', 'demonstr', 'that', 'rab39a', 'label', 'a', 'subset', 'of', 'late', 'endocyt', 'vesicl', '-mainli', 'mvbs-', 'that', 'move', 'along', 'microtubul', '.', 'moreov', ',', 'rab39a', 'is', 'activ', 'recruit', 'to', '

In [23]:
text = str(df['abstract'][0]

In [36]:
for txt in df['abstract']:
    text = txt
    break

In [37]:
text

'Given their obligate intracellular lifestyle, Chlamydia trachomatis ensure their access to multiple host sources of essential lipids by interfering vesicular transport. These bacteria hijack Rab6-, Rab11- and Rab14-controlled trafficking pathways to acquire sphingomyelin from the Golgi apparatus. Another important source of sphingolipids, phospholipids and cholesterol are multivesicular bodies (MVBs). Despite their participation in chlamydial inclusion development and bacterial replication, the molecular mechanisms mediating MVBs-inclusion interaction remain unknown. In the present study, we demonstrate that Rab39a labels a subset of late endocytic vesicles -mainly MVBs- that move along microtubules. Moreover, Rab39a is actively recruited to chlamydial inclusions throughout the pathogen life cycle by a bacterial-driven process and depending on its GTP/GDP binding state. Interestingly, Rab39a participates in the delivery of MVB and host sphingolipids to maturing chlamydial inclusions t