# Scrape from biorxiv and from arxiv filtering by category, word in abstract/title, dates, etc.

In [1]:
#Set dates to search for

import datetime

SEARCH_WINDOW_DAYS = 10  # how many days back from today the search window starts

# Read the clock once so both ends of the window are derived from the same
# date, even if this cell happens to run across midnight.
_run_date = datetime.date.today()
today = _run_date.strftime("%Y-%m-%d")
TimeBefore = (_run_date - datetime.timedelta(days=SEARCH_WINDOW_DAYS)).strftime("%Y-%m-%d")

## Had to use a separate scraper for biorxiv

In [2]:
import requests
import pandas as pd
#import seaborn as sns
import numpy as np
from bs4 import BeautifulSoup as bs
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
%matplotlib inline

n_results = 1000  # maximum number of search hits requested per query

# Exploratory request: the bioRxiv search page for "neuroscience", used in the
# next cell only to look at what the returned HTML looks like.
url = "https://biorxiv.org/search/neuroscience%20numresults%3A{}".format(
    n_results)

# A search is a read-only request, so use GET. The timeout keeps the cell from
# hanging forever, and raise_for_status() fails loudly instead of letting an
# HTTP error page be parsed as if it were search results.
resp = requests.get(url, timeout=300)
resp.raise_for_status()



In [3]:
# Let's see what html output looks like
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the parse can differ between machines.
text = bs(resp.text, 'html.parser')
text

<!DOCTYPE html>

<html dir="ltr" lang="en" xmlns="http://www.w3.org/1999/xhtml" xmlns:mml="http://www.w3.org/1998/Math/MathML">
<head prefix="og: http://ogp.me/ns# article: http://ogp.me/ns/article# book: http://ogp.me/ns/book#">
<!--[if IE]><![endif]-->
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<link href="//stats.g.doubleclick.net" rel="dns-prefetch"/>
<link href="//d33xdlntwy0kbs.cloudfront.net" rel="dns-prefetch"/>
<link href="//www.google-analytics.com" rel="dns-prefetch"/>
<meta content="width=device-width, initial-scale=1, maximum-scale=3, minimum-scale=1, user-scalable=yes" name="viewport"/>
<link href="https://www.biorxiv.org/sites/default/files/images/favicon.ico" rel="shortcut icon" type="image/vnd.microsoft.icon"/>
<meta content="bioRxiv - the preprint server for biology, operated by Cold Spring Harbor Laboratory, a research and educational institution" name="description"/>
<meta content="Drupal 7 (http://drupal.org)" name="generator"/>
<link href

In [4]:
# Define the URL with start/stop dates
search_term = ""
url_base = "https://biorxiv.org/search/{}".format(search_term)

# One template with named fields. The previous version concatenated the pieces
# and then called .format(n_results) on the last literal only, which has no
# placeholder, so that call did nothing. The resulting URL is unchanged.
url_params = (
    "jcode%3Abiorxiv"
    "%20limit_from%3A{date_from}"
    "%20limit_to%3A{date_to}"
    "%20numresults%3A{n}"
    "%20sort%3Arelevance-rank"
    "%20format_result%3Astandard"
).format(date_from=TimeBefore, date_to=today, n=n_results)

url = url_base + url_params
print(url)

#https://www.biorxiv.org/search/jcode%3Abiorxiv%20limit_from%3A2019-05-07%20limit_to%3A2019-05-17%20numresults%3A10%20sort%3Arelevance-rank%20format_result%3Astandard

https://biorxiv.org/search/jcode%3Abiorxiv%20limit_from%3A2019-08-31%20limit_to%3A2019-09-10%20numresults%3A1000%20sort%3Arelevance-rank%20format_result%3Astandard


In [5]:
# Scraper
parser = 'html.parser'
SEARCH_TIMEOUT = 300   # seconds; the 1000-result search page is slow to generate
ARTICLE_TIMEOUT = 60   # seconds allowed for each individual article page


def fetch_abstract(atom_path):
    """Return the abstract advertised by an article page, or '' if unavailable.

    Parameters
    ----------
    atom_path : str
        Value of the ``data-apath`` attribute of a search result.

    Returns
    -------
    str
        The ``content`` attribute of the page's ``DC.Description`` meta tag,
        or an empty string when the page cannot be fetched or has no such tag.
    """
    # atom_path may start with '/', so strip it to avoid '.../content//biorxiv/...'
    article_url = "https://www.biorxiv.org/content/" + atom_path.lstrip('/')
    try:
        art_resp = requests.get(article_url, timeout=ARTICLE_TIMEOUT)
        art_resp.raise_for_status()
    except requests.RequestException as err:
        # One unreachable article should not abort the whole scrape.
        print("Could not fetch {}: {}".format(article_url, err))
        return ''
    meta = bs(art_resp.text, parser).find('meta', attrs={'name': 'DC.Description'})
    if meta is None:
        return ''
    # Read the attribute directly instead of string-replacing on str(tag),
    # which left markup behind (or the literal text "None" when the tag was missing).
    return meta.get('content', '')


this_url = url
# requests returns only once the whole response body has been received, so no
# sleep is needed afterwards; the timeout bounds the wait instead.
resp = requests.get(this_url, timeout=SEARCH_TIMEOUT)
resp.raise_for_status()
html = bs(resp.text, parser)
print("Souping complete.")

# Collect the articles in the result in a list: one
# [title, atom_path, authors, abstract, version] record per search hit
articleData = []
articles = html.find_all('li', attrs={'class': 'search-result'})
for article in articles:
    # Get the item header; skip listings that lack the expected citation block
    citation = article.find('div', attrs={'class': 'highwire-article-citation'})
    if citation is None:
        continue
    version = citation.get('data-pisa')
    atom_path = citation.get('data-apath')
    if not atom_path:
        continue

    # Get abstract
    abstract = fetch_abstract(atom_path)

    # Get the title info
    title_span = article.find('span', attrs={'class': 'highwire-cite-title'})
    title = title_span.text.strip().replace("\n", "") if title_span is not None else ''

    # Now collect author information
    authors = article.find_all('span', attrs={'class': 'highwire-citation-author'})
    author_list = ', '.join(author.text for author in authors)

    articleData.append([title, atom_path, author_list, abstract, version])


In [6]:
# Build the bioRxiv table: one row per scraped record, columns in record order
biorxiv_columns = ['title', 'atom_path', 'authors', 'abstract', 'version']
df_biorxiv = pd.DataFrame(data=articleData, columns=biorxiv_columns)



In [7]:
# Sanity check: (rows, columns) of the bioRxiv table built from the scraped records
df_biorxiv.shape

(1000, 5)

In [8]:
# Preview the first 10 rows of the bioRxiv table
df_biorxiv.head(10)

Unnamed: 0,title,atom_path,authors,abstract,version
0,Draft genome sequences of Hirudo medicinalis a...,/biorxiv/early/2019/09/10/357681.atom,"Vladislav V. Babenko, Oleg V. Podgorny, Valent...",,biorxiv;357681v2
1,ATG13 dynamics in non-selective autophagy and ...,/biorxiv/early/2019/09/10/370114.atom,"Piero Dalle Pezze, Eleftherios Karanasios, Var...",,biorxiv;370114v2
2,BET inhibition prevents aberrant RUNX1 and ERG...,/biorxiv/early/2019/09/10/762781.atom,"Jisha Antony, Gregory Gimenez, Terry Taylor, U...",,biorxiv;762781v1
3,Skp1 Dimerization Conceals its F-box Protein B...,/biorxiv/early/2019/09/10/764126.atom,"Hyun W. Kim, Alexander Eletsky, Karen J. Gonza...",,biorxiv;764126v1
4,Co-ordinated Ras and Rac activity shapes macro...,/biorxiv/early/2019/09/10/763748.atom,"Catherine M Buckley, Henderikus Pots, Aurelie ...",,biorxiv;763748v1
5,DeepCryoPicker: Fully Automated Deep Neural Ne...,/biorxiv/early/2019/09/10/763839.atom,"Adil Al-Azzawi, Anes Ouadou, Max R Highsmith, ...",,biorxiv;763839v1
6,Metabolite therapy guided by liquid biopsy pro...,/biorxiv/early/2019/09/10/764100.atom,"Katherine Wert, Gabriel Velez, Kanchustambham ...",,biorxiv;764100v1
7,Expression Profile of Circular RNAs in Epicard...,/biorxiv/early/2019/09/10/764266.atom,"Xinchun Yang, Meili Zheng, Lei Zhao",,biorxiv;764266v1
8,Animal biosynthesis of complex polyketides in ...,/biorxiv/early/2019/09/10/764225.atom,"Joshua P. Torres, Zhenjian Lin, Jaclyn M. Wint...",,biorxiv;764225v1
9,FARFAR2: Improved de novo Rosetta prediction o...,/biorxiv/early/2019/09/10/764449.atom,"Rhiju Das, Andrew M Watkins",,biorxiv;764449v1


## Now scrape arxiv

In [9]:
# Requires arxivscraper (either arxivscraper.py next to this notebook or the installed package)
import arxivscraper as ax

# Query the q-bio category over the same TimeBefore..today window used above.
# NOTE(review): t=20 is presumably the wait in seconds before retrying -- confirm in the arxivscraper docs.
arxiv_query = dict(category='q-bio', date_from=TimeBefore, date_until=today, t=20)
scraper = ax.Scraper(**arxiv_query)
output = scraper.scrape()

http://export.arxiv.org/oai2?verb=ListRecords&from=2019-08-31&until=2019-09-10&metadataPrefix=arXiv&set=q-bio
fetching up to  1000 records...
fetching is completed in 4.0 seconds.
Total number of records 155


In [10]:
# Columns of the arXiv table, in the order they should appear
cols = (
    'id',
    'title',
    'categories',
    'abstract',
    'doi',
    'created',
    'updated',
    'authors',
)
df_arxiv = pd.DataFrame(data=output, columns=cols)

In [11]:
# Preview the first rows of the arXiv table
df_arxiv.head()

Unnamed: 0,id,title,categories,abstract,doi,created,updated,authors
0,1709.09236,large scale evaluation of differences between ...,q-bio.mn q-bio.gn,dendrograms are a way to represent evolutionar...,10.1371/journal.pone.0221631,2017-09-26,,"[gamermann, montagud, conejero, de córdoba, ur..."
1,1710.02431,emerging whole-cell modeling principles and me...,q-bio.qm,whole-cell computational models aim to predict...,10.1016/j.copbio.2017.12.013,2017-10-06,2017-12-08,"[goldberg, szigeti, chew, sekar, roth, karr]"
2,1803.02136,limit distribution of the quartet balance inde...,q-bio.pe math.pr,"this paper builds up on t. martinez-coronado, ...",,2018-03-06,2019-08-30,[bartoszek]
3,1803.05404,coupling the yoccoz-birkeland population model...,math.ds q-bio.pe,we propose a new model for the time evolution ...,10.1088/1361-6544/ab0eb7,2018-03-14,2018-05-09,"[arlot, marmi, papini]"
4,1803.08362,consciousness: from the perspective of the dyn...,q-bio.nc,"beings, animate or inanimate, are dynamical sy...",,2018-03-04,,[schad]


In [12]:
# Sanity check: (rows, columns) of the arXiv table
df_arxiv.shape

(155, 8)

In [13]:
# Persist both metadata tables (no PDF bodies yet) so a later session can
# resume from here. Files are written to the current working directory and
# carry today's date in their name.
import pickle
for frame, prefix in ((df_arxiv, 'arxiv_nobody'), (df_biorxiv, 'biorxiv_nobody')):
    frame.to_pickle('{}{}.pkl'.format(prefix, today))


# Start here if you have previously scraped and pickled the articles that you want

## From here, we have dataframes of arXiv and bioRxiv articles, and we want to grab the PDF body text of each article.

In [1]:
# Reload the tables pickled by a previous scrape.
# The file names contain the scrape date, so `today` must match the day the
# scrape was run; uncomment the override below to load an older scrape.
import pandas as pd
import datetime
today = datetime.date.today().strftime("%Y-%m-%d")
#today = "2019-05-17"
arxiv_pickle = 'arxiv_nobody' + today + '.pkl'
biorxiv_pickle = 'biorxiv_nobody' + today + '.pkl'
df_arxiv = pd.read_pickle(arxiv_pickle)
df_biorxiv = pd.read_pickle(biorxiv_pickle)


In [2]:
#function to download arxiv file as pdf temp file
import requests
import pickle

def download_file(download_url):
    """Download ``download_url`` and store the response body in a placeholder PDF file.

    The same file name is reused on every call, so each download overwrites
    the previous one.

    Parameters
    ----------
    download_url : str
        URL of the PDF to fetch.

    Returns
    -------
    str
        Name of the local file the response body was written to.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (raised by
        ``raise_for_status``), so an error page is never saved as a PDF.
    """
    print(download_url)
    agent = {'User-Agent': 'Mozilla/5.0'}
    # Bound the wait so one unresponsive server cannot stall a long batch run.
    response = requests.get(download_url, headers=agent, timeout=120)
    response.raise_for_status()
    temp_filename = 'document_placeholder.pdf'
    with open(temp_filename, 'wb') as file:
        # Write the raw bytes. pickle.dump() would wrap them in pickle framing,
        # leaving a file that is not a valid PDF.
        file.write(response.content)
    return temp_filename

## import body of a pdf as an example

In [6]:
from PdfConverter import PdfConverter
# PdfConverter needs pdfminer --> !pip3 install pdfminer.six

# Example: fetch and convert the bioRxiv article stored under row label 4.
# (arXiv equivalent: download_file("https://arxiv.org/pdf/%s.pdf" % df_arxiv.id.loc[10]))
example_atom_path = df_biorxiv.atom_path.loc[4]
example_pdf_url = (
    example_atom_path
    .replace('/biorxiv/', 'https://www.biorxiv.org/content/biorxiv/')
    .replace('.atom', '')
    + '.full.pdf'
)
pdf_filename = download_file(example_pdf_url)

# Convert the downloaded PDF to plain text
pdfConverted = PdfConverter(file_path=pdf_filename)
raw_text = pdfConverted.convert_pdf_to_txt()

# Flatten the document: every line break becomes a space
text_out = raw_text.replace('\n', ' ')


https://www.biorxiv.org/content/biorxiv/early/2019/09/10/763748.full.pdf


In [7]:
# Show the text extracted from the example PDF (line breaks already replaced by spaces)
print(text_out)

bioRxiv preprint first posted online Sep. 10, 2019;  (which was not peer-reviewed) is the author/funder, who has granted bioRxiv a license to display the preprint in perpetuity.  http://dx.doi.org/10.1101/763748 .   The copyright holder for this preprint  doi:   All rights reserved. No reuse allowed without permission.   Co-ordinated Ras and Rac activity shapes macropinocytic cups  and enables phagocytosis of geometrically diverse bacteria   1	 2	 3	 	 4	 Catherine	M.	Buckley1,	Henderikus	Pots2,	Aurelie	Gueho3$,	Ben	A.	Phillips1,	Bernd	 5	 Gilsbach4,	Anton	Nikolaev1,	Thierry	Soldati3,	Andrew	J.		Parnell5,	Arjan	Kortholt2	 6	 and	Jason	S.	King1*	 7	 	 8	 1Department	of	Biomedical	Sciences,	University	of	Sheffield,	Sheffield,	UK.		 2Department	of	Cell	Biochemistry,	University	of	Groningen,	Groningen,	 9	 Netherlands.	 10	 3Department	of	Biochemistry,	Faculty	of	Sciences,	Sciences	II,	University	of	 11	 12	 Geneva,	Geneva,	Switzerland	 4German	Centre	for	Neurodegenerative	Diseases,	Tübing

In [9]:
#example of doing above in one line
# Take an explicit copy (as the looping example does) so that adding the
# 'body' column clearly affects only this one-row frame, not df_arxiv.
dfplay = df_arxiv.loc[[4]].copy()
# For each arXiv id: download the PDF, convert it to text, flatten line breaks.
dfplay['body'] = dfplay['id'].apply(
    lambda x: PdfConverter(file_path=download_file("https://arxiv.org/pdf/%s.pdf" % x))
    .convert_pdf_to_txt()
    .replace('\n', ' ')
)

https://arxiv.org/pdf/1112.1724.pdf


In [None]:
# dfplay holds a single row (label 4), so looking up label 10 raises KeyError.
# Select the only row by position instead.
dfplay['body'].iloc[0]

In [11]:
#short example of looping through:
# The PDFs fetched below are bioRxiv ones, so take the rows from df_biorxiv too.
# Otherwise arXiv metadata ends up next to unrelated bioRxiv body text.
# (For arXiv use df_arxiv and "https://arxiv.org/pdf/%s.pdf" % df_play.loc[ix, 'id'].)
df_play = df_biorxiv.iloc[0:5].copy()
df_play['body'] = ''
for ix in df_play.index:
    pdf_url = (
        df_play.loc[ix, 'atom_path']
        .replace('/biorxiv/', 'https://www.biorxiv.org/content/biorxiv/')
        .replace('.atom', '')
        + '.full.pdf'
    )
    # Write with a single .loc[row, column] call. The chained form
    # df_play['body'].loc[ix] = ... may assign into a temporary object and is
    # not guaranteed to update df_play.
    df_play.loc[ix, 'body'] = (
        PdfConverter(file_path=download_file(pdf_url))
        .convert_pdf_to_txt()
        .replace('\n', ' ')
    )





https://www.biorxiv.org/content/biorxiv/early/2019/04/04/445486.full.pdf
https://www.biorxiv.org/content/biorxiv/early/2019/04/04/596551.full.pdf
https://www.biorxiv.org/content/biorxiv/early/2019/04/04/597369.full.pdf
https://www.biorxiv.org/content/biorxiv/early/2019/04/04/585455.full.pdf
https://www.biorxiv.org/content/biorxiv/early/2019/04/04/598409.full.pdf


In [12]:
# Inspect the body text stored for the row labelled 4
df_play.body.loc[4]

'bioRxiv preprint first posted online Apr. 4, 2019;  The copyright holder for this preprint (which was not peer-reviewed) is the author/funder, who has granted bioRxiv a license to display the preprint in perpetuity.  http://dx.doi.org/10.1101/598409 .   doi:   It is made available under a   CC-BY 4.0 International license  .  1  2  3  4  5  6  7  8  9  10 11  12 13 14 15  16  Proposal for a Global Adherence Scale for Acute Conditions (GASAC): a prospective   cohort study in two Emergency Departments  Mélanie Sustersic1,2*, Aurélie Gauchet3, Amélie Duvert4☯, Laure Gonnet4☯, Alison Foote4, Céline   Vermorel1, Benoit Allenet1, Jean-Luc Bosson1.   1 TIMC-IMAG,  CNRS-UMR 5525, Univ. Grenoble Alpes, Grenoble, F-38043 France   2 Emergency Department, Grenoble Mutualist Hospital Group (Groupe Hospitalier Mutualiste de   Grenoble), Grenoble, F-38000 France   3 Inter-University Psychology Laboratory, EA 4145, Univ. Grenoble Alpes, Grenoble, F-38040 France  4 Research Division, Grenoble Alpes Un

## loop which goes through all of the files and adds the text to column 'body'

In [8]:

# Every once in a while the PDF reader *stumbles*. Instead of crashing the
# whole run, we report the failure and move on to the next article.

CHECKPOINT_EVERY = 25  # rows processed between intermediate pickles


def biorxiv_pdf_url(atom_path):
    """Turn a bioRxiv atom path into the URL of the article's full-text PDF."""
    return (
        atom_path
        .replace('/biorxiv/', 'https://www.biorxiv.org/content/biorxiv/')
        .replace('.atom', '')
        + '.full.pdf'
    )


def arxiv_pdf_url(arxiv_id):
    """Build the arXiv PDF URL for an arXiv identifier."""
    return "https://arxiv.org/pdf/%s.pdf" % arxiv_id


def add_pdf_bodies(df, key_column, make_url, pickle_path):
    """Fill ``df['body']`` with the text of each row's PDF and pickle the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to extend; modified in place (a 'body' column is added or reset).
    key_column : str
        Column whose value identifies the article ('atom_path' or 'id').
    make_url : callable
        Maps a ``key_column`` value to the PDF URL.
    pickle_path : str
        File the table is written to every ``CHECKPOINT_EVERY`` rows and once
        more when the loop ends.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Rows whose PDF could not be downloaded or
        converted keep an empty body.
    """
    df['body'] = ''
    try:
        for n_seen, ix in enumerate(df.index, start=1):
            try:
                pdf_path = download_file(make_url(df.loc[ix, key_column]))
                body = PdfConverter(file_path=pdf_path).convert_pdf_to_txt().replace('\n', ' ')
            except Exception as err:
                # Catch Exception rather than using a bare except, so that
                # Ctrl-C / kernel interrupt can still stop the loop.
                print('missed a file (row {}): {!r}'.format(ix, err))
            else:
                # Single .loc[row, column] write; the chained form
                # df['body'].loc[ix] = ... is not guaranteed to update df.
                df.loc[ix, 'body'] = body
                print('k')
            # Re-pickling the whole table on every row makes the run quadratic
            # in I/O, so save periodically instead.
            if n_seen % CHECKPOINT_EVERY == 0:
                df.to_pickle(pickle_path)
    finally:
        # Always save what has been collected, including after an interrupt.
        df.to_pickle(pickle_path)
    return df


add_pdf_bodies(df_biorxiv, 'atom_path', biorxiv_pdf_url, 'biorxiv_withbody' + today + '.pkl')
add_pdf_bodies(df_arxiv, 'id', arxiv_pdf_url, 'arxiv_withbody' + today + '.pkl');

        

https://www.biorxiv.org/content/biorxiv/early/2019/09/10/357681.full.pdf
k
https://www.biorxiv.org/content/biorxiv/early/2019/09/10/370114.full.pdf
k
https://www.biorxiv.org/content/biorxiv/early/2019/09/10/762781.full.pdf
k
https://www.biorxiv.org/content/biorxiv/early/2019/09/10/764126.full.pdf
k
https://www.biorxiv.org/content/biorxiv/early/2019/09/10/763748.full.pdf
k
https://www.biorxiv.org/content/biorxiv/early/2019/09/10/763839.full.pdf
k
https://www.biorxiv.org/content/biorxiv/early/2019/09/10/764100.full.pdf
k
https://www.biorxiv.org/content/biorxiv/early/2019/09/10/764266.full.pdf
k
https://www.biorxiv.org/content/biorxiv/early/2019/09/10/764225.full.pdf
k
https://www.biorxiv.org/content/biorxiv/early/2019/09/10/764449.full.pdf
k
https://www.biorxiv.org/content/biorxiv/early/2019/09/10/763953.full.pdf
k
https://www.biorxiv.org/content/biorxiv/early/2019/09/09/622373.full.pdf
k
https://www.biorxiv.org/content/biorxiv/early/2019/09/09/692525.full.pdf
k
https://www.biorxiv.org/c