In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import requests
import bs4
from bs4 import BeautifulSoup
from selenium.common.exceptions import *
from selenium import webdriver
from tqdm.notebook import tqdm
%matplotlib inline

In [2]:
def get_arxiv_total_entries(year, category):
    y = int(str(year)[-2:])
    origin_url = r"https://arxiv.org/list/{}/{}?skip=0".format(category, y)
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
    r = requests.get(origin_url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    text = soup.find(name="div", attrs={"id": "dlpage"})
    total_entries = text.find(name="small")
    total_entries = int(total_entries.contents[0].split()[3])
    return total_entries

In [3]:
def get_arxiv_paper_list(total_entries, year, category):
    y = int(str(year)[-2:])
    driver_path = r"C:\Users\YangWang\Desktop\machineLearning\indiaNewsClassification\chromedriver.exe"
    driver = webdriver.Chrome(driver_path)
    time.sleep(2)
    paper_list = []

    for num in range(int(total_entries/2000)+1):
        url = r"https://arxiv.org/list/{}/{}?skip={}&show=2000".format(category, y, num*2000)
        driver.get(url)
        time.sleep(5)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        content = soup.find(name="div", attrs={"id": "content"})
        dl = content.find(name="dl")

        dd = dl.find_all(name="dd")
        dt = dl.find_all(name="dt")
        for e1, e2 in zip(dd, dt):
            if e1.find(name="div", attrs={"class": "list-title mathjax"}) is not None:
                title = e1.find(name="div", attrs={"class": "list-title mathjax"}).text
            else:
                title = None
            if e1.find(name="span", attrs={"class": "primary-subject"}) is not None:
                subject = e1.find(name="span", attrs={"class": "primary-subject"}).string
            else:
                subject = None
            if e2.find(name="span", attrs={"class": "list-identifier"}) is not None:
                link = e2.find(name="span", attrs={"class": "list-identifier"}).find(name="a").get("href")
            else:
                link = None
            year = str(year)
            category = str(category)
            paper_list.append([title, subject, link, year, category])

    driver.quit()
    
    return paper_list

In [4]:
def dataframe_preprocessing(paper_list):
    df = pd.DataFrame(paper_list)
    df.columns = ["title", "subject", "link", "year", "category"]
    df.reset_index(drop=True)

    df["title"] = df["title"].map(lambda x: x.split(":")[1])
    df["title"] = df["title"].map(lambda x: x.split("\n")[0])
    df["link"] = df["link"].map(lambda x: "https://arxiv.org" + x if x is not None else None)
    df["year"] = df["year"].map(int)
    df["category"] = df["category"].map(int)
    
    return df

In [5]:
def get_arxiv_dataframe(queue, category, start_year=2010, end_year=2020):
    """
    category:
        "cs": computer science
        "math": mathematics
        "stat": statistics
        "eess": electrical engineering and systems science
        "q-fin": quantitative finance
    """
    final_df = pd.DataFrame()
    for year in tqdm(range(start_year, end_year+1)):
        total_entries = get_arxiv_total_entries(year, category)
        paper_list = get_arxiv_paper_list(total_entries, year, category)
        df = dataframe_preprocessing(paper_list)
        final_df = pd.concat([final_df, df])
        
        final_df = final_df.drop_duplicates()
        final_df.dropna(inplace=True)
    queue.put(final_df)

In [6]:
import threading
import queue

subject = ["cs", "math", "stat", "q-fin"]
df = {name: pd.DataFrame() for name in subject}
paper_df = pd.DataFrame()

threads = []
my_queue = queue.Queue()
for name, df in df.items():
    threads.append(threading.Thread(target = get_arxiv_dataframe, args = (my_queue, name)))
    #df = get_arxiv_dataframe(category = name, start_year=2010, end_year=2020)
    #paper_df = pd.concat([paper_df, df])
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()
while my_queue.qsize() > 0:
    df = my_queue.get()
    paper_df = pd.concat([paper_df, df])
paper_df.reset_index(drop=True)

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))







Unnamed: 0,title,subject,link,year
0,Bayesian Inference of Stochastic Volatility M...,Computational Finance (q-fin.CP),https://arxiv.org/abs/1001.0024,2010
1,Diagnosis and Prediction of Tipping Points in...,General Finance (q-fin.GN),https://arxiv.org/abs/1001.0265,2010
2,Multiscaled Cross-Correlation Dynamics in Fin...,Statistical Finance (q-fin.ST),https://arxiv.org/abs/1001.0497,2010
3,Adaptive Wave Models for Option Pricing Evolu...,Pricing of Securities (q-fin.PR),https://arxiv.org/abs/1001.0615,2010
4,A Security Price Volatile Trading Conditionin...,Trading and Market Microstructure (q-fin.TR),https://arxiv.org/abs/1001.0656,2010
...,...,...,...,...
662807,Heat transport bounds for a truncated model o...,Fluid Dynamics (physics.flu-dyn),https://arxiv.org/abs/2004.07204,2020
662808,A quantum system with a non-Hermitian Hamilto...,Quantum Physics (quant-ph),https://arxiv.org/abs/2004.07205,2020
662809,Enumerating minimal dominating sets in the (i...,Discrete Mathematics (cs.DM),https://arxiv.org/abs/2004.07214,2020
662810,A Small Improvement to the Upper Bound on the...,Data Structures and Algorithms (cs.DS),https://arxiv.org/abs/2004.07217,2020


In [7]:
paper_df.to_csv(r"C:\Users\YangWang\Desktop\crawler\Arxiv_Paper_Crawler\arxiv_paper.csv", index=False)