# ArXiv Crawler
This notebook contains tools for retrieving and parsing both papers and metadata from arXiv.org, using Tor for anonymous web requests.

### Imports

In [98]:
from __future__ import print_function, division, generators, unicode_literals, with_statement

import os
import re
import sqlite3
import string
import traceback
from time import sleep

import fake_useragent
import numpy as np
import pandas as pd
import PyPDF2
import requests
import stem
import stem.connection
from bs4 import BeautifulSoup
from stem import Signal
from stem.control import Controller
from TorCtl import TorCtl

In [None]:
# All patterns are raw strings so that regex escapes such as "\/", "\." and
# "\(" are not parsed as (invalid) Python string escapes.

# A "References" heading at the start of the text or of a line, optionally
# with a line break right after the leading "R" (presumably to tolerate
# quirks of PDF text extraction -- confirm).
references_pattern = re.compile(r'(^|\n)R\n?eferences\n', re.IGNORECASE)
# An arXiv citation written either as a URL (arxiv.org/pdf/<id>,
# arxiv.org/abs/<id>) or as an "arXiv:<id>" tag.
# NOTE: the group layout is consumed by index: in a findall() tuple, element 4
# is the id from the URL form and element 8 is the id from the "arXiv:" form.
# Do not add or remove capturing groups without updating those users.
arxiv_citation_pattern = re.compile(
    r"(arxiv.org\/((pdf)|(abs))\/(([a-z-]+\/)?[0-9\.]+))"
    r"|(arXiv(.org)?:(([a-z-]+\/)?[0-9\.]+))",
    re.IGNORECASE)
# A parenthesised computer-science subject tag such as "(cs.AI)"; the dot is
# escaped so that it only matches a literal ".".
subject_pattern = re.compile(r'\(cs\.[A-Z][A-Z]\)')
# Every punctuation and whitespace character.
non_alpha_chars = string.punctuation + string.whitespace

## Tor

In [99]:
# Prompt for the Tor password without echoing it; the value is later passed to
# the Tor controller's authenticate() call when renewing the connection.
from getpass import getpass
tor_password = getpass('Enter Tor password:')

Enter Tor password:········


In [101]:
def get_new_ip(tor_password=None, controller_port=9051):
    """Rotate the Tor identity until the externally visible IP changes.

    The current IP is read from http://icanhazip.com/ through ``request``,
    a new identity is requested with ``renew_connection``, and the IP is
    read again.  This repeats, with a random sub-second pause between
    attempts, until a lookup succeeds and reports an address different from
    the starting one.  There is no upper bound on the number of attempts.

    Args:
        tor_password: Password forwarded to ``renew_connection``; ``None``
            (or any falsy value) makes it authenticate without a password.
        controller_port: Tor control port forwarded to ``renew_connection``.

    Returns:
        The new IP address as text, with surrounding whitespace removed.
    """
    def _current_ip():
        # A failed lookup is reported as None instead of raising, so a
        # broken circuit simply triggers another renewal below.
        try:
            return request("http://icanhazip.com/").text.strip()
        except requests.RequestException:
            return None

    old_ip = _current_ip()
    renew_connection(tor_password, controller_port)
    new_ip = _current_ip()
    while new_ip is None or new_ip == old_ip:
        sleep(np.random.random())
        renew_connection(tor_password, controller_port)
        new_ip = _current_ip()
    return new_ip

In [103]:
def renew_connection(tor_password, controller_port=9051):
    """Send the NEWNYM signal to the local Tor controller.

    Args:
        tor_password: Controller password.  When falsy, ``authenticate()``
            is called without arguments.
        controller_port: Port the Tor controller listens on.  It is now
            honoured; previously 9051 was always used regardless of this
            argument.
    """
    # The context manager closes the controller on exit, so no explicit
    # close() call is needed.
    with Controller.from_port(port=controller_port) as controller:
        if tor_password:
            controller.authenticate(password=tor_password)
        else:
            controller.authenticate()
        controller.signal(Signal.NEWNYM)

In [104]:
# request a URL
def request(url, timeout=60):
    """GET ``url`` through the local SOCKS proxy with a random User-Agent.

    Args:
        url: Address to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection raises instead of hanging forever.

    Returns:
        The ``requests`` response object.
    """
    user_agent = fake_useragent.UserAgent().random
    # Traffic goes to the SOCKS proxy on localhost:9050 (presumably Tor's
    # SOCKS port -- confirm against the local Tor configuration).
    # Both schemes must be listed: with only an 'http' entry, https:// URLs
    # would be fetched directly, bypassing the proxy.  The "socks5h" scheme
    # also has host names resolved by the proxy rather than locally.
    proxies = {
        'http': 'socks5h://localhost:9050',
        'https': 'socks5h://localhost:9050',
    }
    return requests.get(url, proxies=proxies, headers={'User-Agent': user_agent}, timeout=timeout)

In [108]:
# Smoke test: request a new Tor identity once and display the resulting IP.
get_new_ip(tor_password=tor_password)

AuthenticationFailure: 

## Raw Paper Processing

In [None]:
# Retrieve citations in an arxiv PDF
def extract_text_and_citations(pdf):
    """Extract the page text and the arXiv ids cited in a PDF.

    Pages are read in order.  Citations are only collected from the first
    page matching ``references_pattern`` onwards.

    Args:
        pdf: A PDF reader object exposing ``numPages`` and
            ``getPage(i).extractText()``.

    Returns:
        A ``(text, citations)`` tuple: the text of all successfully read
        pages joined by newlines, and the list of cited arXiv ids in order of
        appearance (duplicates kept).  ``('', [])`` is returned when the page
        count cannot be read.  If a page fails to parse, a message is printed
        and only the pages before it contribute.
    """
    try:
        num_pages = pdf.numPages
    except Exception:
        # Best effort: an unreadable document yields no text and no citations.
        return '', []
    in_citations = False
    pages = []
    citations = []
    for page_num in range(num_pages):
        try:
            page_text = pdf.getPage(page_num).extractText()
        except Exception:
            print('Failed PDF parsing for document')
            break
        pages.append(page_text)
        if not in_citations and references_pattern.search(page_text):
            in_citations = True
        if in_citations:
            for groups in arxiv_citation_pattern.findall(page_text):
                # Element 4 holds the id of a URL-style citation, element 8
                # the id of an "arXiv:<id>" citation; exactly one is non-empty.
                citations.append(groups[4] or groups[8])
    return "\n".join(pages), citations

In [None]:
# Parse an arxiv PDF to find all details
def parse_arxiv_paper(paper_metadata, pdf, pdf_dir='.', text_dir='.', citation_dir='.'):
    """Store a downloaded PDF and write out its text and arXiv citations.

    Args:
        paper_metadata: Dict with at least an ``'id'`` key (the arXiv id).
        pdf: Raw bytes of the PDF file.
        pdf_dir: Directory the ``<id>.pdf`` file is written to.
        text_dir: Directory the ``<id>.txt`` file is written to.
        citation_dir: Directory whose ``citations.txt`` gets one
            ``<id>,<cited id>,1`` line appended per citation.

    Returns:
        ``{'metadata': paper_metadata, 'citations': [...]}``.  The citation
        list is empty when the PDF cannot be opened; in that case the PDF
        file is still left on disk and no text or citation lines are written.
    """
    paper_id = paper_metadata['id']
    # Can't have / in filename
    legal_filename = paper_id.replace('/', '_')

    # Write the PDF to a local file
    pdf_file_name = os.path.join(pdf_dir, legal_filename + '.pdf')
    with open(pdf_file_name, 'wb') as f:
        f.write(pdf)

    # Read the pdf file in for processing
    try:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file_name)
    except Exception:
        print('Bad file for paper {}'.format(paper_id))
        return {'metadata': paper_metadata, 'citations': []}
    text, citations = extract_text_and_citations(pdf_reader)

    # Write the text to a local file
    with open(os.path.join(text_dir, legal_filename + '.txt'), 'w') as f:
        f.write(text)

    # Append citations to the shared citations file
    with open(os.path.join(citation_dir, 'citations.txt'), 'a') as f:
        f.writelines('{},{},1\n'.format(paper_id, citation) for citation in citations)
    return {'metadata': paper_metadata, 'citations': citations}

In [None]:
def get_arxiv_paper(arxiv_id, pdf_dir='.', text_dir='.', citation_dir='.', field=None):
    """Download one arXiv PDF and hand it to ``parse_arxiv_paper``.

    Args:
        arxiv_id: arXiv identifier used to build the PDF URL.
        pdf_dir, text_dir, citation_dir: Output directories, forwarded
            unchanged to ``parse_arxiv_paper``.
        field: Accepted but not used by this function.

    Returns:
        Whatever ``parse_arxiv_paper`` returns for the downloaded content.
    """
    # Random pause of up to half a second before each download.
    delay = 0.5 * np.random.random()
    sleep(delay)

    print('Retrieving Paper {}'.format(arxiv_id))
    pdf_url = 'https://arxiv.org/pdf/{}.pdf'.format(arxiv_id)
    response = request(pdf_url)
    print('Found Paper {}'.format(arxiv_id))

    metadata = {'id': arxiv_id}
    return parse_arxiv_paper(metadata, response.content, pdf_dir, text_dir, citation_dir)

In [None]:
def get_papers_text(papers, get_text=False, pdf_dir='.', text_dir='.', citation_dir='.', field=None, batch_size=10, start_index=0, end_index=-1):
    """Download and parse a range of papers, rotating the Tor IP per batch.

    Papers whose PDF already exists in the PDF directory are skipped.  A
    failed download is retried indefinitely, requesting a new IP before each
    retry.  The Tor password is read from the module-level ``tor_password``.

    Args:
        papers: DataFrame with an ``id`` column of arXiv ids.
        get_text: Accepted but not used by this function.
        pdf_dir: Base directory for PDFs.
        text_dir: Base directory for extracted text.
        citation_dir: Directory holding the citations file.
        field: If truthy, PDFs and text go into a ``<field>`` subdirectory of
            ``pdf_dir`` / ``text_dir``.
        batch_size: Number of papers fetched per Tor identity.
        start_index: Position of the first paper to process.
        end_index: Position one past the last paper to process; ``-1`` means
            "through the end of ``papers``".
    """
    if end_index == -1:
        end_index = len(papers)

    # PDFs and text for a field live in their own subdirectory
    if field:
        pdf_dir = pdf_dir + '/' + field
        text_dir = text_dir + '/' + field

    # Make every output directory that doesn't exist yet (previously the
    # pdf/text directories were only created when a field was given).
    for directory in (citation_dir, text_dir, pdf_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    # Go thru batches of papers
    for start in range(start_index, end_index, batch_size):
        # Clamp the batch so the last one never runs past end_index
        stop = min(start + batch_size, end_index)
        batch_ids = papers.iloc[start:stop].id
        # Reset IP
        print('Retrieved new IP {} for start {} of size {}'.format(get_new_ip(tor_password), start, batch_size))
        for arxiv_id in batch_ids:
            if os.path.exists(pdf_dir + '/' + arxiv_id.replace('/', '_') + '.pdf'):
                print('ID skipped')
                continue
            while True:
                try:
                    get_arxiv_paper(arxiv_id, pdf_dir=pdf_dir, text_dir=text_dir, citation_dir=citation_dir, field=field)
                    break
                except Exception:
                    # Report the failure first, then rotate the IP and retry.
                    traceback.print_exc()
                    print('Request failed, got new IP {}'.format(get_new_ip(tor_password)))

In [None]:
def get_ids_from_db(db_url, field):
    """Load all papers with a given primary subject from the SQLite database.

    Args:
        db_url: Path of the SQLite database file.
        field: Value the ``primary_subject`` column must equal.

    Returns:
        A DataFrame with one row per matching paper and the columns ``id``,
        ``url``, ``title``, ``year``, ``month``, ``field``,
        ``primary_subject``, ``secondary_subject``, ``tertiary_subject``
        (assigned positionally to the result of ``SELECT *``).
    """
    conn = sqlite3.connect(db_url)
    try:
        # Bound parameter instead of string concatenation, so quotes in
        # `field` cannot alter the query.
        rows = conn.execute("SELECT * FROM Papers WHERE primary_subject = ?", (field,)).fetchall()
    finally:
        conn.close()
    return pd.DataFrame(rows, columns=['id', 'url', 'title', 'year', 'month', 'field', 'primary_subject', 'secondary_subject', 'tertiary_subject'])

In [None]:
# Load every paper whose primary subject is 'IR' from the local metadata database.
papers = get_ids_from_db('arxiv_raw.sqlite', 'IR')

In [None]:
# Download and parse the loaded papers in batches of 20, beginning at position
# 1632 (presumably resuming an earlier, interrupted run -- confirm).
get_papers_text(papers, pdf_dir='Projects/ArXiv/data/papers/pdf', text_dir='Projects/ArXiv/data/papers/text', 
                citation_dir='Projects/ArXiv/data/citations', field='ir', start_index=1632, batch_size=20)

## Metadata Processing

In [None]:
def get_papers_metadata(soup, cur, field, year, month, log_file):
    """Store the papers and authors found on one arXiv listing page.

    For every listing entry a row is inserted into ``Papers`` (ignored if it
    conflicts with an existing row), each author is looked up in ``Authors``
    (and inserted only if missing), and a ``Publications`` row links the two.
    Nothing is committed here; that is left to the caller.

    Args:
        soup: Parsed listing page (BeautifulSoup object).
        cur: SQLite cursor used for all statements.
        field: Subject code of the listing; accepted but no longer needed to
            extract subject codes.
        year: Listing year; stored as ``int(year)``.
        month: Listing month; stored as ``int(month)``.
        log_file: Writable file; one line is written (and flushed) for every
            entry whose database work raises.
    """
    titles = soup.findAll('div', {'class': 'list-title'})
    authors = soup.findAll('div', {'class': 'list-authors'})
    paper_urls = soup.findAll('span', {'class': 'list-identifier'})
    subjects = soup.findAll('div', {'class': 'list-subjects'})
    if len(titles) != len(authors):
        print('number of titles and authors mismatch')
        return

    for title, author, paper_url, subject in zip(titles, authors, paper_urls, subjects):
        link = paper_url.find('a')
        # The id is the part of the link text after the last ':'
        arxiv_id = re.split(':', link.text)[-1]
        url = link['href']
        title = title.contents[-1].strip()
        # Each match looks like "(cs.XX)"; keep only the two-letter code.
        paper_subjects = [tag[len('(cs.'):-1] for tag in re.findall(subject_pattern, subject.text)]
        secondary_subject = paper_subjects[1] if len(paper_subjects) > 1 else ''
        tertiary_subject = paper_subjects[2] if len(paper_subjects) > 2 else ''
        try:
            cur.execute('''
                INSERT OR IGNORE INTO Papers (id, url, title, year, month, category, primary_subject, secondary_subject, tertiary_subject) 
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)''', (arxiv_id, url, title, int(year), int(month), 'CS', paper_subjects[0], secondary_subject, tertiary_subject))

            # NOTE(review): the paper is looked up by title, not by arxiv_id.
            cur.execute('SELECT id FROM Papers WHERE title = ? ', (title, ))
            paper_id = cur.fetchone()[0]

            paper_authors = [au.string.strip() for au in author.findAll('a')]
            for name in paper_authors:
                # Look the author up first and insert only when missing.
                # "INSERT OR IGNORE" alone adds a duplicate row every time
                # unless Authors.name carries a uniqueness constraint.
                cur.execute('SELECT id FROM Authors WHERE name = ? ', (name, ))
                row = cur.fetchone()
                if row is None:
                    cur.execute('INSERT INTO Authors (name) VALUES (?)', (name, ))
                    author_id = cur.lastrowid
                else:
                    author_id = row[0]
                cur.execute('''
                    INSERT OR REPLACE INTO Publications
                    (paper_id, author_id) VALUES (?, ?)''', (paper_id, author_id))
        except Exception as e:
            log_file.write('Insert Failed for Paper {}, ({}) --- {}\n'.format(title, arxiv_id, e))
            log_file.flush()

In [None]:
# Listing URL template: subject code, two-digit year, two-digit month.
url = 'http://arxiv.org/list/cs.{}/{}{}?show=2000'
# fields = ['AI', 'CC', 'CG', 'CE', 'CL', 'CV', 'CY', 'CR', 'DB', 'DS', 'DL', 'DM', 'DC', 'ET', 'FL', 
#           'GT', 'GL', 'GR', 'AR', 'HC', 
# fields = ['IR', 'IT', 'LG', 'LO', 
# fields = ['MS', 'MA', 'MM', 'NI', 'NE', 
#           'NA', 'OS', 'OH', 'PF', 'PL', 'RO' ,'SI', 'SE', 'SD', 'SC', 'SY']
fields = ['LO']
months = ['{:0>2d}'.format(i+1) for i in range(12)]
years = ['{:0>2d}'.format(i) for i in range(94, 95)]

conn = sqlite3.connect('arxiv_raw.sqlite')
cur = conn.cursor()
# Authors.name is UNIQUE so that "INSERT OR IGNORE INTO Authors" really skips
# authors that are already stored.  This only affects newly created databases;
# an existing Authors table is left as it is.
cur.executescript('''
CREATE TABLE IF NOT EXISTS Papers (
    id TEXT NOT NULL PRIMARY KEY,
    url TEXT UNIQUE,
    title TEXT UNIQUE,
    year INTEGER,
    month INTEGER,
    category TEXT,
    primary_subject TEXT,
    secondary_subject TEXT,
    tertiary_subject TEXT
);
CREATE TABLE IF NOT EXISTS Authors (
    id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
    name TEXT UNIQUE
);
CREATE TABLE IF NOT EXISTS Publications (
    paper_id TEXT, 
    author_id INTEGER,
    PRIMARY KEY (paper_id, author_id)
);
''')

# The with-block closes the log even if the crawl is interrupted.
with open('failed_records.txt', 'w') as log_file:
    for field in fields:
        print('Starting Field {}...'.format(field))
        for year in years:
            # One new Tor identity per year of listings
            print('Retrieved new IP: {}'.format(get_new_ip(tor_password)))
            for month in months:
                query_url = url.format(field, year, month)
                print('Retrieving {}'.format(query_url))
                # Retry the same listing page, with a new IP each time,
                # until it has been fetched, parsed and committed.
                while True:
                    try:
                        data = request(query_url).text
                        soup = BeautifulSoup(data)
                        get_papers_metadata(soup, cur, field, year, month, log_file)
                        conn.commit()
                        break
                    except Exception:
                        # Show why the attempt failed instead of retrying silently.
                        traceback.print_exc()
                        print('Request failed, got new IP {}'.format(get_new_ip(tor_password)))

In [None]:
import argparse

def main():
    """Command-line entry point: download and parse the papers of one field.

    Parses the command-line options, loads the matching papers with
    ``get_ids_from_db`` and passes them to ``get_papers_text``.
    """
    global tor_password
    parser = argparse.ArgumentParser(
        description="Download arXiv PDFs for one field over Tor and extract their text and citations.")
    # The directory options default to '.', the same defaults get_papers_text
    # uses; without a default an omitted option would be passed on as None.
    parser.add_argument('-p', '--pdf_dir', required=False, default='.')
    parser.add_argument('-t', '--text_dir', required=False, default='.')
    parser.add_argument('-c', '--citation_dir', required=False, default='.')
    parser.add_argument('-P', '--tor_password', required=True)
    parser.add_argument('-f', '--field', required=True)
    parser.add_argument('-d', '--db_url', required=True)
    parser.add_argument('-b', '--batch_size', type=int, required=True)
    parser.add_argument('-s', '--start_index', type=int, required=True)
    parser.add_argument('-e', '--end_index', type=int, required=True)
    args = parser.parse_args()

    # get_papers_text reads the module-level tor_password, so the parsed
    # option has to be published there before the download starts.
    tor_password = args.tor_password

    papers = get_ids_from_db(args.db_url, args.field)
    print('Found {} papers for category {}'.format(len(papers), args.field))
    get_papers_text(papers, batch_size=args.batch_size, citation_dir=args.citation_dir, field=args.field,
                    pdf_dir=args.pdf_dir, text_dir=args.text_dir, start_index=args.start_index, end_index=args.end_index)

In [None]:
import argparse

def main():
    """Command-line entry point: download and parse the papers of one field.

    Parses the command-line options, loads the matching papers with
    ``get_ids_from_db`` and passes them to ``get_papers_text``.
    """
    global tor_password
    parser = argparse.ArgumentParser(
        description="Download arXiv PDFs for one field over Tor and extract their text and citations.")
    # The directory options default to '.', the same defaults get_papers_text
    # uses; without a default an omitted option would be passed on as None.
    parser.add_argument('-p', '--pdf_dir', required=False, default='.')
    parser.add_argument('-t', '--text_dir', required=False, default='.')
    parser.add_argument('-c', '--citation_dir', required=False, default='.')
    parser.add_argument('-P', '--tor_password', required=True)
    parser.add_argument('-f', '--field', required=True)
    parser.add_argument('-d', '--db_url', required=True)
    parser.add_argument('-b', '--batch_size', type=int, required=True)
    parser.add_argument('-s', '--start_index', type=int, required=True)
    parser.add_argument('-e', '--end_index', type=int, required=True)
    args = parser.parse_args()

    # get_papers_text reads the module-level tor_password, so the parsed
    # option has to be published there before the download starts.
    tor_password = args.tor_password

    papers = get_ids_from_db(args.db_url, args.field)
    print('Found {} papers for category {}'.format(len(papers), args.field))
    get_papers_text(papers, batch_size=args.batch_size, citation_dir=args.citation_dir, field=args.field,
                    pdf_dir=args.pdf_dir, text_dir=args.text_dir, start_index=args.start_index, end_index=args.end_index)


if __name__ == '__main__':
    main()

## Aggregate/Add Citations

In [93]:
# DB connections: a plain sqlite3 connection and a SQLAlchemy engine, both
# opened on the same database file.
from sqlalchemy import create_engine
DB_URL = 'arxiv_raw.sqlite'
sqlite_conn = sqlite3.connect(DB_URL)
sqlalchemy_conn = create_engine('sqlite:///{}'.format(DB_URL))

In [None]:
# Import the citation files of sets 1..20 into the Citations table.  Each line
# is expected to look like "<citing id>,<cited id>,<count>".
citations_root = 'remote_data/'
citations_filename = 'citations/citations.txt'
for i in range(1, 21):
    print("Working on set {}".format(i))
    with open("{}{}/{}".format(citations_root, i, citations_filename)) as f:
        for line in f:
            fields = line.rstrip('\n').split(',')
            try:
                row = (fields[0], fields[1], int(fields[2]))
            except (IndexError, ValueError):
                # Malformed line (too few fields or a non-numeric count): skip it.
                continue
            try:
                # Bound parameters keep quotes in the data from breaking the statement.
                sqlite_conn.execute("INSERT INTO Citations VALUES (?, ?, ?);", row)
            except sqlite3.IntegrityError:
                # Duplicate entry: keep the row that is already stored.
                continue
    # One commit per file instead of one per row.
    sqlite_conn.commit()
sqlite_conn.close()

### Google Scholar Citations

In [9]:
import importlib
import scholar

In [52]:
importlib.reload(scholar)

<module 'scholar' from '/Users/paulburkard/Projects/ArXiv/scholar.py'>

In [109]:
# Create the objects from the local `scholar` module used by the cells below:
# a querier, a settings object and a search query.
querier = scholar.ScholarQuerier()
settings = scholar.ScholarSettings()
query = scholar.SearchScholarQuery()

In [110]:
# Configure the search: citations and patents switched off, one result per
# page, and a single paper title as the search phrase.
query.set_include_citations(False)
query.set_include_patents(False)
query.set_num_page_results(1)
query.set_phrase('Safe cooperative robot dynamics on graphs')

In [111]:
# Run the configured query; results are read from querier.articles afterwards.
querier.send_query(query)

In [112]:
# Take the first result.  Raises IndexError when the query produced no
# articles, as in the recorded output of this cell.
article = querier.articles[0]

IndexError: list index out of range

In [57]:
article.attrs

{'cluster_id': ['8835157138450230859', 'Cluster ID', 5],
 'excerpt': ['This paper introduces the use of vector fields to design, optimize, and implement reactive schedules for safe cooperative robot patterns on planar graphs. We consider automated guided vehicles (AGVs) operating upon a predefined network of pathways. In contrast to the case of locally Euclidean configuration spaces, regularization of collisions is no longer a local procedure, and issues concerning the global topology of configuration spaces must  ...',
  'Excerpt',
  10],
 'num_citations': [42, 'Citations', 3],
 'num_versions': [26, 'Versions', 4],
 'title': ['Safe cooperative robot dynamics on graphs', 'Title', 0],
 'url': ['http://epubs.siam.org/doi/abs/10.1137/S0363012900368442', 'URL', 1],
 'url_citation': [None, 'Citation link', 9],
 'url_citations': ['http://scholar.google.com/scholar?cites=8835157138450230859&as_sdt=2005&sciodt=1,5&hl=en',
  'Citations list',
  7],
 'url_pdf': [None, 'PDF link', 6],
 'url_versi

In [89]:
# Fetch the "Citations list" page of the article.
# NOTE(review): this calls requests.get directly rather than the request()
# helper, so no proxy is configured for it -- confirm this is intended.
r = requests.get(article.attrs['url_citations'][0])

In [90]:
# Parse the citations page; no parser is named, so BeautifulSoup picks one itself.
soup = BeautifulSoup(r.text)

In [91]:
# Read the citation count as the second whitespace-separated token of the
# element with id 'gs_ab_md' (presumably text like "About N results" -- confirm).
citations_count = int(soup.find(id='gs_ab_md').text.split()[1])

In [97]:
# For every citing entry on the page, print its title and link together with
# the number of Papers rows that carry exactly that title.
citations = soup.find_all(attrs={'class': 'gs_rt'})
for citation in citations:
    link = citation.find('a')
    if link is None:
        # Entry without a hyperlink: there is nothing to look up or print.
        continue
    # Bound parameter: scraped titles may contain quotes, which would break
    # (or alter) a statement built with string formatting.
    count = sqlite_conn.execute("SELECT COUNT(*) FROM Papers WHERE title=?", (link.text,)).fetchone()[0]
    print("Title: {}".format(link.text))
    print("Link: {}".format(link['href']))
    print(count)

Title: Coordinating multiple robots with kinodynamic constraints along specified paths
Link: http://journals.sagepub.com/doi/abs/10.1177/0278364905051974
0
Title: Configuration spaces and braid groups on graphs in robotics
Link: https://arxiv.org/abs/math/9905023
0
Title: M*: A complete multirobot path planning algorithm with performance bounds
Link: http://ieeexplore.ieee.org/abstract/document/6095022/
0
Title: Finding topology in a factory: configuration spaces
Link: http://www.jstor.org/stable/2695326
0
Title: State complexes for metamorphic robots
Link: http://journals.sagepub.com/doi/abs/10.1177/0278364904045468
1
Title: Algorithms for collision-free navigation of mobile robots in complex cluttered environments: a survey
Link: https://www.cambridge.org/core/journals/robotica/article/algorithms-for-collision-free-navigation-of-mobile-robots-in-complex-cluttered-environments-a-survey/ADA8F6F7E30123629A26B08DA0C79C8C
0
Title: Discrete Morse theory and graph braid groups
Link: http://

In [96]:
ids_to_check = 

NoneType