In [6]:
import urllib.request as libreq
import feedparser

import glob

from tqdm import tqdm

from doc2json.grobid2json.tei_to_json import convert_tei_xml_file_to_s2orc_json

# Collect pdf links

In [2]:
query = "https://export.arxiv.org/api/query?search_query=cat:cs.HC&start=0&max_results=2000&sortBy=lastUpdatedDate&sortOrder=descending"

In [None]:
with libreq.urlopen(query) as url:
    r = url.read()
# print(r)

In [9]:
# Parse the XML feed
# feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
# feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

feed = feedparser.parse(r)

print('Feed title: %s' % feed.feed.title)
print('Feed last updated: %s' % feed.feed.updated)
print('totalResults for this query: %s' % feed.feed.opensearch_totalresults)
print('itemsPerPage for this query: %s' % feed.feed.opensearch_itemsperpage)
print('startIndex for this query: %s'   % feed.feed.opensearch_startindex)

Feed title: ArXiv Query: search_query=cat:cs.HC&amp;id_list=&amp;start=0&amp;max_results=2000
Feed last updated: 2022-09-21T00:00:00-04:00
totalResults for this query: 10958
itemsPerPage for this query: 2000
startIndex for this query: 0


In [13]:
# Run through each entry, and print out information
pdfLinks = []
for entry in feed.entries:
    # print('e-print metadata')
    # print('arxiv-id: %s' % entry.id.split('/abs/')[-1])
    # print('Published: %s' % entry.published)
    # print('Title:  %s' % entry.title)
    
    # # feedparser v4.1 only grabs the first author
    # author_string = entry.author
    
    # # grab the affiliation in <arxiv:affiliation> if present
    # # - this will only grab the first affiliation encountered
    # #   (the first affiliation for the first author)
    # # Please email the list with a way to get all of this information!
    # try:
    #     author_string += ' (%s)' % entry.arxiv_affiliation
    # except AttributeError:
    #     pass
    
    # print('Last Author:  %s' % author_string)
    
    # feedparser v5.0.1 correctly handles multiple authors, print them all
    # try:
    #     print('Authors:  %s' % ', '.join(author.name for author in entry.authors))
    # except AttributeError:
    #     pass

    # get the links to the abs page and pdf for this e-print
    for link in entry.links:
        if link.rel == 'alternate':
            # print('abs page link: %s' % link.href)
            pass
        # print(link)
        elif link.title == 'pdf':
            # print('pdf link: %s' % link.href)
            pdfLinks.append(link.href)
    
    # # The journal reference, comments and primary_category sections live under 
    # # the arxiv namespace
    # try:
    #     journal_ref = entry.arxiv_journal_ref
    # except AttributeError:
    #     journal_ref = 'No journal ref found'
    # print('Journal reference: %s' % journal_ref)
    
    # try:
    #     comment = entry.arxiv_comment
    # except AttributeError:
    #     comment = 'No comment found'
    # print('Comments: %s' % comment)
    
    # # Since the <arxiv:primary_category> element has no data, only
    # # attributes, feedparser does not store anything inside
    # # entry.arxiv_primary_category
    # # This is a dirty hack to get the primary_category, just take the
    # # first element in entry.tags.  If anyone knows a better way to do
    # # this, please email the list!
    # print('Primary Category: %s' % entry.tags[0]['term'])
    
    # # Lets get all the categories
    # all_categories = [t['term'] for t in entry.tags]
    # print('All Categories: %s' % (', ').join(all_categories))
    
    # # The abstract is in the <summary> element
    # print('Abstract: %s' %  entry.summary)

In [16]:
len(pdfLinks)

2000

In [24]:
l

'http://arxiv.org/pdf/0712.2168v1'

# parse pdfs into jsons

In [38]:
import scipdf

import json


In [62]:
dPapers = []
for l in tqdm(pdfLinks[:100]):
    try:
        dPapers.append(
            # scipdf.parse_pdf(l + '.pdf')
            scipdf.parse_pdf_to_dict(l + '.pdf')
        )
    except:
        dPapers.append("")

100%|██████████| 100/100 [03:58<00:00,  2.38s/it]


In [63]:
len(dPapers)

100

# parse the jsons to rqs

In [64]:
import re
import glob

from tqdm import tqdm

from doc2json.grobid2json.tei_to_json import convert_tei_xml_file_to_s2orc_json, \
    convert_tei_xml_soup_to_s2orc_json

from bs4 import BeautifulSoup

## Using SciPDF Parser

In [65]:
# dPapersParsed = []
# for d in tqdm(dPapers):
#     try:
#         article_dict = scipdf.convert_article_soup_to_dict(d, as_list=False)
#         dPapersParsed.append(article_dict)
#     except:
#         dPapersParsed.append("")

In [71]:
def get_relatedWork_text(d):
    paper = d['sections']
    # if section name contains 'related work', get the section number, and then get all subsections
    section_num = '-1'
    text = []
    for d in paper:
        if 'related work' in d['heading'].lower():
            text = [d['heading'] + ": " + d['text']]
    return '\n'.join(text)

def get_intro_text(d):
    paper = d['sections']
    # if section name contains 'related work', get the section number, and then get all subsections
    section_num = '-1'
    text = []
    for d in paper:
        if 'intro' in d['heading'].lower():
            text = [d['heading'] + ": " + d['text']]
    return '\n'.join(text)

def get_RQ_text(d):
    paper = d['sections']
    texts = []
    # for dSec in paper['body_text']:
    text = str(paper).lower()
    # non greedy match
    regexp = re.compile(r'[- a-z([]*?(?:\d.|:) (?:what|how|why|is|are|can|to what extent) [^[?(.]*\?')
    # regexp = re.compile(r'(?:what|how|why|is|are|can|to what extent) [^[?]*\?')
    matches = regexp.findall(text)
    if matches:
        texts.extend(matches)
    return '\n'.join(texts)

In [72]:
rqs = []
intros = []
relatedWorks = []

for text in tqdm(dPapers):
    try:
        relatedWorks.append(get_relatedWork_text(text))
    except:
        relatedWorks.append("")
    try:
        intros.append(get_intro_text(text))
    except:
        intros.append("")
    try:
        rqs.append(get_RQ_text(text))
    except:
        rqs.append("")


100%|██████████| 100/100 [00:01<00:00, 54.33it/s]


In [73]:
print(
    sum([1 for t in relatedWorks if t]),
    sum([1 for t in intros if t]),
    sum([1 for t in rqs if t]),
)

11 67 13


In [75]:
relatedWorks

['',
 '',
 '',
 '',
 '',
 '',
 '',
 'RELATED WORK: The area of multi-platform UI development falls under the umbrella of what is being termed as the "variety challenges" [33]. There are new challenges for application and solution developers due to the emergence of a variety of users, a variety of devices and channels, and a variety of roles and functions. We would categorize the problem of multi-platform UI development arising due to the emergence of a variety of devices and channels. This research area is relatively new and there has not been a lot of published literature in this area. There have been some approaches towards solving this problem. Building "plastic interfaces" [7,30] is one such method in which the UIs are designed to "withstand variations of context of use while preserving usability". This methodology uses concepts from modelbased approaches that we will discuss next in building UIs.\nTranscoding [4,12,13] is a technique used in the World Wide Web for adaptively conve

## Using our method

In [50]:
def get_relatedWork_text(text):
    soup = BeautifulSoup(text, "xml")
    paper = convert_tei_xml_soup_to_s2orc_json(soup, "", "").as_json()['body_text']
    # if section name contains 'related work', get the section number, and then get all subsections
    section_num = '-1'
    for d in paper:
        if 'related work' in d['section'].lower():
            section_num = d['sec_num']
            break
    # get all subsections
    relatedWork = [d['section'] + ": " + d['text'] for d in paper if section_num and d['sec_num'] and d['sec_num'].startswith(section_num)]
    return '\n'.join(relatedWork)

def get_intro_text(text):
    soup = BeautifulSoup(text, "xml")
    paper = convert_tei_xml_soup_to_s2orc_json(soup, "", "").as_json()['body_text']
    # if section name contains 'related work', get the section number, and then get all subsections
    section_num = '-1'
    for d in paper:
        if 'intro' in d['section'].lower():
            section_num = d['sec_num']
            break
    # get all subsections
    text = [d['section'] + ": " + d['text'] for d in paper if section_num and d['sec_num'] and d['sec_num'].startswith(section_num)]
    return '\n'.join(text)

def get_RQ_text(text):
    soup = BeautifulSoup(text, "xml")
    paper = convert_tei_xml_soup_to_s2orc_json(soup, "", "").as_json()['body_text']
    texts = []
    # for dSec in paper['body_text']:
    text = str(paper).lower()
    # non greedy match
    regexp = re.compile(r'[- a-z([]*?(?:\d.|:) (?:what|how|why|is|are|can|to what extent) [^[?]*\?')
    # regexp = re.compile(r'(?:what|how|why|is|are|can|to what extent) [^[?]*\?')
    matches = regexp.findall(text)
    if matches:
        texts.extend(matches)
    return '\n'.join(texts)

In [52]:
rqs = []
intros = []
relatedWorks = []

for text in tqdm(dPapers):
    try:
        relatedWorks.append(get_relatedWork_text(text))
    except:
        relatedWorks.append("")
    try:
        intros.append(get_intro_text(text))
    except:
        intros.append("")
    try:
        rqs.append(get_RQ_text(text))
    except:
        rqs.append("")


100%|██████████| 100/100 [00:11<00:00,  8.61it/s]


In [54]:
print(
    sum([1 for t in relatedWorks if t]),
    sum([1 for t in intros if t]),
    sum([1 for t in rqs if t]),
)

2 43 14


In [None]:
# write results to jsonl file
# with open('allQuestions_s2orc.jsonl', 'w', encoding='utf8') as f:
with open('RQs_TEI2JSON.jsonl', 'w', encoding='utf8') as f:
    for idx, rq in enumerate(rqs):
        # if rq:
        if True:
            f.write(json.dumps(
                {
                    'intro': intros[idx],
                    'relatedWork': relatedWorks[idx],
                    'rq': rq
                }
            ))
            f.write('\n')