# DATA 601 - Project 2

Arxiv hosts lot of published research papers. We are using the Arxiv API to access the metadata of research papers and do the rest of tasks. 

In [39]:
import arxiv
import json 
import time
import numpy as np
import pandas as pd
import pickle

Defined a function below called get_results to get the papers results in category wise.

The following query retrieves all the papers with category code, We will later filter the data according to primary codes.
In the query setting, put the category code.
Collect the paper result in a list.
Even though we get error on the basis of UnexpectedEmptyPageError, our list will contain information.

In [40]:
def get_results(category):
    results = arxiv.Client(
    page_size=2000, delay_seconds=3, num_retries=3).results(
        arxiv.Search(
            query = category,
            sort_order=arxiv.SortOrder.Descending,
        ))
    return results

paper_results = [] # this is the list that hosts all the papers information.


Using the get_results function to append each of the category results into papers_results list

In [41]:
#First usage of get_results function to append all the papers with 'cs.AI' as category.

AI_results = get_results('cs.AI')
for AI_papers in AI_results:
    paper_results.append(AI_papers)

UnexpectedEmptyPageError: Page of results was unexpectedly empty (http://export.arxiv.org/api/query?search_query=cs.AI&id_list=&sortBy=relevance&sortOrder=descending&start=50000&max_results=2000)

In [42]:
len(paper_results) # to confirm or view how many results we get

50000

In [43]:
# Using of get_results function to append all the papers with 'cs.LG' or Machine learning as category.

LG_results = get_results('cs.LG')

for LG_papers in LG_results:
    paper_results.append(LG_papers)

UnexpectedEmptyPageError: Page of results was unexpectedly empty (http://export.arxiv.org/api/query?search_query=cs.LG&id_list=&sortBy=relevance&sortOrder=descending&start=50000&max_results=2000)

In [44]:
# Length of paper_results list after executing machine learning code
len(paper_results) 

100000

In [45]:
# Using of get_results function to append all the papers with 'cs.CC' or Computational Complexity as category.

CC_results = get_results('cs.CC')

for CC_papers in CC_results:
    paper_results.append(CC_papers)

In [46]:
# Length of paper_results list after executing Computational Complexity code

len(paper_results)

109451

In [47]:
# Using of get_results function to append all the papers with 'cs.AR' or Hardware Architecture as category.

AR_results = get_results('cs.AR')

for AR_papers in AR_results:
    paper_results.append(AR_papers)

In [48]:
# Length of paper_results list after executing Hardware Architecture code

len(paper_results)

112931

Once after appending all the papers metadata in to paper_results list. We are now separating and creating list for each of the columns such as titles, corresponding authors, summaries..etc from the paper_results list which contains our metadata. and using pickle package only to view the stored data for the sake of confirmation.

In [49]:
papers = pickle.dumps(paper_results)

pickle.loads(papers)

[arxiv.Result(entry_id='http://arxiv.org/abs/1511.04326v1', updated=datetime.datetime(2015, 11, 12, 20, 4, 31, tzinfo=datetime.timezone.utc), published=datetime.datetime(2015, 11, 12, 20, 4, 31, tzinfo=datetime.timezone.utc), title='ICON Challenge on Algorithm Selection', authors=[arxiv.Result.Author('Lars Kotthoff')], summary='We present the results of the ICON Challenge on Algorithm Selection.', comment=None, journal_ref=None, doi=None, primary_category='cs.AI', categories=['cs.AI'], links=[arxiv.Result.Link('http://arxiv.org/abs/1511.04326v1', title=None, rel='alternate', content_type=None), arxiv.Result.Link('http://arxiv.org/pdf/1511.04326v1', title='pdf', rel='related', content_type=None)]),
 arxiv.Result(entry_id='http://arxiv.org/abs/cs/0003012v1', updated=datetime.datetime(2000, 3, 6, 22, 23, tzinfo=datetime.timezone.utc), published=datetime.datetime(2000, 3, 6, 22, 23, tzinfo=datetime.timezone.utc), title='Defeasible Reasoning in OSCAR', authors=[arxiv.Result.Author('John L. 

In [50]:
#Extracting all the titles
titles_list = []

for i in paper_results:
    titles_list.append(i.title)


In [51]:
#Extracting summaries
summary_list = []

for i in paper_results:
    summary_list.append(i.summary)
    

In [52]:
#Extracting comments
comments_list = []

for i in paper_results:
    comments_list.append(i.comment)
    


In [53]:
#Extracting journal references

journal_ref_list = []

for i in paper_results:
    journal_ref_list.append(i.journal_ref)
    


In [54]:
#Extracting DOIs
doi_list = []

for i in paper_results:
    doi_list.append(i.doi)
    


In [55]:
#Extracting Entry IDs

entry_ids_list = []

for i in paper_results:
    entry_ids_list.append(i.entry_id)
    


In [56]:
#Extracting Authors

authors_list = []

for i in paper_results:
    authors_list.append(str(i.authors))
    

In [57]:
#Extracting last updated dates
last_updated_list = []

for i in paper_results:
    last_updated_list.append(str(i.updated))
    


In [58]:
#Extracting published dates

published_date_list = []

for i in paper_results:
    published_date_list.append(str(i.published))
    


In [59]:
#Extracting the primary category of the papers

primary_category_list = []

for i in paper_results:
    primary_category_list.append(i.primary_category)
    

In [60]:
#Extracting all the categories papers could be attributed to

category_list = []

for i in paper_results:
    category_list.append(str(i.categories))
    

In [61]:
#Extracting the links which contains both pdf and abs link

links_list = []

for i in paper_results:
        links_list.append(str(i.links))
    

In [62]:
# Extracting the pdf link of papers

pdf_url_list = []

for i in paper_results:
        pdf_url_list.append(i.pdf_url)
    

After creating lists for each of the attributes. We are now going to stack them as columns for a dataframe that hosts all this data called papers_df 

In [65]:
#I am using the np.column_stack to stack these lists as columns in to single dataframe.

papers_df = pd.DataFrame(np.column_stack([titles_list,
                                          authors_list,
                                          summary_list,
                                          comments_list,
                                          journal_ref_list,
                                          doi_list,
                                          entry_ids_list,
                                          last_updated_list,
                                          published_date_list,
                                          primary_category_list,
                                          category_list,
                                          links_list,
                                          pdf_url_list]),
                         columns=['Title',
                                  'Author',
                                  'Summary',
                                  'Comment',
                                  'Journal_ref',
                                  'DOI',
                                  'Entry_ID',
                                  'Last_updated',
                                  'Published_date',
                                  'Primary_category',
                                  'Category',
                                  'Links',
                                  'PDF_URL'])

In [66]:
papers_df

Unnamed: 0,Title,Author,Summary,Comment,Journal_ref,DOI,Entry_ID,Last_updated,Published_date,Primary_category,Category,Links,PDF_URL
0,ICON Challenge on Algorithm Selection,[arxiv.Result.Author('Lars Kotthoff')],We present the results of the ICON Challenge o...,,,,http://arxiv.org/abs/1511.04326v1,2015-11-12 20:04:31+00:00,2015-11-12 20:04:31+00:00,cs.AI,['cs.AI'],[arxiv.Result.Link('http://arxiv.org/abs/1511....,http://arxiv.org/pdf/1511.04326v1
1,Defeasible Reasoning in OSCAR,[arxiv.Result.Author('John L. Pollock')],This is a system description for the OSCAR def...,"Nonmonotonic Reasoning Workshop, 2000",,,http://arxiv.org/abs/cs/0003012v1,2000-03-06 22:23:00+00:00,2000-03-06 22:23:00+00:00,cs.AI,"['cs.AI', 'F.4.1']",[arxiv.Result.Link('http://arxiv.org/abs/cs/00...,http://arxiv.org/pdf/cs/0003012v1
2,A note on Darwiche and Pearl,[arxiv.Result.Author('Daniel Lehmann')],It is shown that Darwiche and Pearl's postulat...,A small unpublished remark on a paper by Darwi...,,,http://arxiv.org/abs/cs/0202024v1,2002-02-18 15:23:06+00:00,2002-02-18 15:23:06+00:00,cs.AI,"['cs.AI', 'I.2.3']",[arxiv.Result.Link('http://arxiv.org/abs/cs/02...,http://arxiv.org/pdf/cs/0202024v1
3,Utility-Probability Duality,"[arxiv.Result.Author('Ali Abbas'), arxiv.Resul...",This paper presents duality between probabilit...,,,,http://arxiv.org/abs/cs/0311004v1,2003-11-06 07:33:23+00:00,2003-11-06 07:33:23+00:00,cs.AI,"['cs.AI', 'G.3.3']",[arxiv.Result.Link('http://arxiv.org/abs/cs/03...,http://arxiv.org/pdf/cs/0311004v1
4,A primer on Answer Set Programming,[arxiv.Result.Author('Alessandro Provetti')],A introduction to the syntax and Semantics of ...,6 pages,,,http://arxiv.org/abs/cs/0508100v1,2005-08-23 15:05:12+00:00,2005-08-23 15:05:12+00:00,cs.AI,"['cs.AI', 'cs.LO', 'D.1.6; I.2.3']",[arxiv.Result.Link('http://arxiv.org/abs/cs/05...,http://arxiv.org/pdf/cs/0508100v1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
112926,Benchmarking Edge Computing Devices for Grape ...,[arxiv.Result.Author('Sandro Costa Magalhães')...,Purpose: Visual perception enables robots to p...,,"EAAI, 117, 105604 (2022)",10.1016/j.engappai.2022.105604,http://arxiv.org/abs/2211.11647v1,2022-11-21 17:02:33+00:00,2022-11-21 17:02:33+00:00,cs.CV,"['cs.CV', 'cs.AR', 'cs.DC', '62M45, 62P30, 68Q...",[arxiv.Result.Link('http://dx.doi.org/10.1016/...,http://arxiv.org/pdf/2211.11647v1
112927,Profile-Guided Parallel Task Extraction and Ex...,"[arxiv.Result.Author('Liangliang Chang'), arxi...","In this study, we introduce a methodology for ...","8 pages, accepted by ISPA 2022",,,http://arxiv.org/abs/2211.14547v1,2022-11-26 12:01:40+00:00,2022-11-26 12:01:40+00:00,cs.DC,"['cs.DC', 'cs.AR']",[arxiv.Result.Link('http://arxiv.org/abs/2211....,http://arxiv.org/pdf/2211.14547v1
112928,Implementing Neural Network-Based Equalizers i...,"[arxiv.Result.Author('Pedro J. Freire'), arxiv...","In this work, we demonstrate the offline FPGA ...",Invited paper at Journal of Lightwave Technolo...,,,http://arxiv.org/abs/2212.04703v1,2022-12-09 07:28:45+00:00,2022-12-09 07:28:45+00:00,eess.SP,"['eess.SP', 'cs.AR', 'cs.CC', 'cs.LG']",[arxiv.Result.Link('http://arxiv.org/abs/2212....,http://arxiv.org/pdf/2212.04703v1
112929,First-Generation Inference Accelerator Deploym...,"[arxiv.Result.Author('Michael Anderson'), arxi...","In this paper, we provide a deep dive into the...",,,,http://arxiv.org/abs/2107.04140v3,2021-08-04 21:51:51+00:00,2021-07-08 22:52:42+00:00,cs.AR,['cs.AR'],[arxiv.Result.Link('http://arxiv.org/abs/2107....,http://arxiv.org/pdf/2107.04140v3


Filtering the data according to our needs. i.e Published date years (2017 - 2021) & primary category of ('cs.AI','cs.LG','cs.CC','cs.AR')

In [67]:
# First filteration of published dates from 2017 to 2021.

papers_df = papers_df[(papers_df['Published_date'] > '2017-01-01') & (papers_df['Published_date'] < '2021-12-31')]


In [68]:
#2nd filteration step. primary category should only be cs.AI|cs.CC|cs.LG|cs.AR

papers_df = papers_df[papers_df['Primary_category'].str.contains('cs.AI|cs.CC|cs.LG|cs.AR')]


In [69]:
papers_df

Unnamed: 0,Title,Author,Summary,Comment,Journal_ref,DOI,Entry_ID,Last_updated,Published_date,Primary_category,Category,Links,PDF_URL
21,A Tutorial on Modular Ontology Modeling with O...,"[arxiv.Result.Author('Pascal Hitzler'), arxiv....",We provide a detailed example for modular onto...,,,,http://arxiv.org/abs/1808.08433v1,2018-08-25 14:36:00+00:00,2018-08-25 14:36:00+00:00,cs.AI,['cs.AI'],[arxiv.Result.Link('http://arxiv.org/abs/1808....,http://arxiv.org/pdf/1808.08433v1
23,The Book of Why: Review,[arxiv.Result.Author('Joseph Y. Halpern')],"This is a review of ""The Book of Why"", by Jude...","To appear in ""Artificial Intelligence"" journal",,,http://arxiv.org/abs/1909.13485v1,2019-09-30 07:11:50+00:00,2019-09-30 07:11:50+00:00,cs.AI,['cs.AI'],[arxiv.Result.Link('http://arxiv.org/abs/1909....,http://arxiv.org/pdf/1909.13485v1
24,AI Buzzwords Explained: Multi-Agent Path Findi...,"[arxiv.Result.Author('Hang Ma'), arxiv.Result....","Explanation of the hot topic ""multi-agent path...",,,10.1145/3137574.3137579,http://arxiv.org/abs/1710.03774v2,2017-10-17 00:21:44+00:00,2017-10-10 18:24:34+00:00,cs.AI,"['cs.AI', 'cs.MA', 'cs.RO']",[arxiv.Result.Link('http://dx.doi.org/10.1145/...,http://arxiv.org/pdf/1710.03774v2
25,AAAI FSS-18: Artificial Intelligence in Govern...,"[arxiv.Result.Author('Frank Stein'), arxiv.Res...",Proceedings of the AAAI Fall Symposium on Arti...,,,,http://arxiv.org/abs/1810.06018v1,2018-10-14 11:40:30+00:00,2018-10-14 11:40:30+00:00,cs.AI,['cs.AI'],[arxiv.Result.Link('http://arxiv.org/abs/1810....,http://arxiv.org/pdf/1810.06018v1
26,Self-Learned Formula Synthesis in Set Theory,"[arxiv.Result.Author('Chad E. Brown'), arxiv.R...",A reinforcement learning algorithm accomplishe...,,,,http://arxiv.org/abs/1912.01525v1,2019-12-03 16:56:51+00:00,2019-12-03 16:56:51+00:00,cs.AI,"['cs.AI', 'cs.LO']",[arxiv.Result.Link('http://arxiv.org/abs/1912....,http://arxiv.org/pdf/1912.01525v1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
112872,Brain-inspired Cognition in Next Generation Ra...,"[arxiv.Result.Author('Asif Ali Khan'), arxiv.R...",Hyperdimensional computing (HDC) is an emergin...,"Preprint, accepted for publication, ACM Transa...",,10.1145/3524071,http://arxiv.org/abs/2111.02246v2,2022-03-15 22:34:53+00:00,2021-11-03 14:21:39+00:00,cs.LG,"['cs.LG', 'cs.AR', 'cs.ET']",[arxiv.Result.Link('http://dx.doi.org/10.1145/...,http://arxiv.org/pdf/2111.02246v2
112873,MC-CIM: Compute-in-Memory with Monte-Carlo Dro...,"[arxiv.Result.Author('Priyesh Shukla'), arxiv....","We propose MC-CIM, a compute-in-memory (CIM) f...",,,,http://arxiv.org/abs/2111.07125v1,2021-11-13 14:50:48+00:00,2021-11-13 14:50:48+00:00,cs.LG,"['cs.LG', 'cs.AR', 'cs.RO', 'eess.IV']",[arxiv.Result.Link('http://arxiv.org/abs/2111....,http://arxiv.org/pdf/2111.07125v1
112878,Logic Shrinkage: Learned FPGA Netlist Sparsity...,"[arxiv.Result.Author('Erwei Wang'), arxiv.Resu...",FPGA-specific DNN architectures using the nati...,Accepted manuscript uploaded 04/12/21. DOA 22/...,,10.1145/3490422.3502360,http://arxiv.org/abs/2112.02346v2,2022-01-02 12:19:38+00:00,2021-12-04 14:23:24+00:00,cs.LG,"['cs.LG', 'cs.AR']",[arxiv.Result.Link('http://dx.doi.org/10.1145/...,http://arxiv.org/pdf/2112.02346v2
112929,First-Generation Inference Accelerator Deploym...,"[arxiv.Result.Author('Michael Anderson'), arxi...","In this paper, we provide a deep dive into the...",,,,http://arxiv.org/abs/2107.04140v3,2021-08-04 21:51:51+00:00,2021-07-08 22:52:42+00:00,cs.AR,['cs.AR'],[arxiv.Result.Link('http://arxiv.org/abs/2107....,http://arxiv.org/pdf/2107.04140v3


Cleaning : Our dataframe is shaping up. But results had 'arxiv.result.Author' in side the cells. We are now cleaning the data to see authors names as is, in the next step.

In [70]:
papers_df['Author'] = papers_df['Author'].str.replace('arxiv.Result.Author', '')

papers_df['Author'] = papers_df['Author'].str.replace('[', '')

papers_df['Author'] = papers_df['Author'].str.replace(']', '')

papers_df['Author'] = papers_df['Author'].str.replace('(', '')

papers_df['Author'] = papers_df['Author'].str.replace(')', '')

papers_df['Links'] = papers_df['Links'].str.replace('arxiv.Result.Link', '')

papers_df['Links'] = papers_df['Links'].str.replace('[', '')

papers_df['Links'] = papers_df['Links'].str.replace(']', '')

papers_df['Links'] = papers_df['Links'].str.replace('(', '')

papers_df['Links'] = papers_df['Links'].str.replace(')', '')

papers_df['Category'] = papers_df['Category'].str.replace('[', '')

papers_df['Category'] = papers_df['Category'].str.replace(']', '')



  papers_df['Author'] = papers_df['Author'].str.replace('arxiv.Result.Author', '')
  papers_df['Author'] = papers_df['Author'].str.replace('[', '')
  papers_df['Author'] = papers_df['Author'].str.replace(']', '')
  papers_df['Author'] = papers_df['Author'].str.replace('(', '')
  papers_df['Author'] = papers_df['Author'].str.replace(')', '')
  papers_df['Links'] = papers_df['Links'].str.replace('arxiv.Result.Link', '')
  papers_df['Links'] = papers_df['Links'].str.replace('[', '')
  papers_df['Links'] = papers_df['Links'].str.replace(']', '')
  papers_df['Links'] = papers_df['Links'].str.replace('(', '')
  papers_df['Links'] = papers_df['Links'].str.replace(')', '')
  papers_df['Category'] = papers_df['Category'].str.replace('[', '')
  papers_df['Category'] = papers_df['Category'].str.replace(']', '')


In [71]:
papers_df

Unnamed: 0,Title,Author,Summary,Comment,Journal_ref,DOI,Entry_ID,Last_updated,Published_date,Primary_category,Category,Links,PDF_URL
21,A Tutorial on Modular Ontology Modeling with O...,"'Pascal Hitzler', 'Adila Krisnadhi'",We provide a detailed example for modular onto...,,,,http://arxiv.org/abs/1808.08433v1,2018-08-25 14:36:00+00:00,2018-08-25 14:36:00+00:00,cs.AI,'cs.AI',"'http://arxiv.org/abs/1808.08433v1', title=Non...",http://arxiv.org/pdf/1808.08433v1
23,The Book of Why: Review,'Joseph Y. Halpern',"This is a review of ""The Book of Why"", by Jude...","To appear in ""Artificial Intelligence"" journal",,,http://arxiv.org/abs/1909.13485v1,2019-09-30 07:11:50+00:00,2019-09-30 07:11:50+00:00,cs.AI,'cs.AI',"'http://arxiv.org/abs/1909.13485v1', title=Non...",http://arxiv.org/pdf/1909.13485v1
24,AI Buzzwords Explained: Multi-Agent Path Findi...,"'Hang Ma', 'Sven Koenig'","Explanation of the hot topic ""multi-agent path...",,,10.1145/3137574.3137579,http://arxiv.org/abs/1710.03774v2,2017-10-17 00:21:44+00:00,2017-10-10 18:24:34+00:00,cs.AI,"'cs.AI', 'cs.MA', 'cs.RO'","'http://dx.doi.org/10.1145/3137574.3137579', t...",http://arxiv.org/pdf/1710.03774v2
25,AAAI FSS-18: Artificial Intelligence in Govern...,"'Frank Stein', 'Alun Preece', 'Mihai Boicu'",Proceedings of the AAAI Fall Symposium on Arti...,,,,http://arxiv.org/abs/1810.06018v1,2018-10-14 11:40:30+00:00,2018-10-14 11:40:30+00:00,cs.AI,'cs.AI',"'http://arxiv.org/abs/1810.06018v1', title=Non...",http://arxiv.org/pdf/1810.06018v1
26,Self-Learned Formula Synthesis in Set Theory,"'Chad E. Brown', 'Thibault Gauthier'",A reinforcement learning algorithm accomplishe...,,,,http://arxiv.org/abs/1912.01525v1,2019-12-03 16:56:51+00:00,2019-12-03 16:56:51+00:00,cs.AI,"'cs.AI', 'cs.LO'","'http://arxiv.org/abs/1912.01525v1', title=Non...",http://arxiv.org/pdf/1912.01525v1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
112872,Brain-inspired Cognition in Next Generation Ra...,"'Asif Ali Khan', 'Sebastien Ollivier', 'Stephe...",Hyperdimensional computing (HDC) is an emergin...,"Preprint, accepted for publication, ACM Transa...",,10.1145/3524071,http://arxiv.org/abs/2111.02246v2,2022-03-15 22:34:53+00:00,2021-11-03 14:21:39+00:00,cs.LG,"'cs.LG', 'cs.AR', 'cs.ET'","'http://dx.doi.org/10.1145/3524071', title='do...",http://arxiv.org/pdf/2111.02246v2
112873,MC-CIM: Compute-in-Memory with Monte-Carlo Dro...,"'Priyesh Shukla', 'Shamma Nasrin', 'Nastaran D...","We propose MC-CIM, a compute-in-memory (CIM) f...",,,,http://arxiv.org/abs/2111.07125v1,2021-11-13 14:50:48+00:00,2021-11-13 14:50:48+00:00,cs.LG,"'cs.LG', 'cs.AR', 'cs.RO', 'eess.IV'","'http://arxiv.org/abs/2111.07125v1', title=Non...",http://arxiv.org/pdf/2111.07125v1
112878,Logic Shrinkage: Learned FPGA Netlist Sparsity...,"'Erwei Wang', 'James J. Davis', 'Georgios-Ilia...",FPGA-specific DNN architectures using the nati...,Accepted manuscript uploaded 04/12/21. DOA 22/...,,10.1145/3490422.3502360,http://arxiv.org/abs/2112.02346v2,2022-01-02 12:19:38+00:00,2021-12-04 14:23:24+00:00,cs.LG,"'cs.LG', 'cs.AR'","'http://dx.doi.org/10.1145/3490422.3502360', t...",http://arxiv.org/pdf/2112.02346v2
112929,First-Generation Inference Accelerator Deploym...,"'Michael Anderson', 'Benny Chen', 'Stephen Che...","In this paper, we provide a deep dive into the...",,,,http://arxiv.org/abs/2107.04140v3,2021-08-04 21:51:51+00:00,2021-07-08 22:52:42+00:00,cs.AR,'cs.AR',"'http://arxiv.org/abs/2107.04140v3', title=Non...",http://arxiv.org/pdf/2107.04140v3


Finally saving the output to a csv file. This csv file is used in task 2

In [72]:
#finally saving the output to a csv file.

papers_df.to_csv('Arxiv_Papers_Results.csv',index = False)