## Arxiv Paper Dataframe by Crawling

<br>

In [None]:
%pip install arxiv
%pip install clipboard
%pip install pyautogui

In [1]:
import re
import tqdm
import arxiv
import clipboard
import pyautogui
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta

In [2]:
def make_arxiv_paper_df_with_abstract(paper_ids):

    arxiv_paper_df_with_abstract = pd.DataFrame({'Title':['Noun'],
                               'Journal/Conference':['Noun'],
                               'Date':['Noun'], 
                               'Author':['Noun'],
                               'Link':['Noun'],
                               'Abstract':['Noun']})
    client = arxiv.Client()
    
    pbar = tqdm.tqdm(paper_ids)

    for idx, paper_id in enumerate(pbar):
        search = arxiv.Search(id_list=[paper_id])
        paper = next(client.results(search))
        
        paper_journal_conf = re.search(r'[A-Z ]+[0-9]+[0-9]+[0-9]+[0-9]', str(paper.comment))
        if paper_journal_conf != None:
            paper_journal_conf = paper_journal_conf.group().strip()
            if len(paper_journal_conf) > 4:
                if paper_journal_conf[-4] != " ":
                    paper_journal_conf = paper_journal_conf[:-4] + " " + paper_journal_conf[-4:]
                else:
                    paper_journal_conf = paper_journal_conf
            elif len(paper_journal_conf) <= 4:
                paper_journal_conf = ""
        elif paper_journal_conf == None:
            paper_journal_conf = ""

        arxiv_paper_df_with_abstract.loc[idx] = [paper.title, 
                                paper_journal_conf,
                                paper.published.date(), 
                                str(paper.authors[0]) + ' et al',
                                    paper.entry_id,
                                    paper.summary]
    pbar.close()
    
    arxiv_paper_df_with_abstract = pd.DataFrame(arxiv_paper_df_with_abstract.sort_values(by='Date').reset_index()).drop(['index'], axis='columns')
    arxiv_paper_df_with_abstract.index = np.arange(1, len(arxiv_paper_df_with_abstract) + 1)         
    
    return arxiv_paper_df_with_abstract

In [3]:
def str_convert_datetime(date):
    return datetime.strptime(date, '%Y-%m-%d').date()

In [4]:
def add_other_papers_column(arxiv_paper_df_with_abstract, other_papers):
  
  df_length = len(arxiv_paper_df_with_abstract) - 1

  pbar = tqdm.tqdm(other_papers)

  for other_paper in pbar:
    df_length += 1
    arxiv_paper_df_with_abstract.loc[df_length] = other_paper
  
  arxiv_paper_df_with_abstract = pd.DataFrame(arxiv_paper_df_with_abstract.sort_values(by='Date').reset_index()).drop(['index'], axis='columns')
  arxiv_paper_df_with_abstract.index = np.arange(1, len(arxiv_paper_df_with_abstract) + 1)  
  
  pbar.close()
  
  return arxiv_paper_df_with_abstract

In [5]:
def hyperlink(x):
    hyperlink= '[Link]' + '(' + x + ')'
    return hyperlink

In [6]:
def input_jouranl_conference_theme(arxiv_paper_df_with_abstract):

    paper_title = arxiv_paper_df_with_abstract['Title']
    paper_journal_conference = arxiv_paper_df_with_abstract['Journal/Conference']
    arxiv_paper_df_with_abstract['Theme'] = ""
    paper_theme = arxiv_paper_df_with_abstract['Theme']

    pyautogui.alert('Input Paper Jouranl Conference')

    for index, (title, journal_conference) in enumerate(zip(paper_title, paper_journal_conference)):

        if len(journal_conference) < 2:
            clipboard.copy(title)
            input_journal_conference = input("{} For {}: ".format("Input Journal & Conference", title)) 
            arxiv_paper_df_with_abstract.loc[index, 'Journal/Conference'] = input_journal_conference

    pyautogui.alert('Input Paper Theme')

    for index, (title, theme) in enumerate(zip(paper_title, paper_theme)):

        if len(theme) < 2:
            clipboard.copy(title)
            input_theme = input("{} For {}: ".format("Input Theme", title)) 
            arxiv_paper_df_with_abstract.loc[index, 'Theme'] = input_theme

    arxiv_paper_df_with_abstract = arxiv_paper_df_with_abstract[['Title', 'Journal/Conference', 'Date', 'Author', 'Theme', 'Link', 'Abstract']]

    return arxiv_paper_df_with_abstract

In [7]:
def make_arxiv_paper_df_with_abstract_by_theme(theme_order, arxiv_paper_df_with_abstract):

    def sorter(column):
        mapper = {name: order for order, name in enumerate(theme_order)}
        return column.map(mapper)

    arxiv_paper_df_with_abstract_by_theme = arxiv_paper_df_with_abstract.sort_values(by=['Theme', 'Date'], key=sorter, ascending=True).reset_index() 
    del arxiv_paper_df_with_abstract_by_theme['index']
    arxiv_paper_df_with_abstract_by_theme.index += 1 
    arxiv_paper_df_with_abstract_by_theme = arxiv_paper_df_with_abstract_by_theme.set_index('Theme', append=True).swaplevel(0, 1)

    return arxiv_paper_df_with_abstract_by_theme

In [8]:
def make_arxiv_paper_df(arxiv_paper_df_with_abstract):

    arxiv_paper_df_with_abstract = pd.DataFrame(arxiv_paper_df_with_abstract.sort_values(by='Date').reset_index()).drop(['index'], axis='columns')
    arxiv_paper_df_with_abstract.index = np.arange(1, len(arxiv_paper_df_with_abstract) + 1)    
    arxiv_paper_df = arxiv_paper_df_with_abstract.drop(['Abstract'], axis='columns')

    return arxiv_paper_df

In [9]:
def make_arxiv_paper_df_by_theme(arxiv_paper_df_with_abstract_by_theme):
  
    arxiv_paper_df_by_theme = arxiv_paper_df_with_abstract_by_theme.drop(['Abstract'], axis='columns')

    return arxiv_paper_df_by_theme

In [16]:
paper_ids = ["2403.09032", "2403.08295", "1911.02150", "2104.09864",
             "2205.01543", "1908.10084", "2310.06825", "2110.05679",
             "1610.05820", "2012.07805", "1611.03530", "2305.04388",
             "2010.06053", "1607.00133", "2009.03106", "2011.11660",
             "1510.01799", "2206.11309", "1801.07243", "1706.09254",
             "1412.6980", "1908.08345", "2210.03992", "2105.09680",
             "2106.14448", "2307.13304", "2205.03835", "1704.04368",
             "1611.04230", "2205.11315", "2004.12832", "2212.09114",
             "2212.09114", "1906.00300", "2004.04906", "2012.12624", 
             "1704.00051", "2208.04232", "2104.08663", "2203.05794",
             "1904.08375", "2203.08372", "1805.04833", "2108.05540",
             "2104.00369", "1904.09675", "2004.04696", "1804.08771",
             "2304.11015", "2106.15339", "1901.11196", "2003.02245",
             "2004.12239", "1904.09545", "2105.07624", "2305.02301",
             "1711.09846", "1908.07442", "2012.06678", "2207.08815",
             "1603.02754", "2301.13808", "2004.02349", "2112.07337",
             "2205.14690", "2207.03637", "1608.03983", "1508.00305",
             "2004.14373", "1709.00103", "1511.06335"]

arxiv_paper_df_with_abstract = make_arxiv_paper_df_with_abstract(paper_ids)

100%|██████████| 71/71 [04:12<00:00,  3.56s/it]


In [17]:
other_papers = [["A Recurrent BERT-based Model for Question Generation", "ACL 2019", str_convert_datetime("2019-01-01"),
                "Ying-Hong Chan et al", "https://aclanthology.org/D19-5821/", 
                "In this study, we investigate the employment of the pre-trained BERT language model to tackle question generation tasks. We introduce three neural architectures built on top of BERT for question generation tasks. The first one is a straightforward BERT employment, which reveals the defects of directly using BERT for text generation. Accordingly, we propose another two models by restructuring our BERT employment into a sequential manner for taking information from previous decoded results. Our models are trained and evaluated on the recent question-answering dataset SQuAD. Experiment results show that our best model yields state-of-the-art performance which advances the BLEU 4 score of the existing best models from 16.85 to 22.17."],
                ["Hierarchical Attention Networks for Document Classification", "NAACL 2016", str_convert_datetime("2016-01-01"),
                "Zichao Yang et al", "https://aclanthology.org/N16-1174/", 
                "We propose a hierarchical attention network for document classification. Our model has two distinctive characteristics: (i) it has a hierarchical structure that mirrors the hierarchical structure of documents; (ii) it has two levels of attention mechanisms applied at the wordand sentence-level, enabling it to attend differentially to more and less important content when constructing the document representation. Experiments conducted on six large scale text classification tasks demonstrate that the proposed architecture outperform previous methods by a substantial margin. Visualization of the attention layers illustrates that the model selects qualitatively informative words and sentences."], 
                ["METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments", "ACL 2005", str_convert_datetime("2005-01-01"),
                "Satanjeev Banerjee et al", "https://aclanthology.org/W05-0909/9", 
                "We describe METEOR, an automatic metric for machine translation evaluation that is based on a generalized concept of unigram matching between the machineproduced translation and human-produced reference translations. Unigrams can be matched based on their surface forms, stemmed forms, and meanings; furthermore, METEOR can be easily extended to include more advanced matching strategies. Once all generalized unigram matches between the two strings have been found, METEOR computes a score for this matching using a combination of unigram-precision, unigram-recall, and a measure of fragmentation that is designed to directly capture how well-ordered the matched words in the machine translation are in relation to the reference. We evaluate METEOR by measuring the correlation between the metric scores and human judgments of translation quality. We compute the Pearson R correlation value between its scores and human quality assessments of the LDC TIDES 2003 Arabic-to-English and Chinese-to-English datasets. We perform segment-bysegment correlation, and show that METEOR gets an R correlation value of 0.347 on the Arabic data and 0.331 on the Chinese data. This is shown to be an improvement on using simply unigramprecision, unigram-recall and their harmonic F1 combination. We also perform experiments to show the relative contributions of the various mapping modules."],
                ["Random Forests", "Machine Learning, Volume 45", str_convert_datetime("2001-01-01"),
                "Leo Breiman", "https://link.springer.com/article/10.1023/A:1010933404324", 
                "Random forests are a combination of tree predictors such that each tree depends on the values of a random vector sampled independently and with the same distribution for all trees in the forest. The generalization error for forests converges a.s. to a limit as the number of trees in the forest becomes large. The generalization error of a forest of tree classifiers depends on the strength of the individual trees in the forest and the correlation between them. Using a random selection of features to split each node yields error rates that compare favorably to Adaboost (Y. Freund & R. Schapire, Machine Learning: Proceedings of the Thirteenth International conference, ***, 148–156), but are more robust with respect to noise. Internal estimates monitor error, strength, and correlation and these are used to show the response to increasing the number of features used in the splitting. Internal estimates are also used to measure variable importance. These ideas are also applicable to regression."],
                 ["DoT: An efficient Double Transformer for NLP tasks with tables", "ACL Findings 2021", str_convert_datetime("2021-01-01"),
                "Syrine Krichene et al", "https://aclanthology.org/2021.findings-acl.289/", 
                "Transformer-based approaches have been successfully used to obtain state-of-the-art accuracy on natural language processing (NLP) tasks with semi-structured tables. These model architectures are typically deep, resulting in slow training and inference, especially for long inputs. To improve efficiency while maintaining a high accuracy, we propose a new architecture, DoT, a double transformer model, that decomposes the problem into two sub-tasks: A shallow pruning transformer that selects the top-K tokens, followed by a deep task-specific transformer that takes as input those K tokens. Additionally, we modify the task-specific attention to incorporate the pruning scores. The two transformers are jointly trained by optimizing the task-specific loss. We run experiments on three benchmarks, including entailment and question-answering. We show that for a small drop of accuracy, DoT improves training and inference time by at least 50%. We also show that the pruning transformer effectively selects relevant tokens enabling the end-to-end model to maintain similar accuracy as slower baseline models. Finally, we analyse the pruning and give some insight into its impact on the task model."],
                 ["Visualizing Data using t-SNE", "JMLR 2008", str_convert_datetime("2008-01-01"),
                "Laurens van der Maaten et al", "https://www.jmlr.org/papers/volume9/vandermaaten08a/vandermaaten08a.pdf", 
                "We present a new technique called “t-SNE” that visualizes high-dimensional data by giving each datapoint a location in a two or three-dimensional map. The technique is a variation of Stochastic Neighbor Embedding (Hinton and Roweis, 2002) that is much easier to optimize, and produces significantly better visualizations by reducing the tendency to crowd points together in the center of the map. t-SNE is better than existing techniques at creating a single map that reveals structure at many different scales. This is particularly important for high-dimensional data that lie on several different, but related, low-dimensional manifolds, such as images of objects from multiple classes seen from multiple viewpoints. For visualizing the structure of very large data sets, we show how t-SNE can use random walks on neighborhood graphs to allow the implicit structure of all of the data to influence the way in which a subset of the data is displayed. We illustrate the performance of t-SNE on a wide variety of data sets and compare it with many other non-parametric visualization techniques, including Sammon mapping, Isomap, and Locally Linear Embedding. The visualizations produced by t-SNE are significantly better than those produced by the other techniques on almost all of the data sets."] ]
\
arxiv_paper_df_with_abstract = add_other_papers_column(arxiv_paper_df_with_abstract, other_papers)

100%|██████████| 6/6 [00:00<00:00, 854.38it/s]


In [18]:
arxiv_paper_df_with_abstract["Link"] = arxiv_paper_df_with_abstract["Link"].apply(hyperlink)

In [19]:
arxiv_paper_df_with_abstract = input_jouranl_conference_theme(arxiv_paper_df_with_abstract)

In [None]:
arxiv_paper_df_with_abstract.to_excel('arxiv_paper_df_with_abstract.xlsx', index=False)

In [16]:
arxiv_paper_df_with_abstract = pd.read_excel('arxiv_paper_df_with_abstract.xlsx', engine='openpyxl')

In [39]:
theme_order = ["Language Model", "Security", "Benchmark", 
               "Neural Network", "Information Retrieval", 
               "Tabular Learning", "Knowledge Distillation"]

arxiv_paper_df_with_abstract_by_theme = make_arxiv_paper_df_with_abstract_by_theme(theme_order, arxiv_paper_df_with_abstract)

In [41]:
arxiv_paper_df_with_abstract

Unnamed: 0,Title,Journal/Conference,Date,Author,Theme,Link,Abstract
0,Random Forests,"Machine Learning, Volume 45",2001-01-01,Leo Breiman,Benchmark,[Link](https://link.springer.com/article/10.10...,Random forests are a combination of tree predi...
1,METEOR: An Automatic Metric for MT Evaluation ...,ACL 2005,2005-01-01,Satanjeev Banerjee et al,Neural Network,[Link](https://aclanthology.org/W05-0909/9),"We describe METEOR, an automatic metric for ma..."
2,Visualizing Data using t-SNE,ICLR 2015,2008-01-01,Laurens van der Maaten et al,Neural Network,[Link](https://www.jmlr.org/papers/volume9/van...,We present a new technique called “t-SNE” that...
3,Adam: A Method for Stochastic Optimization,ACL 2015,2014-12-22,Diederik P. Kingma et al,Tabular Learning,[Link](http://arxiv.org/abs/1412.6980v9),"We introduce Adam, an algorithm for first-orde..."
4,Compositional Semantic Parsing on Semi-Structu...,,2015-08-03,Panupong Pasupat et al,Neural Network,[Link](http://arxiv.org/abs/1508.00305v1),Two important aspects of semantic parsing for ...
...,...,...,...,...,...,...,...
71,Distilling Step-by-Step! Outperforming Larger ...,ACL 2023,2023-05-03,Cheng-Yu Hsieh et al,Language Model,[Link](http://arxiv.org/abs/2305.02301v2),Deploying large language models (LLMs) is chal...
72,Language Models Don't Always Say What They Thi...,,2023-05-07,Miles Turpin et al,Language Model,[Link](http://arxiv.org/abs/2305.04388v2),Large Language Models (LLMs) can achieve stron...
73,QuIP: 2-Bit Quantization of Large Language Mod...,,2023-07-25,Jerry Chee et al,Language Model,[Link](http://arxiv.org/abs/2307.13304v2),This work studies post-training parameter quan...
74,Mistral 7B,,2023-10-10,Albert Q. Jiang et al,Language Model,[Link](http://arxiv.org/abs/2310.06825v1),"We introduce Mistral 7B v0.1, a 7-billion-para..."


In [42]:
arxiv_paper_df_with_abstract_by_theme

Unnamed: 0_level_0,Unnamed: 1_level_0,Title,Journal/Conference,Date,Author,Link,Abstract
Theme,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Language Model,1,Unsupervised Deep Embedding for Clustering Ana...,,2015-11-19,Junyuan Xie et al,[Link](http://arxiv.org/abs/1511.06335v2),Clustering is central to many data-driven appl...
Language Model,2,Understanding deep learning requires rethinkin...,ICLR 2017,2016-11-10,Chiyuan Zhang et al,[Link](http://arxiv.org/abs/1611.03530v2),"Despite their massive size, successful deep ar..."
Language Model,3,Reading Wikipedia to Answer Open-Domain Questions,ACL 2017,2017-03-31,Danqi Chen et al,[Link](http://arxiv.org/abs/1704.00051v2),This paper proposes to tackle open- domain que...
Language Model,4,Population Based Training of Neural Networks,ACL 2018,2017-11-27,Max Jaderberg et al,[Link](http://arxiv.org/abs/1711.09846v2),Neural networks dominate the modern machine le...
Language Model,5,A Call for Clarity in Reporting BLEU Scores,ACL 2018,2018-04-23,Matt Post et al,[Link](http://arxiv.org/abs/1804.08771v2),The field of machine translation faces an unde...
...,...,...,...,...,...,...,...
Tabular Learning,72,OmniTab: Pretraining with Natural and Syntheti...,NeurIPS 2022,2022-07-08,Zhengbao Jiang et al,[Link](http://arxiv.org/abs/2207.03637v1),The information in tables can be an important ...
Tabular Learning,73,CAPSTONE: Curriculum Sampling for Dense Retrie...,EMNLP 2023,2022-12-18,Xingwei He et al,[Link](http://arxiv.org/abs/2212.09114v2),The dual-encoder has become the de facto archi...
Tabular Learning,74,Large Language Models are Versatile Decomposer...,SIGIR 2023,2023-01-31,Yunhu Ye et al,[Link](http://arxiv.org/abs/2301.13808v3),Table-based reasoning has shown remarkable pro...
Knowledge Distillation,75,DIN-SQL: Decomposed In-Context Learning of Tex...,IPS 2023,2023-04-21,Mohammadreza Pourreza et al,[Link](http://arxiv.org/abs/2304.11015v3),There is currently a significant gap between t...


In [43]:
arxiv_paper_df_with_abstract.to_excel("arxiv_paper_df_with_abstract.xlsx")
arxiv_paper_df.to_excel("arxiv_paper_df.xlsx")
# arxiv_paper_df_with_abstract = pd.read_excel("arxiv_paper_df_with_abstract.xlsx", engine='openpyxl')

arxiv_paper_df_with_abstract_by_theme.to_excel("arxiv_paper_df_with_abstract_by_theme.xlsx")
arxiv_paper_df_by_theme.to_excel("arxiv_paper_df_by_theme.xlsx")
# arxiv_paper_df_with_abstract_by_theme = pd.read_excel("arxiv_paper_df_with_abstract_by_theme.xlsx", engine='openpyxl')

### Upload Dataframe on Github

[Excel to Markdown Converter](https://tabletomarkdown.com/convert-spreadsheet-to-markdown/)