In [1]:
import arxiv
import seaborn as sns
import pandas as pd

# Raise pandas display limits so long text cells (titles, abstracts) and
# long tables are shown up to 300 characters / 300 rows before truncation.
pd.options.display.max_colwidth = 300
pd.options.display.max_rows = 300

In [2]:
# Shared arXiv API client used by the search cell below: 500 results per
# page, a 3 second delay between requests, and up to 10 retries.
client = arxiv.Client(page_size=500, delay_seconds=3, num_retries=10)

In [3]:
# Papers published before this year are not collected.
MIN_PUBLISHED_YEAR = 2021

# Search term -> list of arxiv result objects collected for that term.
results = {
    "forecast": [],
    # "time series": [],
}

for tag in results:
    search = arxiv.Search(
        query=tag,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        # The early `break` below is only correct when results arrive
        # newest-first, so the order is stated explicitly instead of relying
        # on the library default.
        sort_order=arxiv.SortOrder.Descending,
    )
    for result in client.results(search):
        # Newest-first ordering: the first paper older than the cutoff means
        # every remaining paper is older too, so stop paging.
        if result.published.year < MIN_PUBLISHED_YEAR:
            break
        results[tag].append(result)

In [4]:
# Report how many papers were collected for each search term.
for tag, papers in results.items():
    print(f"tag: {tag} {len(papers)}")

tag: forecast 1782


In [5]:
# One row per collected paper; the columns are the instance attributes of
# each result object (private attributes included).
df = pd.DataFrame(
    [vars(paper) for papers in results.values() for paper in papers]
)

In [6]:
# Inspect column dtypes and non-null counts of the collected results.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1782 entries, 0 to 1781
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   entry_id          1782 non-null   object             
 1   updated           1782 non-null   datetime64[ns, UTC]
 2   published         1782 non-null   datetime64[ns, UTC]
 3   title             1782 non-null   object             
 4   authors           1782 non-null   object             
 5   summary           1782 non-null   object             
 6   comment           1033 non-null   object             
 7   journal_ref       177 non-null    object             
 8   doi               318 non-null    object             
 9   primary_category  1782 non-null   object             
 10  categories        1782 non-null   object             
 11  links             1782 non-null   object             
 12  pdf_url           1782 non-null   object             
 13  _ra

In [7]:
# Keep only papers whose primary category belongs to the cs.* or stat.*
# archives.
# - `str.match` anchors the pattern at the start of the string, so categories
#   that merely contain "cs." (e.g. "physics.*") no longer match and need no
#   separate exclusion step.
# - The raw string avoids the invalid "\." escape, and the non-capturing
#   group avoids the pandas "pattern has match groups" warning that the
#   previous `str.contains("(cs\.|stat\.)")` call raised.
df = df[df.primary_category.str.match(r"(?:cs|stat)\.", case=False)]
# Keep only the last row for each title.
df = df.drop_duplicates(subset="title", keep="last")

  return func(self, *args, **kwargs)


In [8]:
# Re-check row count and non-null counts after category filtering and
# title de-duplication.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1012 entries, 2 to 1780
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   entry_id          1012 non-null   object             
 1   updated           1012 non-null   datetime64[ns, UTC]
 2   published         1012 non-null   datetime64[ns, UTC]
 3   title             1012 non-null   object             
 4   authors           1012 non-null   object             
 5   summary           1012 non-null   object             
 6   comment           530 non-null    object             
 7   journal_ref       79 non-null     object             
 8   doi               106 non-null    object             
 9   primary_category  1012 non-null   object             
 10  categories        1012 non-null   object             
 11  links             1012 non-null   object             
 12  pdf_url           1012 non-null   object             
 13  _ra

In [9]:
from utils import pipeline_normalize, get_citation_index, arxiv_published_flag
from nltk import ngrams

In [10]:
def _bigrams(tokens):
    """Return the consecutive 2-grams of ``tokens`` as a list of tuples."""
    return list(ngrams(tokens, 2))


# Add four derived columns: the output of `pipeline_normalize` for the title
# and the summary, plus the bigram list of each. Callables passed to `assign`
# are evaluated in order, so the bigram columns can read the parsed columns
# created just before them.
df = df.assign(
    parsed_title=lambda frame: frame["title"].apply(pipeline_normalize),
    parsed_summary=lambda frame: frame["summary"].apply(pipeline_normalize),
    parsed_title_2gram=lambda frame: frame["parsed_title"].apply(_bigrams),
    parsed_summary_2gram=lambda frame: frame["parsed_summary"].apply(_bigrams),
)

In [11]:
# Run `get_citation_index` on every row's DOI (rows without a DOI included)
# and store the result in a new `citation_index` column.
# NOTE(review): presumably a citation count looked up per DOI - confirm in utils.
df = df.assign(citation_index=df["doi"].apply(get_citation_index))

10.1145/3488560.3498511
10.1145/3492323.3495588
10.5445/KSP/1000138532
10.1080/21645515.2021.2017216
10.1285/i20705948v14n1p230
10.1109/ICASSP39728.2021.9414118.
10.5281/zenodo.4478251
10.37421/jhmi.2020.11.342 10.37421/jhmi.2020.11.342


In [12]:
def _published_flag_for(row):
    """Apply ``arxiv_published_flag`` to one paper's comment and DOI."""
    return arxiv_published_flag(row.comment, row.doi)


# Row-wise: derive the `published_flag` column from each paper's comment and DOI.
df["published_flag"] = df.apply(_published_flag_for, axis=1)

In [13]:
# The 14 most frequent primary categories among the remaining papers.
category_counts = df["primary_category"].value_counts()
category_counts.head(14)

cs.LG      492
stat.AP     96
cs.CV       90
stat.ME     78
stat.ML     48
cs.AI       38
cs.SI       24
cs.RO       23
cs.CL       22
cs.CY       18
cs.NI       12
cs.DL        9
cs.DC        9
stat.CO      9
Name: primary_category, dtype: int64

## Popular bigrams

In [14]:
# Flatten the per-paper bigram lists into one row per bigram, then show the
# 20 most frequent bigrams across all summaries.
summary_bigrams = df["parsed_summary_2gram"].explode()
summary_bigrams.value_counts().head(20)

(time, seri)         805
(neural, network)    410
(state, art)         264
(machin, learn)      254
(deep, learn)        242
(real, world)        192
(seri, forecast)     175
(short, term)        160
(forecast, model)    158
(learn, model)       141
(spatio, tempor)     108
(paper, propos)      107
(spatial, tempor)     91
(long, term)          89
(real, time)          88
(result, show)        87
(long, short)         85
(recurr, neural)      85
(data, driven)        85
(term, memori)        82
Name: parsed_summary_2gram, dtype: int64

In [15]:
# Flatten the per-paper bigram lists into one row per bigram, then show the
# 20 most frequent bigrams across all titles.
title_bigrams = df["parsed_title_2gram"].explode()
title_bigrams.value_counts().head(20)

(time, seri)              177
(neural, network)          85
(seri, forecast)           78
(deep, learn)              51
(machin, learn)            50
(spatio, tempor)           35
(short, term)              34
(forecast, use)            28
(graph, neural)            25
(traffic, forecast)        22
(data, driven)             18
(graph, convolut)          17
(learn, model)             17
(load, forecast)           16
(convolut, network)        15
(trajectori, predict)      15
(recurr, neural)           15
(trajectori, forecast)     14
(network, traffic)         14
(case, studi)              13
Name: parsed_title_2gram, dtype: int64

In [16]:
# Papers that have a DOI, ordered by `citation_index` from highest to lowest;
# show the first 20.
(
    df.loc[df["doi"].notna()]
    .sort_values("citation_index", ascending=False)
    .loc[:, ["title", "citation_index", "published_flag"]]
    .head(20)
)

Unnamed: 0,title,citation_index,published_flag
1168,A better measure of relative prediction accuracy for model selection and model estimation,178.0,10.1057/jors.2014.103
1164,The Semantic Brand Score,33.0,10.1016/j.jbusres.2018.03.026
1130,Forecasting managerial turnover through e-mail based social network analysis,32.0,10.1016/j.chb.2017.02.017
1400,An Experimental Review on Deep Learning Architectures for Time Series Forecasting,23.0,10.1142/S0129065721300011
571,Weak Signals in the Mobility Landscape: Car Sharing in Ten European Cities,11.0,10.1140/epjds/s13688-019-0186-8
1733,Learning to Anticipate Egocentric Actions by Imagination,8.0,10.1109/TIP.2020.3040521
1165,Forecasting election results by studying brand importance in online news,8.0,10.1016/j.ijforecast.2019.05.013
1474,The RECIPE Approach to Challenges in Deeply Heterogeneous High Performance Systems,8.0,10.1016/j.micpro.2020.103185
1235,Time series forecasting of new cases and new deaths rate for COVID-19 using deep learning methods,8.0,10.1016/j.rinp.2021.104495
1646,Gesture Recognition in Robotic Surgery: a Review,7.0,10.1109/TBME.2021.3054828


In [19]:
# Papers with a non-null `published_flag` whose summary bigrams include
# ("time", "seri"), listed alphabetically by title.
(
    df.loc[df["published_flag"].notna()]
    .loc[
        lambda papers: papers["parsed_summary_2gram"].apply(
            lambda grams: ("time", "seri") in grams
        )
    ]
    .sort_values("title")
    .loc[:, ["title", "citation_index", "published_flag"]]
)

Unnamed: 0,title,citation_index,published_flag
1353,A Comparative Analysis of Machine Learning and Grey Models,,[journal]
1262,A Comparative Study of Using Spatial-Temporal Graph Convolutional Networks for Predicting Availability in Bike Sharing Schemes,,"[ieee, accepted]"
178,A Daily Tourism Demand Prediction Framework Based on Multi-head Attention CNN: The Case of The Foreign Entrant in South Korea,,"[ieee, accepted]"
582,A Robust and Efficient Multi-Scale Seasonal-Trend Decomposition,1.0,10.1109/ICASSP39728.2021.9413939
1153,A Spatio-Temporal Model for Predicting Wind Speeds in Southern California,,[journal]
611,A Study of Joint Graph Inference and Forecasting,,"[icml, workshop]"
1769,A Trainable Reconciliation Method for Hierarchical Time-Series,,"[conference, accepted]"
1252,A study on Ensemble Learning for Time Series Forecasting and the need for Meta-Learning,,[accepted]
1291,A systematic review of Python packages for time series analysis,,[accepted]
1652,AGSTN: Learning Attention-adjusted Graph Spatio-Temporal Networks for Short-term Urban Sensor Value Forecasting,,"[code, ieee]"


In [20]:
# Papers with a non-null `published_flag` whose summary bigrams include
# ("neural", "network"), listed alphabetically by title.
(
    df.loc[df["published_flag"].notna()]
    .loc[
        lambda papers: papers["parsed_summary_2gram"].apply(
            lambda grams: ("neural", "network") in grams
        )
    ]
    .sort_values("title")
    .loc[:, ["title", "citation_index", "published_flag"]]
)

Unnamed: 0,title,citation_index,published_flag
1262,A Comparative Study of Using Spatial-Temporal Graph Convolutional Networks for Predicting Availability in Bike Sharing Schemes,,"[ieee, accepted]"
261,A Comparative Study on Basic Elements of Deep Learning Models for Spatial-Temporal Traffic Forecasting,,[workshop]
178,A Daily Tourism Demand Prediction Framework Based on Multi-head Attention CNN: The Case of The Foreign Entrant in South Korea,,"[ieee, accepted]"
1467,A Deep-Learning Framework to Predict the Dynamics of a Human-Driven Vehicle Based on the Road Geometry,,"[submitted, ieee, publication]"
611,A Study of Joint Graph Inference and Forecasting,,"[icml, workshop]"
1769,A Trainable Reconciliation Method for Hierarchical Time-Series,,"[conference, accepted]"
1741,A deep learning modeling framework to capture mixing patterns in reactive-transport systems,0.0,10.4208/cicp.OA-2021-0088
1652,AGSTN: Learning Attention-adjusted Graph Spatio-Temporal Networks for Short-term Urban Sensor Value Forecasting,,"[code, ieee]"
680,Adaptive Explainable Continual Learning Framework for Regression Problems with Focus on Power Forecasts,,[accepted]
1400,An Experimental Review on Deep Learning Architectures for Time Series Forecasting,23.0,10.1142/S0129065721300011
