In [1]:
import arxiv
import seaborn as sns
import pandas as pd

# Raise pandas' display limits to 300 for both the per-cell text width and the
# number of rows shown, so long titles and long listings are not truncated.
for option_name in ("display.max_colwidth", "display.max_rows"):
    pd.set_option(option_name, 300)

In [2]:
# Single arXiv API client reused by every query in this notebook.
# Settings: page_size=500, delay_seconds=3, num_retries=10.
client = arxiv.Client(page_size=500, delay_seconds=3, num_retries=10)

In [3]:
# Results published before this calendar year are not collected.
MIN_PUBLISHED_YEAR = 2022

# Maps each search query ("tag") to the list of arxiv results collected for it.
results = {
    "forecast": [],
    #"time series": []  # second query, currently disabled
}

for tag in results:
    search = arxiv.Search(
        query=tag,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        # The early exit below is only correct when results arrive newest-first,
        # so request that order explicitly instead of relying on the default.
        sort_order=arxiv.SortOrder.Descending,
    )
    for result in client.results(search):
        # Everything after the first too-old result is older still: stop paging.
        if result.published.year < MIN_PUBLISHED_YEAR:
            break
        results[tag].append(result)

In [4]:
# Report how many results were collected for each search tag.
for tag, papers in results.items():
    print(f"tag: {tag} {len(papers)}")

tag: forecast 1833


In [5]:
# One row per collected result, across all tags; each result's instance
# attributes (its __dict__) become the columns.
df = pd.DataFrame(
    [vars(paper) for papers in results.values() for paper in papers]
)

In [6]:
# Overview of the unfiltered frame: row count, columns, non-null counts, dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1833 entries, 0 to 1832
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   entry_id          1833 non-null   object             
 1   updated           1833 non-null   datetime64[ns, UTC]
 2   published         1833 non-null   datetime64[ns, UTC]
 3   title             1833 non-null   object             
 4   authors           1833 non-null   object             
 5   summary           1833 non-null   object             
 6   comment           1032 non-null   object             
 7   journal_ref       156 non-null    object             
 8   doi               312 non-null    object             
 9   primary_category  1833 non-null   object             
 10  categories        1833 non-null   object             
 11  links             1833 non-null   object             
 12  pdf_url           1833 non-null   object             
 13  _ra

In [7]:
# Keep only papers whose primary category belongs to the cs.* or stat.* archives.
# The pattern is:
#   * a raw string, so `\.` is a regex escape rather than an invalid Python
#     string escape;
#   * free of capturing groups, which make `str.contains` emit a
#     "pattern has match groups" UserWarning;
#   * anchored with `^`, so "physics.*" can no longer match through its
#     trailing "cs." (which previously required a separate physics exclusion).
df = df[df.primary_category.str.contains(r"^(?:cs|stat)\.", case=False)]
# Drop rows with a repeated title, keeping the last occurrence of each.
df = df.drop_duplicates(subset="title", keep="last")

  df = df[df.primary_category.str.contains("(cs\.|stat\.)", case=False)]


In [8]:
# Same overview again, now for the frame left after the category filter and
# title de-duplication.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1045 entries, 1 to 1832
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   entry_id          1045 non-null   object             
 1   updated           1045 non-null   datetime64[ns, UTC]
 2   published         1045 non-null   datetime64[ns, UTC]
 3   title             1045 non-null   object             
 4   authors           1045 non-null   object             
 5   summary           1045 non-null   object             
 6   comment           524 non-null    object             
 7   journal_ref       71 non-null     object             
 8   doi               99 non-null     object             
 9   primary_category  1045 non-null   object             
 10  categories        1045 non-null   object             
 11  links             1045 non-null   object             
 12  pdf_url           1045 non-null   object             
 13  _ra

In [9]:
from utils import pipeline_normalize, get_citation_index, arxiv_published_flag
from nltk import ngrams

In [10]:
def _bigrams(tokens):
    """Return every pair of consecutive items in ``tokens`` as a list."""
    return list(ngrams(tokens, 2))


# `assign` evaluates keyword callables in order, so the *_2gram columns can read
# the parsed_* columns created just before them in the same call.
df = df.assign(
    parsed_title=lambda frame: frame.title.apply(pipeline_normalize),
    parsed_summary=lambda frame: frame.summary.apply(pipeline_normalize),
    parsed_title_2gram=lambda frame: frame.parsed_title.apply(_bigrams),
    parsed_summary_2gram=lambda frame: frame.parsed_summary.apply(_bigrams),
)

In [11]:
# get_citation_index (from utils) is called on every value of the doi column,
# including the rows where the DOI is missing.
df["citation_index"] = df["doi"].apply(get_citation_index)

10.5445/KSP/1000151141
10.15480/882.4694
10.1145/3524846.3527341
10.5281/zenodo.6500885
10.13140/RG.2.2.34063.92324


In [12]:
def _published_flag_for_row(row):
    """Call arxiv_published_flag with one row's comment and doi values."""
    return arxiv_published_flag(row.comment, row.doi)


# Row-wise apply, because the helper needs two columns of the same paper.
df["published_flag"] = df.apply(_published_flag_for_row, axis=1)

In [13]:
# The 14 most frequent primary categories among the remaining papers.
df["primary_category"].value_counts().iloc[:14]

cs.LG      537
cs.CV      122
stat.AP     72
stat.ME     63
stat.ML     52
cs.AI       38
cs.RO       27
cs.SI       17
cs.DC       15
cs.NI       14
cs.CL       13
cs.CY       12
cs.CE       12
cs.HC        9
Name: primary_category, dtype: int64

## Popular bigrams

In [14]:
# Flatten the per-paper bigram lists into one Series (one bigram per row),
# then show the 20 most frequent bigrams found in summaries.
df["parsed_summary_2gram"].explode().value_counts().iloc[:20]

(time, seri)         788
(neural, network)    339
(state, art)         322
(machin, learn)      282
(deep, learn)        247
(real, world)        242
(seri, forecast)     205
(forecast, model)    179
(learn, model)       145
(short, term)        134
(spatio, tempor)     127
(paper, propos)      119
(spatial, tempor)    118
(long, term)         116
(propos, novel)       96
(predict, model)      92
(result, show)        90
(data, driven)        84
(extens, experi)      79
(larg, scale)         79
Name: parsed_summary_2gram, dtype: int64

In [15]:
# Flatten the per-paper bigram lists into one Series (one bigram per row),
# then show the 20 most frequent bigrams found in titles.
df["parsed_title_2gram"].explode().value_counts().iloc[:20]

(time, seri)           185
(seri, forecast)        99
(neural, network)       65
(deep, learn)           54
(machin, learn)         44
(spatio, tempor)        40
(short, term)           29
(traffic, forecast)     27
(multivari, time)       24
(long, term)            23
(spatial, tempor)       22
(forecast, use)         22
(graph, neural)         20
(tempor, graph)         16
(motion, forecast)      16
(load, forecast)        15
(network, traffic)      14
(learn, model)          14
(learn, approach)       14
(transfer, learn)       13
Name: parsed_title_2gram, dtype: int64

In [16]:
# Papers that have a DOI, ordered by citation_index (highest first); top 20.
(
    df.loc[df["doi"].notna(), ["title", "citation_index", "published_flag"]]
    .sort_values("citation_index", ascending=False)
    .iloc[:20]
)

Unnamed: 0,title,citation_index,published_flag
1734,Electrical Load Forecasting Using Edge Computing and Federated Learning,42.0,10.1109/ICC40277.2020.9148937
335,Wind Power Forecasting Considering Data Privacy Protection: A Federated Deep Reinforcement Learning Approach,8.0,10.1016/j.apenergy.2022.120291
1557,Parallel Spatio-Temporal Attention-Based TCN for Multivariate Time Series Prediction,5.0,10.1007/S00521-021-05958-Z
1593,Systematic review of deep learning and machine learning for building energy,5.0,10.3389/fenrg.2022.786027
698,Advancing the cybersecurity of the healthcare system with self-optimising and self-adaptative artificial intelligence (part 2),4.0,10.1007/s12553-022-00691-6
1804,Forecasting Loss of Signal in Optical Networks with Machine Learning,3.0,10.1364/JOCN.423667
1742,COVID-19 forecasting using new viral variants and vaccination effectiveness models,3.0,10.1016/j.compbiomed.2022.105986
1775,Determination of building flood risk maps from LiDAR mobile mapping data,3.0,10.1016/j.compenvurbsys.2022.101759
1466,Knowledge Graph-Enabled Text-Based Automatic Personality Prediction,3.0,10.1155/2022/3732351
1525,Drivers and challenges of internet of things diffusion in smart stores: A field exploration,3.0,10.1016/j.techfore.2022.121593


In [17]:
# Papers with a non-null published_flag whose parsed summary contains the
# bigram ("time", "seri"), listed alphabetically by title.
(
    df.loc[df["published_flag"].notna()]
    .loc[lambda papers: papers["parsed_summary_2gram"].apply(lambda grams: ("time", "seri") in grams)]
    .sort_values("title")
    .loc[:, ["title", "citation_index", "published_flag"]]
)

Unnamed: 0,title,citation_index,published_flag
1433,A Deep Learning Approach to Probabilistic Forecasting of Weather,,[submitted]
1521,A Novel Deep Learning Model for Hotel Demand and Revenue Prediction amid COVID-19,0.0,10.24251/HICSS.2022.217
1129,A hybrid-model approach for reducing the performance gap in building energy forecasting,0.0,10.1016/j.aei.2022.101627
123,Agnostic Learning for Packing Machine Stoppage Prediction in Smart Factories,0.0,10.52953/LEDZ3942
210,An Anomaly Detection Method for Satellites Using Monte Carlo Dropout,0.0,10.1109/TAES.2022.3206257
1225,An Edge-Cloud Integrated Framework for Flexible and Dynamic Stream Analytics,0.0,10.1016/j.future.2022.07.023
198,An Extreme-Adaptive Time Series Prediction Model Based on Probability-Enhanced LSTM Neural Networks,,"[accepted, code]"
1147,Are Transformers Effective for Time Series Forecasting?,,[code]
1742,COVID-19 forecasting using new viral variants and vaccination effectiveness models,3.0,10.1016/j.compbiomed.2022.105986
287,Comparison of Uncertainty Quantification with Deep Learning in Time Series Regression,,[neurips]


In [18]:
# Papers with a non-null published_flag whose parsed summary contains the
# bigram ("neural", "network"), listed alphabetically by title.
(
    df.loc[df["published_flag"].notna()]
    .loc[lambda papers: papers["parsed_summary_2gram"].apply(lambda grams: ("neural", "network") in grams)]
    .sort_values("title")
    .loc[:, ["title", "citation_index", "published_flag"]]
)

Unnamed: 0,title,citation_index,published_flag
856,4G 5G Cell-level Multi-indicator Forecasting based on Dense-MLP,,[journal]
1048,A Spatio-Temporal Neural Network Forecasting Approach for Emulation of Firefront Models,0.0,10.23919/SPA53010.2022.9927888
411,Accurate Extrinsic Prediction of Physical Systems Using Transformers,,[submitted]
210,An Anomaly Detection Method for Satellites Using Monte Carlo Dropout,0.0,10.1109/TAES.2022.3206257
1225,An Edge-Cloud Integrated Framework for Flexible and Dynamic Stream Analytics,0.0,10.1016/j.future.2022.07.023
324,An Efficient FPGA-based Accelerator for Deep Forest,,[conference]
198,An Extreme-Adaptive Time Series Prediction Model Based on Probability-Enhanced LSTM Neural Networks,,"[accepted, code]"
1759,Analyzing Multispectral Satellite Imagery of South American Wildfires Using Deep Learning,1.0,10.1109/ICAPAI55158.2022.9801567
404,Attention-Based Scattering Network for Satellite Imagery,,"[workshop, neurips]"
953,Back to MLP: A Simple Baseline for Human Motion Prediction,,"[accepted, code]"
