In [1]:
!python -m pip install arxiv seaborn nltk

You should consider upgrading via the '/home/marti/Projects/etna-research/.venv/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import nltk

# Resources the notebook already fetched.
nltk.download('punkt')
nltk.download('omw-1.4')
# stopwords.words('english') is called further down; without this corpus
# that call raises LookupError on a machine that has never downloaded it.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/marti/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/marti/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
import arxiv
import seaborn as sns
import pandas as pd
pd.set_option('display.max_colwidth', 100)

In [4]:
# Shared arXiv API client: pages of 500 results, delay_seconds=3 between
# requests, and up to 3 retries.
client = arxiv.Client(page_size=500, delay_seconds=3, num_retries=3)

In [5]:
# Query tag -> list of arXiv results collected for it.
results = {
    "forecast": [],
    # "time series": [],  (second query, currently disabled)
}

for tag, papers in results.items():
    search = arxiv.Search(query=tag, sort_by=arxiv.SortCriterion.SubmittedDate)
    for paper in client.results(search):
        # Stop at the first paper published before 2021.
        # NOTE(review): this assumes results arrive newest-first — confirm
        # the default sort order of arxiv.Search.
        if paper.published.year < 2021:
            break
        papers.append(paper)

In [6]:
# Report how many papers were collected for each query tag.
for tag, papers in results.items():
    print(f"tag: {tag} {len(papers)}")

tag: forecast 1763


In [7]:
# One row per collected paper; the columns are the attributes of each result object.
records = [vars(paper) for papers in results.values() for paper in papers]
df = pd.DataFrame(records)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1763 entries, 0 to 1762
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   entry_id          1763 non-null   object             
 1   updated           1763 non-null   datetime64[ns, UTC]
 2   published         1763 non-null   datetime64[ns, UTC]
 3   title             1763 non-null   object             
 4   authors           1763 non-null   object             
 5   summary           1763 non-null   object             
 6   comment           1021 non-null   object             
 7   journal_ref       176 non-null    object             
 8   doi               317 non-null    object             
 9   primary_category  1763 non-null   object             
 10  categories        1763 non-null   object             
 11  links             1763 non-null   object             
 12  pdf_url           1763 non-null   object             
 13  _ra

In [9]:
# Keep only papers whose primary category starts with "cs." or "stat.".
# The pattern is a raw string, anchored at the start, with a non-capturing
# group. The previous unanchored "(cs\.|stat\.)" also matched "physics.*"
# (which contains "cs."), so it needed a second filter to drop physics again.
# It also triggered pandas' "pattern has match groups" warning and relied on
# an invalid "\." escape in a normal string.
is_cs_or_stat = df.primary_category.str.contains(r"^(?:cs|stat)\.", case=False)
df = df[is_cs_or_stat]
# Rows sharing a title are collapsed to the last occurrence.
df = df.drop_duplicates(subset="title", keep="last")

  return func(self, *args, **kwargs)


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1002 entries, 2 to 1761
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   entry_id          1002 non-null   object             
 1   updated           1002 non-null   datetime64[ns, UTC]
 2   published         1002 non-null   datetime64[ns, UTC]
 3   title             1002 non-null   object             
 4   authors           1002 non-null   object             
 5   summary           1002 non-null   object             
 6   comment           526 non-null    object             
 7   journal_ref       79 non-null     object             
 8   doi               105 non-null    object             
 9   primary_category  1002 non-null   object             
 10  categories        1002 non-null   object             
 11  links             1002 non-null   object             
 12  pdf_url           1002 non-null   object             
 13  _ra

In [11]:
from nltk import word_tokenize
from nltk import ngrams
from nltk import stem
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Text-normalisation helpers used by the functions defined in later cells.
tokenizer = RegexpTokenizer(pattern=r'\w{3,}')  # tokens: runs of 3+ word characters
stop_words = {*stopwords.words('english')}
stemmer = stem.SnowballStemmer("english")

In [12]:
import requests

def get_citation_index(x: str):
    """Get citation counter from crossref api.

    Args:
        x: DOI of the paper. ``None``, non-string values and strings of four
            characters or fewer are skipped without sending a request.

    Returns:
        The ``is-referenced-by-count`` value of the Crossref ``/works``
        response, or ``None`` when the DOI is skipped or the lookup fails
        (the DOI is printed in that case).
    """
    from urllib.parse import quote

    # isinstance also guards against NaN-style missing values, on which len() fails.
    if not isinstance(x, str) or len(x) <= 4:
        return None

    # Percent-encode the DOI so characters such as '#', '?' or spaces stay
    # part of the URL path instead of being read as a fragment or query.
    encoded_doi = quote(x.strip(), safe="/")
    url = f"https://api.crossref.org/works/{encoded_doi}"
    try:
        # A timeout keeps one stalled request from hanging the whole column.
        data = requests.get(url, timeout=30).json()
        return data["message"]["is-referenced-by-count"]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Best effort: network errors, non-JSON replies (unknown DOI) and
        # unexpected payloads are reported by printing the DOI.
        print(x)
        return None

def published_flag(comment_message: str, doi: str):
    """Tell whether a paper looks published or accepted at a venue.

    A truthy DOI counts as published. Otherwise the comment is lower-cased,
    tokenized with the notebook's shared ``tokenizer`` and checked for any of
    the marker words below. A ``None`` comment yields ``False``.
    """
    if doi:
        return True
    if comment_message is None:
        return False

    markers = {
        "workshop", "journal", "accepted",
        "code", "conference", "publication",
        "neurips", "submitted", "icml",
        "ieee",
    }
    comment_tokens = tokenizer.tokenize(comment_message.lower())
    return not markers.isdisjoint(comment_tokens)

In [13]:
from typing import List

def pipeline_normalize(x: str) -> List[str]:
    """Tokenize a text, drop stop words and stem what remains.

    Uses the notebook-level ``tokenizer``, ``stop_words`` and ``stemmer``.

    Args:
        x: Raw text (a title or an abstract). Empty or missing text gives ``[]``.

    Returns:
        The stemmed tokens that are not stop words, in their original order.
    """
    if not x:
        return []

    normalized = []
    for token in tokenizer.tokenize(x):
        token = token.lower()
        # Filter on the lower-cased word, not on its stem: stems such as
        # "becaus" or "veri" are not in the stop-word list, so checking
        # after stemming lets those stop words through.
        if token in stop_words:
            continue
        normalized.append(stemmer.stem(token))
    return normalized

In [14]:
# Add normalised token lists and their bigrams for titles and abstracts.
# Later keyword arguments of assign() can read columns created by earlier ones.
df = df.assign(
    parsed_title=lambda frame: frame.title.apply(pipeline_normalize),
    parsed_summary=lambda frame: frame.summary.apply(pipeline_normalize),
    parsed_title_2gram=lambda frame: frame.parsed_title.apply(
        lambda tokens: list(ngrams(tokens, 2))
    ),
    parsed_summary_2gram=lambda frame: frame.parsed_summary.apply(
        lambda tokens: list(ngrams(tokens, 2))
    ),
)

In [15]:
# One Crossref lookup per row, keyed by the paper's DOI.
citation_counts = df["doi"].apply(get_citation_index)
df["citation_index"] = citation_counts

10.1145/3488560.3498511
10.1145/3492323.3495588
10.5445/KSP/1000138532
10.1080/21645515.2021.2017216
10.1285/i20705948v14n1p230
10.1109/ICASSP39728.2021.9414118.
10.5281/zenodo.4478251
10.37421/jhmi.2020.11.342 10.37421/jhmi.2020.11.342


In [16]:
# Flag each paper from its (comment, doi) pair.
df["published_flag"] = [
    published_flag(comment, doi)
    for comment, doi in zip(df["comment"], df["doi"])
]

In [17]:
# The 14 most common primary categories among the remaining papers.
(
    df["primary_category"]
    .value_counts()
    .head(14)
)

cs.LG      488
stat.AP     96
cs.CV       89
stat.ME     78
stat.ML     48
cs.AI       36
cs.SI       24
cs.RO       23
cs.CL       21
cs.CY       18
cs.NI       12
cs.DL        9
cs.DC        9
cs.NE        8
Name: primary_category, dtype: int64

In [18]:
# The 20 most frequent stemmed bigrams across all abstracts.
(
    df["parsed_summary_2gram"]
    .explode()
    .value_counts()
    .head(20)
)

(time, seri)         796
(neural, network)    408
(state, art)         259
(machin, learn)      251
(deep, learn)        241
(real, world)        189
(seri, forecast)     169
(short, term)        160
(forecast, model)    158
(learn, model)       141
(spatio, tempor)     108
(paper, propos)      107
(spatial, tempor)     90
(long, term)          89
(real, time)          88
(result, show)        86
(data, driven)        85
(long, short)         85
(recurr, neural)      84
(term, memori)        82
Name: parsed_summary_2gram, dtype: int64

In [19]:
# The 20 most frequent stemmed bigrams across all titles.
(
    df["parsed_title_2gram"]
    .explode()
    .value_counts()
    .head(20)
)

(time, seri)              175
(neural, network)          84
(seri, forecast)           77
(deep, learn)              51
(machin, learn)            50
(spatio, tempor)           35
(short, term)              34
(forecast, use)            27
(graph, neural)            25
(traffic, forecast)        22
(data, driven)             18
(graph, convolut)          17
(learn, model)             17
(load, forecast)           15
(trajectori, predict)      15
(convolut, network)        15
(recurr, neural)           15
(network, traffic)         14
(trajectori, forecast)     14
(multivari, time)          13
Name: parsed_title_2gram, dtype: int64

In [20]:
# The 20 papers with a DOI that have the highest citation count.
(
    df.loc[df.doi.notnull(), ["title", "citation_index"]]
    .sort_values(by="citation_index", ascending=False)
    .head(20)
)

Unnamed: 0,title,citation_index
1149,A better measure of relative prediction accuracy for model selection and model estimation,178.0
1145,The Semantic Brand Score,33.0
1111,Forecasting managerial turnover through e-mail based social network analysis,32.0
1381,An Experimental Review on Deep Learning Architectures for Time Series Forecasting,21.0
552,Weak Signals in the Mobility Landscape: Car Sharing in Ten European Cities,11.0
1714,Learning to Anticipate Egocentric Actions by Imagination,8.0
1146,Forecasting election results by studying brand importance in online news,8.0
1455,The RECIPE Approach to Challenges in Deeply Heterogeneous High Performance Systems,8.0
1627,Gesture Recognition in Robotic Surgery: a Review,7.0
1098,Look inside. Predicting stock prices by analysing an enterprise intranet social network and usin...,5.0


In [21]:
# Titles, sorted alphabetically, of flagged-as-published papers whose
# abstract contains the stemmed bigram ("time", "seri").
published = df[df.published_flag]
mentions_time_series = published.parsed_summary_2gram.apply(
    lambda grams: ("time", "seri") in grams
)
published[mentions_time_series].title.sort_values().to_list()

['A Comparative Analysis of Machine Learning and Grey Models',
 'A Comparative Study of Using Spatial-Temporal Graph Convolutional Networks for Predicting Availability in Bike Sharing Schemes',
 'A Daily Tourism Demand Prediction Framework Based on Multi-head Attention CNN: The Case of The Foreign Entrant in South Korea',
 'A Robust and Efficient Multi-Scale Seasonal-Trend Decomposition',
 'A Spatio-Temporal Model for Predicting Wind Speeds in Southern California',
 'A Study of Joint Graph Inference and Forecasting',
 'A Trainable Reconciliation Method for Hierarchical Time-Series',
 'A study on Ensemble Learning for Time Series Forecasting and the need for Meta-Learning',
 'A systematic review of Python packages for time series analysis',
 'AGSTN: Learning Attention-adjusted Graph Spatio-Temporal Networks for Short-term Urban Sensor Value Forecasting',
 'AdaRNN: Adaptive Learning and Forecasting of Time Series',
 'Adversarial autoencoders and adversarial LSTM for improved forecasts of

In [22]:
# Titles, in frame order, of flagged-as-published papers whose abstract
# contains the stemmed bigram ("neural", "network").
published = df[df.published_flag]
mentions_neural_network = published.parsed_summary_2gram.apply(
    lambda grams: ("neural", "network") in grams
)
published[mentions_neural_network].title.to_list()

['Artificial Intelligence and Statistical Techniques in Short-Term Load Forecasting: A Review',
 'GOPHER: Categorical probabilistic forecasting with graph structure via local continuous-time dynamics',
 'Graph Neural Controlled Differential Equations for Traffic Forecasting',
 'Probabilistic Deep Learning to Quantify Uncertainty in Air Quality Forecasting',
 'A Daily Tourism Demand Prediction Framework Based on Multi-head Attention CNN: The Case of The Foreign Entrant in South Korea',
 'Machine Learning-Based Soft Sensors for Vacuum Distillation Unit',
 'Smart Data Representations: Impact on the Accuracy of Deep Neural Networks',
 'A Comparative Study on Basic Elements of Deep Learning Models for Spatial-Temporal Traffic Forecasting',
 'Forecasting Crude Oil Price Using Event Extraction',
 'Observation Error Covariance Specification in Dynamical Systems for Data assimilation using Recurrent Neural Networks',
 'Predictive Auto-scaling with OpenStack Monasca',
 'Spatiotemporal Weather Da