In [19]:
# OpenAlex concept ids, keyed by a human-readable concept label.
unparsed_works_ids = {
    "Time Series": "C151406439",
    "Probabilistic forecasting": "C122282355",
    "Demand forecasting": "C193809577",
}
# Works-listing URL for every concept, built from its id.
concepts = {
    label: f"https://api.openalex.org/works?filter=concepts.id:{openalex_id}"
    for label, openalex_id in unparsed_works_ids.items()
}
# One independent, initially empty list of raw work records per concept.
unparsed_works_dict = {label: [] for label in unparsed_works_ids}

In [20]:
import requests

import seaborn as sns
import pandas as pd
# Show up to 300 characters per cell so long titles stay readable.
pd.options.display.max_colwidth = 300

In [21]:
# Concept id used for the single trial request below; the same id is listed
# under "Probabilistic forecasting" in `unparsed_works_ids`.
CONCEPT_ID = "C122282355"
# Value substituted into the `publication_year` filter of the query URL.
YEAR = 2022
# Threshold substituted into the `cited_by_count:>` filter of the query URL.
cited_by_count = 10

In [22]:
# Works-listing query URL. Placeholders: concept_id, publication_year,
# cited_by_count (strict lower bound) and page_id; the page size is fixed at 50.
TEMPLATE = (
    "https://api.openalex.org/works"
    "?filter=concepts.id:{concept_id}"
    ",publication_year:{publication_year}"
    ",cited_by_count:>{cited_by_count}"
    "&per-page=50&page={page_id}"
)

In [23]:
# Trial request: first result page for CONCEPT_ID.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here rather than in a later cell.
responce = requests.get(
    TEMPLATE.format(
        concept_id=CONCEPT_ID,
        publication_year=YEAR,
        cited_by_count=cited_by_count,
        page_id=1,
    ),
    timeout=30,
)
responce.raise_for_status()

In [24]:
from typing import Any, List, Optional

from pydantic import BaseModel

class Author(BaseModel):
    """Author record nested inside an authorship entry."""

    id: str
    display_name: str
    # Explicit default: pydantic v1 implies None for a bare Optional field,
    # pydantic v2 would treat it as required.
    orcid: Optional[str] = None

class Institution(BaseModel):
    """Institution attached to an authorship; every field may be absent."""

    # Explicit None defaults keep these fields optional under pydantic v2 as
    # well as v1 (v1 implies the default, v2 does not).
    id: Optional[str] = None
    display_name: Optional[str] = None
    country_code: Optional[str] = None

class Authorship(BaseModel):
    """One author of a work together with the institutions listed for them."""
    author: Author
    institutions: List[Institution]
    # Position label as delivered by the API (the sample output shows 'first').
    author_position: str

class Concept(BaseModel):
    """Concept tag attached to a work."""

    id: str
    display_name: str
    # Explicit default so the field stays optional under pydantic v2 too.
    score: Optional[float] = None
    level: int
    wikidata: str

class Work(BaseModel):
    """Subset of a works record that this notebook uses.

    Field order is significant: it determines the column order of the
    DataFrame that is later built from ``.dict()``.
    """

    id: str
    display_name: str
    publication_date: str
    # Optional fields carry an explicit None default: pydantic v1 implies it,
    # pydantic v2 would otherwise require the key to be present.
    relevance_score: Optional[float] = None
    authorships: List[Authorship]
    concepts: List[Concept]
    cited_by_count: int
    publication_year: int
    doi: Optional[str] = None
    referenced_works: List[str]
    related_works: List[str]
    # Token -> list of positions; None when the record has no abstract index.
    abstract_inverted_index: Optional[dict] = None

In [25]:
# Download every result page for each concept into unparsed_works_dict.
for concept_name, concept_id in unparsed_works_ids.items():
    # Start from an empty list so that re-running this cell replaces the
    # previous download instead of appending duplicate works to it.
    unparsed_works_dict[concept_name] = []
    page_id = 1
    while True:
        url = TEMPLATE.format(
            concept_id=concept_id,
            publication_year=YEAR,
            cited_by_count=cited_by_count,
            page_id=page_id,
        )
        print(url)
        # Bounded wait, and fail loudly on an HTTP error: an error response
        # may not carry a "results" key at all.
        http_response = requests.get(url, timeout=30)
        http_response.raise_for_status()
        responce = http_response.json()
        page_results = responce["results"]
        print(len(page_results))
        # An empty page marks the end of the result set for this concept.
        if not page_results:
            break
        unparsed_works_dict[concept_name] += page_results
        page_id += 1

https://api.openalex.org/works?filter=concepts.id:C151406439,publication_year:2022,cited_by_count:>10&per-page=50&page=1
34
https://api.openalex.org/works?filter=concepts.id:C151406439,publication_year:2022,cited_by_count:>10&per-page=50&page=2
0
https://api.openalex.org/works?filter=concepts.id:C122282355,publication_year:2022,cited_by_count:>10&per-page=50&page=1
12
https://api.openalex.org/works?filter=concepts.id:C122282355,publication_year:2022,cited_by_count:>10&per-page=50&page=2
0
https://api.openalex.org/works?filter=concepts.id:C193809577,publication_year:2022,cited_by_count:>10&per-page=50&page=1
2
https://api.openalex.org/works?filter=concepts.id:C193809577,publication_year:2022,cited_by_count:>10&per-page=50&page=2
0


In [26]:
# Validate every raw record into a Work model, grouped by concept.
parsed_works = {
    concept_name: [Work(**raw_work) for raw_work in raw_works]
    for concept_name, raw_works in unparsed_works_dict.items()
}

In [27]:
# Report how many raw records were downloaded for each concept.
for concept_name in parsed_works:
    downloaded = unparsed_works_dict[concept_name]
    print(f"{concept_name}: {len(downloaded)}")

Time Series: 34
Probabilistic forecasting: 12
Demand forecasting: 2


In [28]:
# Work id -> Work model; filled by the per-work download loop.
parsed_works_with_index = {}

In [29]:
import tqdm

In [30]:
# Fetch each work individually by id, with a progress bar per concept.
for concept_name in parsed_works:
    for work in tqdm.tqdm(parsed_works[concept_name]):
        # A work can be listed under several concepts; fetch it only once.
        if work.id in parsed_works_with_index:
            continue
        # e.g. https://openalex.org/W123 -> https://api.openalex.org/W123
        url_api = work.id.replace("https://", "https://api.", 1)
        # Bounded wait, and stop on an HTTP error instead of trying to parse
        # an error body as a Work.
        work_response = requests.get(url_api, timeout=30)
        work_response.raise_for_status()
        parsed_works_with_index[work.id] = Work(**work_response.json())

100%|██████████| 34/34 [00:20<00:00,  1.62it/s]
100%|██████████| 12/12 [00:06<00:00,  1.95it/s]
100%|██████████| 2/2 [00:01<00:00,  1.50it/s]


In [31]:
# One row per unique work; columns follow the Work model's field order.
df = pd.DataFrame([work.dict() for work in parsed_works_with_index.values()])

In [32]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,id,display_name,publication_date,relevance_score,authorships,concepts,cited_by_count,publication_year,doi,referenced_works,related_works,abstract_inverted_index
0,https://openalex.org/W4206392762,Fractional-Order Discrete-Time SIR Epidemic Model with Vaccination: Chaos and Complexity,2022-01-06,,"[{'author': {'id': 'https://openalex.org/A4208827294', 'display_name': 'Zai-Yin He', 'orcid': None}, 'institutions': [{'id': 'https://openalex.org/I16609230', 'display_name': 'Hunan University', 'country_code': 'CN'}], 'author_position': 'first'}, {'author': {'id': 'https://openalex.org/A4208827...","[{'id': 'https://openalex.org/C191544260', 'display_name': 'Lyapunov exponent', 'score': 0.75074357, 'level': 3, 'wikidata': 'https://www.wikidata.org/wiki/Q1238630'}, {'id': 'https://openalex.org/C33923547', 'display_name': 'Mathematics', 'score': 0.62190425, 'level': 0, 'wikidata': 'https://ww...",100,2022,https://doi.org/10.3390/math10020165,"[https://openalex.org/W1994320166, https://openalex.org/W2003384337, https://openalex.org/W2014208786, https://openalex.org/W2031064583, https://openalex.org/W2035495750, https://openalex.org/W2043547535, https://openalex.org/W2063889362, https://openalex.org/W2077204677, https://openalex.org/W2...","[https://openalex.org/W646915967, https://openalex.org/W1975119414, https://openalex.org/W1984850264, https://openalex.org/W1990987436, https://openalex.org/W2039252722, https://openalex.org/W2084611103, https://openalex.org/W2378563815, https://openalex.org/W3014947789, https://openalex.org/W42...","{'This': [0], 'research': [1], 'presents': [2], 'a': [3], 'new': [4], 'fractional-order': [5], 'discrete-time': [6], 'susceptible-infected-recovered': [7], '(SIR)': [8], 'epidemic': [9, 48], 'model': [10, 19, 49, 64], 'with': [11, 51], 'vaccination.': [12], 'The': [13, 61, 78], 'dynamical': [14]..."
1,https://openalex.org/W3198255590,Underestimated impact of the COVID-19 on carbon emission reduction in developing countries – A novel assessment based on scenario analysis,2022-03-01,,"[{'author': {'id': 'https://openalex.org/A2890808284', 'display_name': 'Qiang Wang', 'orcid': None}, 'institutions': [{'id': 'https://openalex.org/I204553293', 'display_name': 'China University of Petroleum, Beijing', 'country_code': 'CN'}], 'author_position': 'first'}, {'author': {'id': 'https:...","[{'id': 'https://openalex.org/C24338571', 'display_name': 'Autoregressive integrated moving average', 'score': 0.8217881, 'level': 3, 'wikidata': 'https://www.wikidata.org/wiki/Q2566298'}, {'id': 'https://openalex.org/C47737302', 'display_name': 'Greenhouse gas', 'score': 0.7026352, 'level': 2, ...",71,2022,https://doi.org/10.1016/j.envres.2021.111990,"[https://openalex.org/W1720804347, https://openalex.org/W1963528266, https://openalex.org/W1979373126, https://openalex.org/W1983184804, https://openalex.org/W1984051156, https://openalex.org/W2020896637, https://openalex.org/W2021826571, https://openalex.org/W2073004501, https://openalex.org/W2...","[https://openalex.org/W1657138622, https://openalex.org/W2332779710, https://openalex.org/W2619056591, https://openalex.org/W2756100189, https://openalex.org/W2793024279, https://openalex.org/W2918851169, https://openalex.org/W3017456571, https://openalex.org/W3110063481, https://openalex.org/W3...","{'Existing': [0], 'studies': [1], 'on': [2, 9, 15, 35, 55, 181, 207], 'the': [3, 6, 30, 33, 40, 43, 48, 87, 97, 105, 124, 130, 133, 142, 146, 161, 170, 176, 179, 202, 205], 'impact': [4, 31, 177, 203], 'of': [5, 19, 32, 90, 104, 119, 137, 149, 165, 178, 201, 204], 'COVID-19': [7], 'pandemic': [8..."
2,https://openalex.org/W4213138287,PFVAE: A Planar Flow-Based Variational Auto-Encoder Prediction Model for Time Series Data,2022-02-16,,"[{'author': {'id': 'https://openalex.org/A2096258791', 'display_name': 'Xue-Bo Jin', 'orcid': 'https://orcid.org/0000-0002-2230-0077'}, 'institutions': [{'id': 'https://openalex.org/I179026463', 'display_name': 'Beijing Technology and Business University', 'country_code': 'CN'}], 'author_positio...","[{'id': 'https://openalex.org/C22019652', 'display_name': 'Overfitting', 'score': 0.81038165, 'level': 3, 'wikidata': 'https://www.wikidata.org/wiki/Q331309'}, {'id': 'https://openalex.org/C41008148', 'display_name': 'Computer science', 'score': 0.69117624, 'level': 0, 'wikidata': 'https://www.w...",56,2022,https://doi.org/10.3390/math10040610,"[https://openalex.org/W1993648480, https://openalex.org/W2010954562, https://openalex.org/W2039569176, https://openalex.org/W2042297527, https://openalex.org/W2106297219, https://openalex.org/W2110242546, https://openalex.org/W2116396438, https://openalex.org/W2136291495, https://openalex.org/W2...","[https://openalex.org/W1632690555, https://openalex.org/W2170984326, https://openalex.org/W2242271381, https://openalex.org/W2357809648, https://openalex.org/W2378555542, https://openalex.org/W2381421930, https://openalex.org/W2393330879, https://openalex.org/W2498331889, https://openalex.org/W2...","{'Prediction': [0], 'based': [1], 'on': [2], 'time': [3, 20, 70, 101, 135], 'series': [4, 21, 71, 102, 136], 'has': [5], 'a': [6, 41, 69], 'wide': [7], 'range': [8], 'of': [9, 19, 25, 84, 100, 109], 'applications.': [10], 'Due': [11], 'to': [12, 74, 94, 122], 'the': [13, 23, 33, 52, 60, 64, 76, ..."
3,https://openalex.org/W4214658714,A Variational Bayesian Deep Network with Data Self-Screening Layer for Massive Time-Series Data Forecasting,2022-02-25,,"[{'author': {'id': 'https://openalex.org/A2096258791', 'display_name': 'Xue-Bo Jin', 'orcid': 'https://orcid.org/0000-0002-2230-0077'}, 'institutions': [{'id': 'https://openalex.org/I179026463', 'display_name': 'Beijing Technology and Business University', 'country_code': 'CN'}], 'author_positio...","[{'id': 'https://openalex.org/C41008148', 'display_name': 'Computer science', 'score': 0.71171415, 'level': 0, 'wikidata': 'https://www.wikidata.org/wiki/Q21198'}, {'id': 'https://openalex.org/C124101348', 'display_name': 'Data mining', 'score': 0.6646417, 'level': 1, 'wikidata': 'https://www.wi...",48,2022,https://doi.org/10.3390/e24030335,"[https://openalex.org/W1992970622, https://openalex.org/W1993648480, https://openalex.org/W2010954562, https://openalex.org/W2039569176, https://openalex.org/W2042297527, https://openalex.org/W2061232022, https://openalex.org/W2106297219, https://openalex.org/W2110242546, https://openalex.org/W2...","[https://openalex.org/W2058204059, https://openalex.org/W2471828438, https://openalex.org/W2907177309, https://openalex.org/W2909443907, https://openalex.org/W2994560360, https://openalex.org/W3114881531, https://openalex.org/W3175573522, https://openalex.org/W3203688742, https://openalex.org/W4...","{'Compared': [0], 'with': [1, 83, 96], 'mechanism-based': [2], 'modeling': [3, 6], 'methods,': [4], 'data-driven': [5], 'based': [7], 'on': [8, 60], 'big': [9, 52], 'data': [10, 33, 73, 79, 95, 127], 'has': [11], 'become': [12], 'a': [13, 36, 66, 78, 84, 103, 131], 'popular': [14], 'research': [..."
4,https://openalex.org/W4229365374,Short-Term Wind Power Prediction via Spatial Temporal Analysis and Deep Residual Networks,2022-05-09,,"[{'author': {'id': 'https://openalex.org/A4229428196', 'display_name': 'Huajin Li', 'orcid': None}, 'institutions': [{'id': 'https://openalex.org/I4210125143', 'display_name': 'Chengdu University', 'country_code': 'CN'}], 'author_position': 'first'}]","[{'id': 'https://openalex.org/C78600449', 'display_name': 'Wind power', 'score': 0.8466325, 'level': 2, 'wikidata': 'https://www.wikidata.org/wiki/Q43302'}, {'id': 'https://openalex.org/C155512373', 'display_name': 'Residual', 'score': 0.6728252, 'level': 2, 'wikidata': 'https://www.wikidata.org...",29,2022,https://doi.org/10.3389/fenrg.2022.920407,"[https://openalex.org/W1634828275, https://openalex.org/W1672514028, https://openalex.org/W1981047607, https://openalex.org/W1984061847, https://openalex.org/W2044118423, https://openalex.org/W2116341502, https://openalex.org/W2191329106, https://openalex.org/W2285130932, https://openalex.org/W2...","[https://openalex.org/W1482055478, https://openalex.org/W1998469123, https://openalex.org/W2026619973, https://openalex.org/W2037194002, https://openalex.org/W2056521106, https://openalex.org/W2119244117, https://openalex.org/W2149860591, https://openalex.org/W2370333049, https://openalex.org/W2...","{'Wind': [0], 'power': [1, 15, 30, 43, 98, 113], 'is': [2, 16], 'a': [3, 27, 58, 90], 'rapidly': [4], 'growing': [5], 'source': [6], 'of': [7, 13, 158], 'clean': [8], 'energy.': [9], 'Accurate': [10], 'short-term': [11, 96, 163], 'forecasting': [12, 31, 38], 'wind': [14, 29, 42, 97, 102, 106, 11..."


In [33]:
def index_to_abstract(index: Optional[dict]) -> Optional[str]:
    """Rebuild a lower-cased abstract from an inverted index.

    Args:
        index: Mapping from a token to the list of positions at which it
            occurs, or ``None`` when no index is available.

    Returns:
        ``None`` if ``index`` is ``None``; otherwise the tokens placed at their
        positions, joined by single spaces and lower-cased. Positions that no
        token claims are left as empty strings, and an index without any
        positions yields ``""``.
    """
    if index is None:
        return None

    # Flatten to (position, token) pairs. A token with an empty position list
    # contributes nothing, instead of making max() fail on an empty sequence.
    placements = [
        (position, token)
        for token, positions in index.items()
        for position in positions
    ]
    if not placements:
        return ""

    words = [""] * (max(position for position, _ in placements) + 1)
    for position, token in placements:
        words[position] = token
    return " ".join(words).lower()

In [34]:
# Readable abstract text rebuilt from the inverted index, then a shorter
# "title" alias for the display name (assignment order fixes column order).
df["summary"] = df["abstract_inverted_index"].apply(index_to_abstract)
df["title"] = df.display_name

## Top 20 papers in Time Series in 2022 by citation count

In [35]:
# Twenty most cited works, showing only the title and citation count.
(
    df[["title", "cited_by_count"]]
    .sort_values(by="cited_by_count", ascending=False)
    .head(20)
)

Unnamed: 0,title,cited_by_count
0,Fractional-Order Discrete-Time SIR Epidemic Model with Vaccination: Chaos and Complexity,100
1,Underestimated impact of the COVID-19 on carbon emission reduction in developing countries – A novel assessment based on scenario analysis,71
2,PFVAE: A Planar Flow-Based Variational Auto-Encoder Prediction Model for Time Series Data,56
3,A Variational Bayesian Deep Network with Data Self-Screening Layer for Massive Time-Series Data Forecasting,48
4,Short-Term Wind Power Prediction via Spatial Temporal Analysis and Deep Residual Networks,29
5,A novel hybrid model based on nonlinear weighted combination for short-term wind power forecasting,28
6,Evaluation of urban bus service reliability on variable time horizons using a hybrid deep learning method,26
34,Sparse Gaussian process regression for multi-step ahead forecasting of wind gusts combining numerical weather predictions and on-site measurements,23
35,Short-term load forecasting based on LSTM networks considering attention mechanism,22
7,Multi-step wind speed forecasting and Hurst analysis using novel hybrid secondary decomposition approach,22


In [36]:
from utils import pipeline_normalize
from nltk import ngrams

In [37]:
# Normalise titles and summaries, then derive their bigram lists.
# Each step is its own assign() so it can read the column added just before.
df = (
    df
    .assign(parsed_title=lambda frame: frame["title"].apply(pipeline_normalize))
    .assign(parsed_summary=lambda frame: frame["summary"].apply(pipeline_normalize))
    .assign(
        parsed_title_2gram=lambda frame: frame["parsed_title"].apply(
            lambda tokens: list(ngrams(tokens, 2))
        )
    )
    .assign(
        parsed_summary_2gram=lambda frame: frame["parsed_summary"].apply(
            lambda tokens: list(ngrams(tokens, 2))
        )
    )
)

In [38]:
# Twenty most frequent bigrams across all summaries.
df["parsed_summary_2gram"].explode().value_counts().head(20)

(time, seri)               45
(short, term)              36
(wind, power)              25
(neural, network)          25
(deep, learn)              23
(forecast, model)          21
(load, forecast)           20
(power, forecast)          20
(long, short)              20
(term, memori)             19
(probabilist, forecast)    18
(machin, learn)            18
(wind, speed)              18
(lstm, model)              17
(long, term)               17
(predict, model)           15
(learn, model)             14
(hybrid, model)            14
(power, system)            14
(predict, accuraci)        14
Name: parsed_summary_2gram, dtype: int64

In [39]:
# Twenty most frequent bigrams across all titles.
df["parsed_title_2gram"].explode().value_counts().head(20)

(time, seri)         14
(short, term)        10
(power, forecast)     6
(machin, learn)       5
(long, short)         5
(neural, network)     4
(wind, power)         4
(term, memori)        4
(model, base)         3
(term, wind)          3
(novel, hybrid)       3
(hybrid, model)       3
(wind, speed)         3
(multi, step)         3
(solar, power)        3
(gate, recurr)        3
(seri, analysi)       3
(seri, forecast)      3
(seri, data)          3
(load, forecast)      3
Name: parsed_title_2gram, dtype: int64

## Top 20 papers in Time Series in 2022 with the `neural-network` bigram

In [40]:
# Works whose summary contains the ("neural", "network") bigram, most cited first.
(
    df
    .loc[
        lambda frame: frame["parsed_summary_2gram"].apply(
            lambda bigrams: ("neural", "network") in bigrams
        )
    ]
    .sort_values(by="cited_by_count", ascending=False)[["title", "cited_by_count"]]
)

Unnamed: 0,title,cited_by_count
1,Underestimated impact of the COVID-19 on carbon emission reduction in developing countries – A novel assessment based on scenario analysis,71
4,Short-Term Wind Power Prediction via Spatial Temporal Analysis and Deep Residual Networks,29
5,A novel hybrid model based on nonlinear weighted combination for short-term wind power forecasting,28
6,Evaluation of urban bus service reliability on variable time horizons using a hybrid deep learning method,26
7,Multi-step wind speed forecasting and Hurst analysis using novel hybrid secondary decomposition approach,22
10,Time series predicting of COVID-19 based on deep learning,20
12,Bayesian optimization based dynamic ensemble for time series forecasting,19
15,Convolutional neural network fault classification based on time-series analysis for benchmark wind turbine machine,17
17,"Comparative analysis of Gated Recurrent Units (GRU), long Short-Term memory (LSTM) cells, autoregressive Integrated moving average (ARIMA), seasonal autoregressive Integrated moving average (SARIMA) for forecasting COVID-19 trends",16
18,"A Hybrid Model for Water Quality Prediction Based on an Artificial Neural Network, Wavelet Transform, and Long Short-Term Memory",16
