In [None]:
import os
import pandas as pd
import requests

from dotenv import load_dotenv
from io import StringIO
from math import ceil
from tqdm import tqdm

from pybliometrics.scival import PublicationLookup, init
from utils import get_content, parse_scival_info, split_lines

# Load variables from the local .env file into the process environment first,
# so that the calls below can see them, then run pybliometrics' init().
load_dotenv()
init()

# SET COOKIE in .env file
# os.getenv returns None when the variable is absent. Fail here with an
# actionable message rather than handing an empty cookie to get_content()
# in a later cell, where the resulting error is much harder to diagnose.
COOKIE = os.getenv('COOKIE')
if not COOKIE:
    raise RuntimeError(
        "COOKIE is not set. Add COOKIE=<value> to the .env file before running this notebook."
    )

In [9]:
# Identifier of the pybliometrics paper, used as the seed publication
PAPER_ID = '85068268027'

# Look the publication up and keep the topic ID it is assigned to
pl = PublicationLookup(PAPER_ID)
topic_id = pl.topic_id

In [10]:
# Download page 1; its intro section reports how large the full result set is
res_text = get_content(topic_id, cookie=COOKIE, page=1)

# Split the response into intro and table data
intro_lines, table_text = split_lines(res_text)

info = parse_scival_info(intro_lines)
n = info['total_publications']
retrieved = info['publications_retrieved']

# Number of pages still to fetch after page 1.
# - `retrieved == 0` would raise ZeroDivisionError, so treat it as "nothing more to fetch".
# - `n == 0` would yield -1, so clamp at 0.
num_pages = max(ceil(n / retrieved) - 1, 0) if retrieved else 0
start = 2  # page 1 is already downloaded, so the loop in the next cell starts at page 2

# Single quotes inside the f-strings: reusing the outer quote character is
# only valid syntax from Python 3.12 onwards.
print(f"Data for topic ID {topic_id}: {info['data_set']}")
print(f"Year range: {info['start_year']} - {info['end_year']}")
print(f"Total publications: {n}")
print(f"Retrieved per page: {retrieved}")

Data for topic ID 192: Publications in Impact of Citation Metrics on Scientific Evaluation T.192
Year range: 1996 - 2025
Total publications: 8906
Retrieved per page: 1000


In [11]:
# Initialize list to store all table data
all_table_lines = table_text.splitlines()[:-2]  # Exclude last two summary lines

# Fetch every remaining page. `num_pages` already counts only the pages AFTER
# page 1, so a single remaining page (num_pages == 1) must be fetched as well;
# the guard merely avoids showing an empty progress bar when nothing is left.
if num_pages > 0:
    for page in tqdm(range(start, start + num_pages), desc="Fetching pages"):
        text = get_content(topic_id, cookie=COOKIE, page=page)

        # Use a separate name for this page's intro so the first page's
        # `intro_lines` from the previous cell is not overwritten.
        page_intro_lines, page_table_text = split_lines(text)
        table_lines = page_table_text.splitlines()[1:-2]  # Exclude header and last two summary lines

        all_table_lines.extend(table_lines)

Fetching pages: 100%|██████████| 8/8 [00:51<00:00,  6.39s/it]


In [12]:
# Re-assemble the collected lines into one CSV document and parse it
csv_buffer = StringIO("\n".join(all_table_lines))
df = pd.read_csv(csv_buffer)

# Sanity check: the parsed row count must match the total reported on page 1
row_count = len(df)
assert row_count == n, f"Expected {n} rows, got {row_count}"

In [14]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,Title,Authors,Year,Scopus Source title,Language,Citations,Field-Weighted Citation Impact,Reference,Abstract,Publication type,Open Access,EID,Institutions,Number of Institutions
0,An index to quantify an individual's scientifi...,"Hirsch, J.E.",2005,Proceedings of the National Academy of Science...,English,8964,13.42,"Hirsch, J.E. (2005).An index to quantify an in...",https://www.scopus.com/record/display.url?eid=...,Article,Green,2-s2.0-28044445101,University of California at San Diego,1
1,"Comparison of PubMed, Scopus, Web of Science, ...","Falagas, M.E.| Pitsouni, E.I.| Malietzis, G.A....",2008,FASEB Journal,English,3581,9.02,"Falagas, M.E., Pitsouni, E.I., Malietzis, G.A....",https://www.scopus.com/record/display.url?eid=...,Article,,2-s2.0-38949137710,Alfa Institute of Biomedical Sciences| Tufts U...,2
2,The journal coverage of Web of Science and Sco...,"Mongeon, P.| Paul-Hus, A.",2016,Scientometrics,English,3161,65.67,"Mongeon, P., Paul-Hus, A. (2016).The journal c...",https://www.scopus.com/record/display.url?eid=...,Article,,2-s2.0-84954384742,University of Montreal,1
3,The history and meaning of the journal impact ...,"Garfield, E.",2006,JAMA,English,2046,114.87,"Garfield, E. (2006).The history and meaning of...",https://www.scopus.com/record/display.url?eid=...,Note,,2-s2.0-29944438252,-,0
4,Why the impact factor of journals should not b...,"Seglen, P.O.",1997,British Medical Journal,English,1972,18.87,"Seglen, P.O. (1997).Why the impact factor of j...",https://www.scopus.com/record/display.url?eid=...,Article,,2-s2.0-0031049280,-,0
