In [None]:
import os
import pandas as pd
import requests


from dotenv import load_dotenv
from io import StringIO
from math import ceil
from tqdm import tqdm

from pybliometrics.scival import PublicationLookup, init
from utils import get_content, parse_scival_info, split_lines

# Run the dotenv loader first so that values from the .env file are available,
# then call the pybliometrics initialiser.
load_dotenv()
init()

# The cookie is expected to be provided via the COOKIE entry of the .env file;
# it is None when the variable is not set.
COOKIE = os.environ.get('COOKIE')

In [None]:
# Look up the pybliometrics paper in SciVal and keep the ID of its topic
paper_identifier = '85068268027'  # identifier of the pybliometrics paper
pl = PublicationLookup(paper_identifier)
topic_id = pl.topic_id

In [None]:
# Get first page content
res_text = get_content(topic_id, cookie=COOKIE, page=1)

# Split the response into intro and table data
intro_lines, table_text = split_lines(res_text)

# The intro section carries the metadata used to plan the pagination
info = parse_scival_info(intro_lines)
n = info['total_publications']
retrieved = info['publications_retrieved']
# Number of pages still to fetch: total pages minus page 1, which we already have
num_pages = ceil(n / retrieved) - 1
start = 2  # first page number that has not been fetched yet

# Keys are single-quoted inside the double-quoted f-strings: reusing the outer
# quote character inside a replacement field is a SyntaxError before Python 3.12.
print(f"Data for topic ID {topic_id}: {info['data_set']}")
print(f"Year range: {info['start_year']} - {info['end_year']}")
print(f"Total publications: {n}")
print(f"Retrieved per page: {retrieved}")

In [None]:
# Seed the collected lines with page 1 (its header row is kept),
# excluding the last two summary lines
all_table_lines = table_text.splitlines()[:-2]

# num_pages counts the pages remaining after page 1, so a single remaining
# page (num_pages == 1) must be fetched as well.
if num_pages > 0:
    for page in tqdm(range(start, start + num_pages), desc="Fetching pages"):
        text = get_content(topic_id, cookie=COOKIE, page=page)

        # Only the table part is needed here; the intro goes into a throwaway
        # name so the page-1 `intro_lines` is not overwritten.
        _page_intro, page_table_text = split_lines(text)

        # Exclude the repeated header and the last two summary lines
        table_lines = page_table_text.splitlines()[1:-2]

        all_table_lines.extend(table_lines)

In [None]:
# Join the collected lines (header first) back into one CSV document and parse it
csv_buffer = StringIO("\n".join(all_table_lines))
df = pd.read_csv(csv_buffer)

# Sanity check: the number of parsed rows must match the reported total
assert len(df) == n, f"Expected {n} rows, got {df.shape[0]}"

In [None]:
# Show the assembled DataFrame as this cell's output
df