In [1]:
import requests
from urllib.parse import urlencode

import pandas as pd
from tqdm import tqdm
import json

# Root URL of the GoTriple API. The trailing slash is required: endpoint names
# and pagination paths are appended to this string by plain concatenation.
BASE_API_URL = 'https://api.gotriple.eu/'

In [2]:
def build_api_url(query):
    """Return the URL-encoded query string for a document search.

    The search options are fixed inside this function; only the search
    expression varies.

    Args:
        query: Search expression, sent as the ``q`` parameter.

    Returns:
        A ``key=value&key=value`` string ready to append after ``?``.
        Options whose value is falsy (the empty ``aggs`` dict, an empty
        query) are left out. Dict-valued options such as ``fq`` are first
        flattened to ``key=value`` pairs joined by ``;`` and then encoded
        again as a single value.
    """
    search_options = {
        'q': f'{query}',
        'fq': {
            'type': 'typ_article',
            # further filters that can be enabled here:
            #   'has_pdf': 'true'
            #   'in_language': 'en'
        },
        'include_duplicates': 'false',
        'aggs': {},
        # sortable fields: name, publication_date, most_recent (e.g. name:desc)
        'sort': 'name:desc',
        'page': 1,
        'size': 100,  # max 100
    }

    def _flatten(option):
        # Nested option groups are collapsed into one 'k=v;k=v' string.
        if isinstance(option, dict):
            return urlencode(option).replace('&', ';')
        return option

    encodable = {
        name: _flatten(option)
        for name, option in search_options.items()
        if option
    }
    return urlencode(encodable)

In [3]:
# Search expressions sent to the API, one request series per entry. Each entry
# is a set of quoted phrases combined with OR, grouped below by research theme.
# The position of an entry in this list is used as the index in the output
# file names, so reordering the list changes which file holds which query.
# NOTE(review): "AI ethics" and "posthumanism" each appear in two entries, so
# the corresponding result sets may overlap — confirm this is intended.
queries = [
    # Society, Culture, and Identity
    '"gender studies" OR "queer theory" OR "sexuality" OR "intersectionality"',
    '"decolonization" OR "postcolonial studies" OR "race and ethnicity" OR "decolonial theory"',
    '"migration" OR "diaspora" OR "refugees" OR "transnationalism"',
    '"cultural heritage" OR "collective memory" OR "heritage restitution" OR "memory politics"',

    # Global Challenges and Social Change
    '"climate change" OR "sustainability" OR "climate justice" OR "environmental humanities"',
    '"inequality" OR "social justice" OR "poverty" OR "social policy"',
    '"urbanization" OR "smart cities" OR "urban justice" OR "housing policy"',
    '"democracy" OR "global governance" OR "authoritarianism" OR "human rights"',

    # Digital Transformation and Technology
    '"digital humanities" OR "text mining" OR "data visualization" OR "digital archives"',
    '"artificial intelligence" OR "AI ethics" OR "algorithmic bias" OR "human-machine interaction"',
    '"social media" OR "misinformation" OR "digital communication" OR "public sphere"',
    '"cyberculture" OR "virtual communities" OR "online identity" OR "digital creativity"',

    # Economy, Work, and Policy
    '"political economy" OR "globalization" OR "neoliberalism" OR "financial crisis"',
    '"automation" OR "future of work" OR "platform economy" OR "gig work"',
    '"public policy" OR "governance" OR "institutional trust" OR "behavioral economics"',
    '"digital education" OR "critical pedagogy" OR "educational equity"',

    # History, Philosophy, and Thought
    '"global history" OR "transnational history" OR "empire" OR "knowledge exchange"',
    '"philosophy of technology" OR "AI ethics" OR "posthumanism" OR "moral responsibility"',
    '"critical theory" OR "power and discourse" OR "knowledge politics" OR "epistemology"',
    '"history of science" OR "biopolitics" OR "pandemics" OR "medical humanities"',

    # Interdisciplinary and Emerging Areas
    '"health humanities" OR "medicine and literature" OR "bioethics"',
    '"environmental sociology" OR "human-nature relations" OR "resilience"',
    '"posthumanism" OR "anthropocene" OR "planetary ethics"',
    '"indigenous knowledge" OR "traditional ecological knowledge" OR "epistemic justice"'
]

In [4]:
# Collect, for every query, the number of matching documents reported by the
# API in the `hydra:totalItems` field of the first result page.
queries_response_len = []
endpoint = 'documents'
for query in tqdm(queries):
    url = f'{BASE_API_URL}{endpoint}?{build_api_url(query)}'
    # A timeout keeps one stalled connection from hanging the whole cell.
    response = requests.get(url, timeout=60)
    if not response.ok:
        # Report the failure instead of silently dropping the query, so a
        # missing row in the resulting table can be explained.
        tqdm.write(f'HTTP {response.status_code} for query: {query}')
        continue
    queries_response_len.append((query, response.json()['hydra:totalItems']))

100%|██████████| 24/24 [00:53<00:00,  2.23s/it]


In [5]:
# Tabulate the (query, hit count) pairs and export them as a spreadsheet
# without the DataFrame index column.
out_df = pd.DataFrame(
    data=queries_response_len,
    columns=['query', 'response length'],
)
out_df.to_excel(excel_writer='queries_len.xlsx', index=False)

In [None]:
# Download every result page for each query and store the collected records
# in one JSON file per query (`gotriple_response_<idx>.json`).
endpoint = 'documents'
request_timeout = 60  # seconds; avoids hanging forever on a stalled connection
first_query_idx = 5   # queries with a lower index are skipped
# tqdm wraps the list itself (not the enumerate iterator) so it knows the
# total and can draw a real progress bar instead of a bare "0it" counter.
for idx, query in enumerate(tqdm(queries)):
    if idx < first_query_idx:
        continue
    url = f'{BASE_API_URL}{endpoint}?{build_api_url(query)}'
    records = []
    while url:
        response = requests.get(url, timeout=request_timeout)
        if not response.ok:
            # Stop paging on an error response rather than trying to parse its
            # body as a result page; whatever was fetched so far is still saved.
            tqdm.write(
                f'query {idx}: HTTP {response.status_code} for {url}; '
                f'saving the {len(records)} records fetched so far'
            )
            break
        payload = response.json()  # parse each page once
        records.extend(payload['hydra:member'])
        next_path = payload.get('hydra:view', {}).get('hydra:next')
        # The first character of the next-page path is dropped before joining;
        # presumably it is a leading '/', which BASE_API_URL already ends with
        # — TODO confirm against the API response.
        url = BASE_API_URL + next_path[1:] if next_path else None
    tqdm.write(str(len(records)))
    with open(f'gotriple_response_{idx}.json', 'w', encoding='utf-8') as txt:
        json.dump(records, txt, indent=4, ensure_ascii=False)
    

0it [00:00, ?it/s]