# Thiết lập config

In [9]:
import pyalex
from pyalex import config
from itertools import chain
import json
import os
import time
pyalex.config.email = "thuattruongminh@gmail.com"

config.max_retries = 3
config.retry_backoff_factor = 0.1
config.retry_http_codes = [429, 500, 503]

OUTPUT_FILE = "openalex_computer_science.json"
FIELD = "Computer Science"
FIELD_ID = "C41008148"
DATA_NEED = ["id","title","publication_year","type","language","doi",
             "concepts","authorships","locations","primary_location","cited_by_count",
             "primary_topic","keywords"]
MAX_RESULTS = 50
DELAY = 0.2
CHECKPOINT_FILE = "checkpoint.txt"

# Load and Save CheckPoint

In [39]:
def load_checkpoint():
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE) as f:
            return f.read().strip() or "*"
    return "*"

def save_checkpoint(cursor):
    with open(CHECKPOINT_FILE, "w") as f:
        f.write(cursor)

# Crawling data

In [54]:
def crawl():
    cursor = load_checkpoint()
    print(f"Crawling OpenAlex for field: {FIELD}")
    works = pyalex.Works().filter(concepts = {"id":FIELD_ID}).select(DATA_NEED).get(per_page = 2,cursor = cursor)
    output_dir = os.path.dirname(OUTPUT_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    count_result = 0
    result = []
    with open(OUTPUT_FILE, "a", encoding="utf-8") as f_out:
        for record in works:
            print("Crawl record: ",record["id"])
            result.append(record)
            count_result += 1
            if (count_result >= MAX_RESULTS):
                break
            if (count_result % 100 == 0):
                print(f"Crawled {count_result} records ...")
            save_checkpoint(works.meta["next_cursor"])
            time.sleep(DELAY)
    with open(OUTPUT_FILE,"a",encoding = "utf-8") as f_out:
        json.dump(result,f_out,ensure_ascii=False,indent = 4)
    print(f"Saved {count_result} records into {OUTPUT_FILE}")
    


In [58]:
if __name__ == "__main__":
    crawl()

Crawling OpenAlex for field: Computer Science
Crawl record:  https://openalex.org/W2107277218
Crawl record:  https://openalex.org/W1979290264
Saved 2 records into openalex_computer_science.json
