# Thiết lập thư viện và cấu hình chung

In [16]:
import pyalex
from pyalex import config
from itertools import chain
import json
import os
import time
# --- pyalex client settings -------------------------------------------------
# Contact e-mail stored on the shared pyalex configuration object.
config.email = "thuattruongminh@gmail.com"

# Retry policy: at most 3 retries, backoff factor 0.1, and only for the
# listed HTTP status codes.
config.max_retries = 3
config.retry_backoff_factor = 0.1
config.retry_http_codes = [429, 500, 503]

# --- Crawl parameters ---------------------------------------------------------
# Path of the file the crawl writes its records to.
OUTPUT_FILE = "openalex_computer_science.json"

# Human-readable field name (used in log messages) and the concept id the
# works query is filtered on.
FIELD = "Computer Science"
FIELD_ID = "C41008148"

# Attributes requested for every work (passed to `.select(...)`).
DATA_NEED = [
    "id",
    "title",
    "publication_year",
    "type",
    "language",
    "doi",
    "concepts",
    "authorships",
    "locations",
    "primary_location",
    "cited_by_count",
    "primary_topic",
    "keywords",
]

# The crawl stops once this many records have been collected.
MAX_RESULTS = 50

# Pause, in seconds, handed to `time.sleep` while crawling.
DELAY = 0.2

# Crawling data

In [None]:
def crawl():
    """Stream OpenAlex works for the configured concept into OUTPUT_FILE.

    Queries works whose concept id equals FIELD_ID, requesting only the
    attributes listed in DATA_NEED, and appends each record to OUTPUT_FILE as
    one JSON object per line. The crawl stops as soon as MAX_RESULTS records
    have been written or the result pages are exhausted.

    The file is opened in append mode, so running the crawl again adds to any
    records already present instead of replacing them.

    Returns:
        None. Progress and the final record count are printed.
    """
    print(f"Crawling OpenAlex for field: {FIELD}")
    works = (
        pyalex.Works()
        .filter(concepts={"id": FIELD_ID})
        .select(DATA_NEED)
        .paginate(method="page", per_page=100)
    )

    # Create the parent directory only when the path actually has one; this
    # also covers paths written with the platform's own separator.
    out_dir = os.path.dirname(OUTPUT_FILE)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    count_result = 0
    with open(OUTPUT_FILE, "a", encoding="utf-8") as f_out:
        for page in works:
            for record in page:
                print("Crawl record: ", record["id"])
                json_line = json.dumps(record, ensure_ascii=False)
                f_out.write(json_line + "\n")
                count_result += 1
                if count_result >= MAX_RESULTS:
                    break
                if count_result % 100 == 0:
                    print(f"Crawled {count_result} records ...")

            # The inner `break` only leaves the current page. Without this
            # second check the loop would keep fetching pages and write one
            # extra record from each of them.
            if count_result >= MAX_RESULTS:
                break

            # Throttle between page fetches: the rate limit concerns requests,
            # not the individual records already held in memory.
            time.sleep(DELAY)
    print(f"Saved {count_result} records into {OUTPUT_FILE}")
    


In [8]:
# Entry point: start the crawl only when this code runs as the main module.
if __name__ == "__main__":
    crawl()

Crawling OpenAlex for field: Computer Science
Crawl record:  https://openalex.org/W2582743722
Crawl record:  https://openalex.org/W2194775991
Crawl record:  https://openalex.org/W2107277218
Crawl record:  https://openalex.org/W1979290264
Crawl record:  https://openalex.org/W2144634347
Crawl record:  https://openalex.org/W2911964244


KeyboardInterrupt: 

In [17]:
import pyalex
from pyalex import Works
import os
import json
import time


def crawl():
    """Collect OpenAlex works for the configured concept and save them as JSON.

    Queries works whose concept id equals FIELD_ID, requesting only the
    attributes listed in DATA_NEED, gathers at most MAX_RESULTS records in
    memory and writes them to OUTPUT_FILE as a single indented JSON array,
    replacing any previous content of that file.

    If the crawl is cut short (an exception while fetching, or a keyboard
    interrupt), the records gathered so far are still written so the work is
    not lost. Nothing is written when the crawl fails before the first record
    arrives, which leaves an existing output file untouched.

    Returns:
        None. Progress, errors and the final record count are printed;
        exceptions derived from `Exception` are reported, not raised.
    """
    print(f"Crawling OpenAlex for field: {FIELD}")

    works = Works().filter(concepts={"id": FIELD_ID}).select(DATA_NEED).paginate(
        method="page", per_page=100
    )

    # Create the parent directory if the output path has one.
    out_dir = os.path.dirname(OUTPUT_FILE)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    results = []
    finished = False

    try:
        for page_idx, page in enumerate(works, start=1):
            print(f"--- Fetching page {page_idx}, {len(page)} records ---")

            for record in page:
                results.append(record)

                if len(results) % 50 == 0:
                    print(f"✅ Crawled {len(results)} records...")

                if len(results) >= MAX_RESULTS:
                    print("Reached MAX_RESULTS limit.")
                    break

            if len(results) >= MAX_RESULTS:
                break

            # Throttle between page fetches to stay under the rate limit; the
            # limit concerns requests, not records already in memory.
            time.sleep(DELAY)

        finished = True

    except Exception as e:
        print(f"❌ Error occurred: {e}")

    finally:
        # Runs on success, on a reported error and on KeyboardInterrupt alike,
        # so whatever was collected is persisted before control leaves.
        if finished or results:
            try:
                # ===== Write all collected data to the JSON file =====
                with open(OUTPUT_FILE, "w", encoding="utf-8") as f_out:
                    json.dump(results, f_out, ensure_ascii=False, indent=2)
            except Exception as e:
                print(f"❌ Error occurred: {e}")
            else:
                if finished:
                    print(f"\n🎉 Done! Saved {len(results)} records into {OUTPUT_FILE}")
                else:
                    print(
                        f"\nCrawl stopped early; saved {len(results)} partial "
                        f"records into {OUTPUT_FILE}"
                    )

# Entry point: start the crawl only when this code runs as the main module.
if __name__ == "__main__":
    crawl()


Crawling OpenAlex for field: Computer Science
--- Fetching page 1, 100 records ---
✅ Crawled 50 records...
Reached MAX_RESULTS limit.

🎉 Done! Saved 50 records into openalex_computer_science.json
