In [1]:
import requests
import json
import time

import os

# The IEEE Xplore API key is read from the environment so the credential is
# never committed together with the notebook.
# NOTE(review): the key that used to be hard-coded in this cell should be
# treated as leaked and revoked/rotated.
API_KEY = os.environ.get("IEEE_API_KEY")
if not API_KEY:
    raise RuntimeError("Set the IEEE_API_KEY environment variable before running this cell.")
BASE_URL = "https://ieeexploreapi.ieee.org/api/v1/search/articles"

PAGE_SIZE = 100        # records requested per call (sent as max_records)
MAX_PAGES = 1          # number of pages to fetch; raise it to collect more data
REQUEST_TIMEOUT = 60   # seconds before a stalled request is abandoned
REQUEST_DELAY = 10     # pause between consecutive requests to avoid rate limiting

# Query for the Computer Science domain
params = {
    "apikey": API_KEY,
    "querytext": "computer science",
    "format": "json",
    "max_records": PAGE_SIZE,
    "start_record": 1,
}

all_articles = []

# Walk through the result pages, PAGE_SIZE records at a time
for page in range(1, MAX_PAGES + 1):
    params["start_record"] = (page - 1) * PAGE_SIZE + 1
    print(f"Fetching page {page}...")
    response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)

    if response.status_code != 200:
        print(f"Error {response.status_code}: {response.text}")
        break

    articles = response.json().get("articles", [])
    all_articles.extend(articles)

    # A short (or empty) page means the result set is exhausted.
    if len(articles) < PAGE_SIZE:
        break
    # Only throttle when another request is actually going to follow.
    if page < MAX_PAGES:
        time.sleep(REQUEST_DELAY)

# Save the collected articles to a JSON file
with open("ieee_computer_science.json", "w", encoding="utf-8") as f:
    json.dump(all_articles, f, indent=4, ensure_ascii=False)

print(f"✅ Đã lưu {len(all_articles)} bài báo vào ieee_computer_science.json")

Fetching page 1...
Error 403: <h1>Developer Inactive</h1>
✅ Đã lưu 0 bài báo vào ieee_computer_science.json


In [6]:
import requests
import xml.etree.ElementTree as ET
import json
import time
import os

# ====== CONFIGURATION ======
BASE_URL = "https://ora.ox.ac.uk/oai2" # OAI-PMH endpoint of ORA
METADATA_PREFIX = "oai_dc" # metadata format requested from the endpoint
OUTPUT_FILE = "ora_computer_science.json" # output file
CHECKPOINT_FILE = "ora_checkpoint.txt" # stores the resumptionToken so a crawl can be resumed
DELAY = 3           # pause between consecutive requests (seconds)
MAX_RETRIES = 3     # number of attempts fetch_records makes per request
TIMEOUT = 120       # time limit for each request (seconds)

# ====== HÀM GỌI API (CÓ RETRY) ======
# Gọi HTTP tới endpoint OAI-PMH trả về XML text cho page hiện tại
def fetch_records(resumption_token=None):
    """Fetch one page of an OAI-PMH ``ListRecords`` response as XML text.

    Args:
        resumption_token: Token returned by the previous page. When given it is
            sent as the only argument besides ``verb``; when falsy the harvest
            starts from the beginning using ``METADATA_PREFIX``.

    Returns:
        The response body decoded as UTF-8 text.

    Raises:
        Exception: when every one of the ``MAX_RETRIES`` attempts failed with a
            timeout, a connection error or a retryable HTTP status (429/503).
            The last underlying error is attached as ``__cause__``.
        requests.exceptions.HTTPError: immediately, for any other HTTP error.
    """
    if resumption_token:
        params = {"verb": "ListRecords", "resumptionToken": resumption_token}
    else:
        params = {"verb": "ListRecords", "metadataPrefix": METADATA_PREFIX}

    # OAI-PMH repositories use 503 (+ Retry-After) for flow control; 429 is the
    # generic "too many requests" status. Both are worth retrying.
    retryable_statuses = {429, 503}

    for attempt in range(1, MAX_RETRIES + 1):
        wait_seconds = 10
        try:
            response = requests.get(BASE_URL, params=params, timeout=TIMEOUT)
            response.raise_for_status()
            # OAI-PMH responses are always UTF-8; do not rely on requests'
            # header-based guess, which falls back to ISO-8859-1 for text/xml.
            response.encoding = "utf-8"
            return response.text
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e:
            last_error = e
        except requests.exceptions.HTTPError as e:
            status = e.response.status_code if e.response is not None else None
            if status not in retryable_statuses:
                print(f"Unexpected error: {e}")
                raise
            # Honour the server's requested back-off when it is given in seconds.
            retry_after = e.response.headers.get("Retry-After", "").strip()
            if retry_after.isdigit():
                wait_seconds = int(retry_after)
            last_error = e
        except Exception as e:
            print(f"Unexpected error: {e}")
            raise

        print(f"Attempt {attempt}/{MAX_RETRIES} failed: {last_error}")
        if attempt < MAX_RETRIES:
            print(f"Waiting {wait_seconds}s before retry...")
            time.sleep(wait_seconds)
        else:
            raise Exception("Failed to fetch after multiple retries") from last_error

    # Only reachable when MAX_RETRIES < 1: fail loudly instead of returning None.
    raise Exception("Failed to fetch after multiple retries")

# ====== HÀM PARSE XML ======
def parse_records(xml_text):
    """Parse one OAI-PMH ``ListRecords`` response.

    Args:
        xml_text: The XML document returned by the endpoint.

    Returns:
        A ``(records, token)`` tuple. ``records`` holds one dict per
        ``<record>``; each Dublin Core field (``title``, ``creator``,
        ``subject``, ``description``, ``publisher``, ``date``, ``identifier``)
        maps to a list of its non-empty text values or ``None``, and ``doi``
        maps to the identifiers that look like DOIs (or ``None``). ``token`` is
        the resumption token for the next page, or ``None`` when there is none.

    Raises:
        RuntimeError: if the response carries an OAI-PMH ``<error>`` other than
            ``noRecordsMatch`` (for example an expired ``badResumptionToken``).
            Without this check such a response looks like "0 records, no more
            pages" and the crawl would falsely report completion.
        xml.etree.ElementTree.ParseError: if ``xml_text`` is not well-formed.
    """
    # "oai" is the OAI-PMH envelope namespace (<record>, <ListRecords>,
    # <resumptionToken>, <error>); "dc" is Dublin Core (<dc:title>, ...).
    ns = {
        "oai": "http://www.openarchives.org/OAI/2.0/",
        "dc": "http://purl.org/dc/elements/1.1/",
    }
    dc_fields = ["title", "creator", "subject", "description", "publisher", "date", "identifier"]

    root = ET.fromstring(xml_text)

    # Protocol-level failures arrive as <error code="..."> children of the root,
    # usually inside an HTTP 200 response.
    fatal_errors = [
        err for err in root.findall("oai:error", ns)
        if err.get("code") != "noRecordsMatch"
    ]
    if fatal_errors:
        details = "; ".join(
            "{}: {}".format(err.get("code"), (err.text or "").strip())
            for err in fatal_errors
        )
        raise RuntimeError(f"OAI-PMH error response: {details}")

    records = []
    for record in root.findall(".//oai:record", ns):
        data = {}
        for field in dc_fields:
            elements = record.findall(f".//dc:{field}", ns)
            data[field] = [e.text for e in elements if e.text] or None

        # An identifier counts as a DOI when it is a doi.org URL or a bare "10." DOI.
        dois = [
            ident for ident in (data["identifier"] or [])
            if "doi.org" in ident or ident.lower().startswith("10.")
        ]
        data["doi"] = dois or None
        records.append(data)

    # Paging token: absent or empty on the final page.
    token_elem = root.find(".//oai:resumptionToken", ns)
    token = None
    if token_elem is not None and token_elem.text:
        token = token_elem.text.strip() or None
    return records, token

# ====== HÀM LƯU DỮ LIỆU (APPEND) ======
def append_to_json(new_records, file_path):
    """Append a batch of records to the JSON array stored in ``file_path``.

    The existing array (if the file exists and is non-empty) is loaded, the new
    records are added to it, and the result is written back. The write goes to
    a temporary sibling file that then replaces the target, so an interruption
    in the middle of a write cannot truncate the records saved so far.

    Args:
        new_records: Iterable of JSON-serialisable records to add.
        file_path: Path of the JSON file to create or extend.

    Raises:
        json.JSONDecodeError: if the existing file does not contain valid JSON.
    """
    if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    else:
        data = []
    data.extend(new_records)

    tmp_path = f"{file_path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        # Replace the target in one step once the new content is fully written.
        os.replace(tmp_path, file_path)
    except BaseException:
        # Do not leave a half-written temp file behind (covers Ctrl-C as well).
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

# ====== HÀM LƯU / ĐỌC CHECKPOINT ======
def save_checkpoint(token):
    """Persist the resumption token to CHECKPOINT_FILE so a crawl can resume.

    A falsy token (e.g. ``None``) is stored as an empty file.
    """
    payload = token if token else ""
    with open(CHECKPOINT_FILE, mode="w", encoding="utf-8") as checkpoint:
        checkpoint.write(payload)

def load_checkpoint():
    """Return the token stored in CHECKPOINT_FILE, or None when there is none.

    A missing file and a file containing only whitespace both yield ``None``.
    """
    if not os.path.exists(CHECKPOINT_FILE):
        return None
    with open(CHECKPOINT_FILE, mode="r", encoding="utf-8") as checkpoint:
        saved = checkpoint.read().strip()
    return saved or None
def is_computer_science(record):
    """Tell whether any field of ``record`` mentions a computer-science keyword.

    All non-empty value lists of the record are joined with spaces, lower-cased
    and searched for each keyword as a plain substring.
    """
    fragments = []
    for values in record.values():
        if values:
            fragments.extend(values)
    haystack = " ".join(fragments).lower()

    for keyword in (
        "computer science",
        "computing",
        "machine learning",
        "artificial intelligence",
        "deep learning",
        "data science",
    ):
        if keyword in haystack:
            return True
    return False

# ====== HÀM CHẠY CÀO ======
def crawl_ora(max_pages=5):
    """Harvest ORA page by page and keep the computer-science records.

    Each page is fetched with ``fetch_records``, parsed with ``parse_records``,
    filtered with ``is_computer_science`` and appended to ``OUTPUT_FILE``. The
    resumption token of the next page is written to the checkpoint after every
    page, so a later call continues where this one stopped.

    Args:
        max_pages: Maximum number of pages to fetch in this call. ``None``
            removes the limit and crawls until the repository reports no
            further pages.
    """
    token = load_checkpoint()  # resume from a saved token when one exists
    page = 1
    total_records = 0
    print("Starting ORA crawler (long-running mode)")
    if token:
        print(f"Resuming from saved token: {token[:40]}...")
    else:
        print("Starting from the beginning...")

    while max_pages is None or page <= max_pages:
        print(f"\nFetching page {page} ...")
        xml_data = fetch_records(token)
        records, token = parse_records(xml_data)
        print(f"Received {len(records)} records")

        # Keep only the computer-science records
        filtered = [r for r in records if is_computer_science(r)]

        append_to_json(filtered, OUTPUT_FILE)
        total_records += len(filtered)
        print(f"Saved {len(filtered)} new (Total: {total_records})")

        # Checkpoint only after the page has been saved
        save_checkpoint(token)

        # Stop when the repository has no further pages
        if not token:
            print("No more pages. Crawl completed.")
            break

        # Stop at the page limit without sleeping for a request that never happens
        if max_pages is not None and page >= max_pages:
            print(f"Reached the {max_pages}-page limit; run again to resume from the checkpoint.")
            break

        print(f"⏸ Waiting {DELAY}s before next request...\n")
        time.sleep(DELAY)
        page += 1

    print(f"Total {total_records} records saved to {OUTPUT_FILE}")

# ====== MAIN ======
# Start the ORA crawl only when this code runs as the main module
# (not when it is imported from elsewhere).
if __name__ == "__main__":
    crawl_ora()


Starting ORA crawler (long-running mode)
Resuming from saved token: oai_dc.f(2022-03-26T08:27:08Z).u(2025-10...

Fetching page 1 ...
Received 500 records
Saved 6 new (Total: 6)
⏸ Waiting 3s before next request...


Fetching page 2 ...
Received 500 records
Saved 3 new (Total: 9)
⏸ Waiting 3s before next request...


Fetching page 3 ...
Received 500 records
Saved 12 new (Total: 21)
⏸ Waiting 3s before next request...


Fetching page 4 ...
Received 500 records
Saved 9 new (Total: 30)
⏸ Waiting 3s before next request...


Fetching page 5 ...
Received 500 records
Saved 11 new (Total: 41)
⏸ Waiting 3s before next request...

Total 41 records saved to ora_computer_science.json


In [1]:
import requests
import pprint
BASE_URL = "https://ora.ox.ac.uk/oai2" # OAI-PMH endpoint of ORA
METADATA_PREFIX = "oai_dc" # metadata format requested from the endpoint
OUTPUT_FILE = "ora_computer_science.json" # output file
CHECKPOINT_FILE = "ora_checkpoint.txt" # stores the resumptionToken so a crawl can be resumed
DELAY = 3           # pause between consecutive requests (seconds)
MAX_RETRIES = 3     # number of attempts per request
TIMEOUT = 120       # time limit for each request (seconds)
PREVIEW_CHARS = 2000  # how much of the response to display

# Probe the endpoint: fetch the first ListRecords page and show how it starts.
params = {"verb": "ListRecords", "metadataPrefix": METADATA_PREFIX}
response = requests.get(BASE_URL, params=params, timeout=TIMEOUT)
# Fail on an HTTP error instead of displaying an error page as if it were data.
response.raise_for_status()
# OAI-PMH responses are always UTF-8; do not rely on the header-based guess.
response.encoding = "utf-8"
# Show a preview only: a full page holds hundreds of records and floods the output.
pprint.pprint(response.text[:PREVIEW_CHARS])

('<?xml version="1.0" encoding="UTF-8"?>\n'
 '<?xml-stylesheet type="text/xsl" '
 'href="/assets/oai_dc-613eb2854b4943fcef179f44fef7a179050f9b176ca813e5fe3296246845cd95.xsl" '
 '?><OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" '
 'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
 'xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ '
 'http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2025-10-28T05:43:29Z</responseDate><request '
 'metadataPrefix="oai_dc" '
 'verb="ListRecords">https://ora.ox.ac.uk/oai2</request><ListRecords><record><header><identifier>oai:ora.ox.ac.uk:uuid:000085e0-9a2b-415f-ac4a-b970a4de6d3b</identifier><datestamp>2022-03-26T08:27:08Z</datestamp><setSpec>journal_article</setSpec></header><metadata><oai_dc:dc '
 'xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" '
 'xmlns:dc="http://purl.org/dc/elements/1.1/" '
 'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
 'xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oa

In [8]:
from pyalex import Works
import json
import time

# General configuration
OUTPUT_FILE = "openalex_computer_science.json"  # JSON file written by crawl()
FIELD = "Computer Science"  # text passed to the display_name search filter in crawl()
MAX_RESULTS = 200  # maximum number of records crawl() collects
DELAY = 1  # pause (seconds) crawl() uses to throttle itself

# GỌI API
def _work_to_record(w):
    """Flatten one OpenAlex work (a dict-like object) into the exported record."""
    # pyalex builds the plain-text abstract on item access (w["abstract"]);
    # dict.get("abstract") bypasses that hook and always yields None.
    try:
        abstract = w["abstract"]
    except KeyError:
        abstract = None

    # Any of these may be missing or null in a response, so normalise them first.
    host_venue = w.get("host_venue") or {}
    primary_source = (w.get("primary_location") or {}).get("source") or {}

    return {
        "id": w.get("id"),
        "title": w.get("title"),
        "abstract": abstract,
        "publication_year": w.get("publication_year"),
        "type": w.get("type"),
        "language": w.get("language"),
        "cited_by_count": w.get("cited_by_count"),
        "host_venue": host_venue.get("display_name"),
        # Fall back to the primary source's organisation when host_venue is absent.
        "publisher": host_venue.get("publisher") or primary_source.get("host_organization_name"),
        "primary_location": primary_source.get("display_name"),
        "concepts": [c.get("display_name") for c in (w.get("concepts") or [])],
        "authorships": [
            (a.get("author") or {}).get("display_name")
            for a in (w.get("authorships") or [])
        ],
    }


def crawl():
    """Collect up to ``MAX_RESULTS`` OpenAlex works and save them to ``OUTPUT_FILE``.

    Works are fetched page by page (at most 200 per request), pausing ``DELAY``
    seconds between requests, flattened with ``_work_to_record`` and written as
    one JSON array.
    """
    print(f"Crawl work field {FIELD}")

    # NOTE(review): search_filter(display_name=...) matches works whose *title*
    # contains FIELD; it does not select works classified under that field.
    # Confirm this is the intended query.
    per_page = max(1, min(MAX_RESULTS, 200))  # OpenAlex serves at most 200 per page
    pager = (
        Works()
        .search_filter(display_name=FIELD)
        .paginate(per_page=per_page, n_max=MAX_RESULTS)
    )

    all_records = []
    for page in pager:
        for w in page:
            all_records.append(_work_to_record(w))

            # Demo limit
            if len(all_records) >= MAX_RESULTS:
                break

            # Log progress every 100 records
            if len(all_records) % 100 == 0:
                print(f"Fetched {len(all_records)} records...")

        if len(all_records) >= MAX_RESULTS:
            break
        # Throttle here because the next loop iteration triggers the next request.
        time.sleep(DELAY)

    # Save to the JSON file
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(all_records, f, ensure_ascii=False, indent=2)

    print(f"\n🎉 Done! Saved {len(all_records)} records → {OUTPUT_FILE}")

# Start the OpenAlex crawl only when this code runs as the main module
# (not when it is imported from elsewhere).
if __name__ == "__main__":
    crawl()

Crawl work field Computer Science
Fetched 100 records...

🎉 Done! Saved 200 records → openalex_computer_science.json


In [30]:
from pyalex import Institutions,Works
import pprint
# Fetch one page of works whose title matches "Computer Science", keeping only
# the fields needed below.
results = (
    Works()
    .search_filter(display_name="Computer Science")
    .select(["title", "display_name", "locations"])
    .get()
)

for w in results:
    # Collect the display name of every source the work is hosted at.
    source_names = []
    for loc in w.get("locations") or []:
        # "source" may be present but null; resolve the name per location so a
        # location without a source neither fails nor reuses the previous name.
        source = loc.get("source") or {}
        name = source.get("display_name")
        if name:
            source_names.append(name)

    # Single quotes inside the f-strings keep this cell valid before Python 3.12.
    print(f"Title: {w.get('title')}")
    print(f"Publisher: {source_names}")
    print(f"Display name: {w.get('display_name')}")
    print(10 * "-")



Tile: Lecture Notes in Computer Science 1205
("Publisher: ['Industrial Robot the international journal of robotics research "
 "and application', 'CiteSeer X (The Pennsylvania State University)']")
Display name: Lecture Notes in Computer Science 1205
----------
Tile: Handbook of theoretical computer science
"Publisher: ['CiteSeer X (The Pennsylvania State University)']"
Display name: Handbook of theoretical computer science
----------
Tile: Calibration of the Computer Science and Applications, Inc. accelerometer
("Publisher: ['Medicine & Science in Sports & Exercise', 'Medicine & Science "
 "in Sports & Exercise', 'PubMed']")
Display name: Calibration of the Computer Science and Applications, Inc. accelerometer
----------
Tile: Journal of Universal Computer Science
("Publisher: ['Zenodo (CERN European Organization for Nuclear Research)', "
 "'CiteSeer X (The Pennsylvania State University)']")
Display name: Journal of Universal Computer Science
----------
Tile: Algorithms on Strings, Tr