In [None]:
import requests
import psycopg2
import json
import time

# Crawl a handful of papers from the Semantic Scholar Graph API and upsert
# them into the local PostgreSQL table `papers_full`.

# --- 1. Connect to PostgreSQL ---
conn = psycopg2.connect(
    host="localhost",
    port="5432",
    database="semantic_db",
    user="postgres"
)
cur = conn.cursor()

# Upper bound (seconds) for every HTTP call; without it `requests` waits
# indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30
# Pause (seconds) before each detail request to stay under the API rate limit.
REQUEST_DELAY = 0.5

# Number of papers actually written, reported in the final message.
saved_count = 0

try:
    # --- 2. Create the papers_full table ---
    cur.execute("""
    CREATE TABLE IF NOT EXISTS papers_full (
        id SERIAL PRIMARY KEY,
        paper_id TEXT UNIQUE,
        title TEXT,
        abstract TEXT,
        year INT,
        venue TEXT,
        url TEXT,
        authors JSONB,
        references_data JSONB,
        citations_data JSONB
    );
    """)
    conn.commit()

    # --- 3. Search for up to 10 demo papers ---
    query = "computer science"
    search_url = "https://api.semanticscholar.org/graph/v1/paper/search"
    search_params = {"query": query, "limit": 10, "fields": "paperId"}

    papers = []
    try:
        search_resp = requests.get(
            search_url, params=search_params, timeout=REQUEST_TIMEOUT
        )
    except requests.RequestException as exc:
        print("Error:", exc)
    else:
        if search_resp.status_code == 200:
            # `or []` guards against an explicit null in the payload.
            papers = search_resp.json().get("data") or []
        else:
            # Report the failure instead of silently crawling nothing.
            print("Error:", search_resp.status_code, search_resp.text)

    # --- 4. Fetch details for each paper & upsert ---
    detail_params = {
        "fields": "title,abstract,year,venue,url,authors,references,citations"
    }
    for paper in papers:
        pid = paper.get("paperId")
        if not pid:
            continue

        # Throttle before every request (not only after successful inserts),
        # so a rate-limited response is not followed by an immediate retry.
        time.sleep(REQUEST_DELAY)

        detail_url = f"https://api.semanticscholar.org/graph/v1/paper/{pid}"
        try:
            detail_resp = requests.get(
                detail_url, params=detail_params, timeout=REQUEST_TIMEOUT
            )
        except requests.RequestException as exc:
            # A network failure on one paper should not abort the whole crawl.
            print("Error:", exc)
            continue

        if detail_resp.status_code != 200:
            print("Error:", detail_resp.status_code, detail_resp.text)
            continue

        data = detail_resp.json()

        # Upsert keyed on paper_id so re-running the cell refreshes rows
        # instead of failing on the UNIQUE constraint.
        cur.execute("""
        INSERT INTO papers_full (
            paper_id, title, abstract, year, venue, url,
            authors, references_data, citations_data
        ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (paper_id) DO UPDATE SET
            title = EXCLUDED.title,
            abstract = EXCLUDED.abstract,
            year = EXCLUDED.year,
            venue = EXCLUDED.venue,
            url = EXCLUDED.url,
            authors = EXCLUDED.authors,
            references_data = EXCLUDED.references_data,
            citations_data = EXCLUDED.citations_data;
        """, (
            # Fall back to the id from the search result if the detail
            # payload omits it, so the row never gets a NULL key.
            data.get("paperId") or pid,
            data.get("title"),
            data.get("abstract"),
            data.get("year"),
            data.get("venue"),
            data.get("url"),
            json.dumps(data.get("authors")),
            json.dumps(data.get("references")),
            json.dumps(data.get("citations"))
        ))
        # Commit per paper so earlier rows survive a later failure.
        conn.commit()
        saved_count += 1
finally:
    # Always release the cursor and connection, even when a request or
    # query raises part-way through.
    cur.close()
    conn.close()

# Report the real number of saved papers rather than a hard-coded 10.
print(f" Đã crawl {saved_count} papers & lưu vào PostgreSQL thành công")


 Đã crawl 10 papers & lưu vào PostgreSQL thành công
