In [6]:
import time, json, io
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
import requests
from PyPDF2 import PdfReader

from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.firefox import GeckoDriverManager


# Base address of the scraped site; also sent as the Referer header when
# downloading PDFs.
BASE_URL = "https://projekty.ncn.gov.pl"
# Search-results URL that seeds the crawl. The query string carries the search
# filters (here the "jednostka" filter and a "kwotaprzyznanaod/do" range); the
# crawler appends a "strona" parameter to it to select the results page.
START_URL = "https://projekty.ncn.gov.pl/index.php?jednostka=Uniwersytet+Adama+Mickiewicza&jednostka_miasto=&jednostka_wojewodztwo=&kierownik=&kierownik_plec=&kierownik_tytul=&status=&projekt=&kwotaprzyznanaod=8+375&kwotaprzyznanado=7+209+600&typkonkursu=&konkurs=&grupa=&panel=&slowokluczowe=&aparatura="

# Path of the JSON file that receives the scraped records (checkpoints and final result).
OUTPUT_JSON = "projekty_ncn_uam.json"
CHECKPOINT_EVERY = 50  # write a checkpoint after every this many processed projects


# --- driver Firefox ---
def get_driver(headless=True):
    """Create a Firefox WebDriver using a geckodriver supplied by webdriver_manager.

    Args:
        headless: When truthy, the ``--headless`` argument is added to the
            Firefox options.

    Returns:
        A new ``webdriver.Firefox`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    firefox_options = webdriver.FirefoxOptions()
    if headless:
        firefox_options.add_argument("--headless")
    gecko_service = Service(GeckoDriverManager().install())
    return webdriver.Firefox(service=gecko_service, options=firefox_options)


# --- Pomocnicze: ustawienie parametru w URL ---
def add_or_replace_query_param(url, key, value):
    """Return *url* with the query parameter *key* set to ``str(value)``.

    Existing values for *key* are replaced by the single new value (keeping the
    parameter's original position); a missing *key* is appended. All other
    parameters are kept, including those with blank values.
    """
    parts = urlparse(url)
    params = {
        **parse_qs(parts.query, keep_blank_values=True),
        key: [str(value)],
    }
    return urlunparse(parts._replace(query=urlencode(params, doseq=True)))


# --- Pobieranie PDF z cookies Selenium ---
def extract_pdf_text_with_cookies(driver, pdf_url):
    """Download a PDF with the Selenium session's cookies and return its text.

    Args:
        driver: Selenium driver whose cookies are copied into the HTTP request.
        pdf_url: Address of the PDF; a falsy value short-circuits to ``None``.

    Returns:
        The text of all pages (one page per line block), stripped of
        surrounding whitespace, or ``None`` when there is no URL, no text could
        be extracted, or any error occurred (the error is printed, not raised).
    """
    if not pdf_url:
        return None
    try:
        session_cookies = {}
        for cookie in driver.get_cookies():
            session_cookies[cookie["name"]] = cookie["value"]
        request_headers = {"User-Agent": "Mozilla/5.0", "Referer": BASE_URL}
        response = requests.get(
            pdf_url,
            headers=request_headers,
            cookies=session_cookies,
            timeout=60,
        )
        response.raise_for_status()
        document = PdfReader(io.BytesIO(response.content))
        # extract_text() may yield nothing for a page; treat that as empty text.
        chunks = [(pdf_page.extract_text() or "") + "\n" for pdf_page in document.pages]
        return "".join(chunks).strip() or None
    except Exception as e:
        # Best-effort: a missing or unreadable PDF must not abort the scrape.
        print(f"[PDF] Błąd przy {pdf_url}: {e}")
        return None


# --- Zbieranie wszystkich linków (strona=1,2,...) ---
def collect_all_project_links_by_page_param(driver, start_url, max_pages=None):
    """Collect project links by walking the result pages ``strona=1, 2, ...``.

    For each page the ``strona`` query parameter of *start_url* is set, the page
    is loaded, and every anchor whose ``href`` contains ``projekt_id=`` is
    gathered. The walk stops when *max_pages* is exceeded, the page body does
    not appear within 10 seconds, a page has no matching links, or a page
    contributes no link that was not already seen.

    Args:
        driver: Selenium driver used to load the pages.
        start_url: Search-results URL to paginate.
        max_pages: Optional upper bound on the number of pages to visit; a
            falsy value (``None`` or ``0``) means no limit.

    Returns:
        A list of unique link URLs in the order they were first encountered
        (page by page, in document order within each page).
    """
    all_links, seen = [], set()
    page = 1

    while True:
        if max_pages and page > max_pages:
            break

        page_url = add_or_replace_query_param(start_url, "strona", page)
        print(f"[PAGE] {page_url}")
        driver.get(page_url)

        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
        except Exception:
            # Still best-effort on timeouts/driver errors, but no longer
            # swallows KeyboardInterrupt/SystemExit like a bare ``except``.
            print("  brak treści, przerywam")
            break

        anchors = driver.find_elements(By.CSS_SELECTOR, "a[href*='projekt_id=']")
        hrefs = (a.get_attribute("href") for a in anchors)
        # dict.fromkeys de-duplicates while keeping document order, so the
        # resulting link order is reproducible between runs (a set is not).
        page_links = list(dict.fromkeys(h for h in hrefs if h))

        if not page_links:
            print("  brak linków, koniec")
            break

        new_count = 0
        for href in page_links:
            if href not in seen:
                seen.add(href)
                all_links.append(href)
                new_count += 1

        print(f"  {len(page_links)} linków na stronie, nowych: {new_count}, łącznie: {len(all_links)}")

        # A page with nothing new means we ran past the last results page
        # (or the site keeps returning the same page).
        if new_count == 0:
            break

        page += 1
        time.sleep(0.3)  # small pause between page requests

    return all_links


# --- Parser projektu ---
def parse_project(driver, url):
    """Load a project page and scrape its fields into a dict.

    Args:
        driver: Selenium driver used to load and query the page.
        url: Address of the project page.

    Returns:
        A dict with the keys ``url``, ``tytul``, ``id``, ``slowa_kluczowe``,
        ``deskryptory``, ``panel``, ``jednostka``, ``kierownik``, ``kwota``,
        ``start``, ``koniec``, ``status``, ``opis_pdf`` (PDF link or ``None``),
        ``streszczenie_pdf`` (extracted PDF text or ``None``) and
        ``publikacje`` (list of dicts with ``tytul``, ``autorzy``,
        ``czasopismo``, ``doi``). Fields that cannot be found are ``None``
        (or an empty list for the list-valued ones).

    Raises:
        Whatever ``driver.get`` or the 10-second wait for the ``.important``
        element raises; individual field lookups never raise.
    """
    driver.get(url)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, ".important"))
    )
    data = {"url": url}

    def safe(css=None, xpath=None):
        """Return the stripped text of the first match, or None if not found."""
        if css:
            locator = (By.CSS_SELECTOR, css)
        elif xpath:
            locator = (By.XPATH, xpath)
        else:
            return None
        try:
            return driver.find_element(*locator).text.strip()
        except Exception:
            # Missing element (or driver error): treat the field as absent.
            return None

    def texts(css):
        """Return the non-empty stripped texts of all elements matching *css*."""
        stripped = (el.text.strip() for el in driver.find_elements(By.CSS_SELECTOR, css))
        return [t for t in stripped if t]

    data["tytul"] = safe(css=".important h2")
    data["id"] = safe(css=".important p.row2")

    data["slowa_kluczowe"] = texts("p.row2 span.frazy a")
    data["deskryptory"] = texts(".important ul li")
    data["panel"] = safe(xpath="//p[strong[contains(text(),'Panel')]]/following-sibling::p[1]") or safe(css="p.wciecie")
    data["jednostka"] = safe(xpath="//p[strong[contains(text(),'Jednostka realizująca')]]/following-sibling::p[1]")
    data["kierownik"] = safe(xpath="//p[strong[contains(text(),'Kierownik projektu')]]/following-sibling::p[1]")

    # Details: paragraphs recognised by their leading label; the whole
    # paragraph text (label included) is stored.
    detail_prefixes = {
        "Przyznana kwota": "kwota",
        "Rozpoczęcie projektu": "start",
        "Zakończenie projektu": "koniec",
        "Status projektu": "status",
    }
    data.update(dict.fromkeys(detail_prefixes.values()))
    for el in driver.find_elements(By.CSS_SELECTOR, ".strona p"):
        txt = el.text.strip()
        for prefix, field in detail_prefixes.items():
            if txt.startswith(prefix):
                data[field] = txt
                break

    # PDF: first link ending in ".pdf", if any.
    try:
        pdf_url = driver.find_element(By.CSS_SELECTOR, "a[href$='.pdf']").get_attribute("href")
    except Exception:
        pdf_url = None
    data["opis_pdf"] = pdf_url
    data["streszczenie_pdf"] = extract_pdf_text_with_cookies(driver, pdf_url) if pdf_url else None

    # Publications: each field is optional and keeps its default when absent.
    pub_selectors = {
        "tytul": ".tytul strong",
        "autorzy": ".autorzy em",
        "czasopismo": ".czasopismo em",
        "doi": ".doi .prawa",
    }
    publikacje = []
    for pub in driver.find_elements(By.CSS_SELECTOR, "li.publikacje"):
        pub_data = {"tytul": "", "autorzy": "", "czasopismo": "", "doi": None}
        for field, selector in pub_selectors.items():
            try:
                pub_data[field] = pub.find_element(By.CSS_SELECTOR, selector).text.strip()
            except Exception:
                pass  # field not present for this publication

        # Skip list items that yielded no data at all.
        if any(pub_data.values()):
            publikacje.append(pub_data)
    data["publikacje"] = publikacje

    return data


# --- główna funkcja ---
def run_all(max_pages=None, headless=True):
    """Run the whole scrape: collect project links, parse each, write JSON.

    Results are written to ``OUTPUT_JSON`` after every ``CHECKPOINT_EVERY``
    processed projects and once more at the end. A project whose parsing
    raises is recorded as ``{"url": ..., "error": ...}`` and the run continues.
    If the run is interrupted or fails unexpectedly, the records gathered so
    far are saved before the error propagates. The driver is always quit.

    Args:
        max_pages: Optional limit on the number of result pages to walk,
            passed through to the link collector.
        headless: Whether to start Firefox in headless mode.
    """

    def save(records):
        """Write *records* to OUTPUT_JSON as pretty-printed UTF-8 JSON."""
        with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
            json.dump(records, f, ensure_ascii=False, indent=2)

    driver = get_driver(headless)
    results = []
    try:
        links = collect_all_project_links_by_page_param(driver, START_URL, max_pages=max_pages)
        print(f"\nZnaleziono {len(links)} linków.\n")

        for i, url in enumerate(links, 1):
            print(f"[{i}/{len(links)}] {url}")
            try:
                results.append(parse_project(driver, url))
            except Exception as e:
                # Keep going; remember which project failed and why.
                results.append({"url": url, "error": str(e)})
                print(f"  !! błąd: {e}")

            if i % CHECKPOINT_EVERY == 0:
                save(results)
                print(f"  >> checkpoint zapisany ({i})")

        save(results)
        print(f"\nGotowe! Zapisano {len(results)} rekordów do {OUTPUT_JSON}")

    except BaseException:
        # Interrupted (e.g. Ctrl+C) or failed: do not lose the records scraped
        # since the last checkpoint. Nothing is written when nothing was
        # scraped, so an existing output file is not clobbered.
        if results:
            save(results)
        raise
    finally:
        driver.quit()

# Entry point. For a quick trial run, call run_all(max_pages=2) instead.
# Guarded so that importing this code does not start a full crawl; in a
# notebook or when run as a script __name__ is "__main__" and it still runs.
if __name__ == "__main__":
    run_all()


[PAGE] https://projekty.ncn.gov.pl/index.php?jednostka=Uniwersytet+Adama+Mickiewicza&jednostka_miasto=&jednostka_wojewodztwo=&kierownik=&kierownik_plec=&kierownik_tytul=&status=&projekt=&kwotaprzyznanaod=8+375&kwotaprzyznanado=7+209+600&typkonkursu=&konkurs=&grupa=&panel=&slowokluczowe=&aparatura=&strona=1
  50 linków na stronie, nowych: 50, łącznie: 50
[PAGE] https://projekty.ncn.gov.pl/index.php?jednostka=Uniwersytet+Adama+Mickiewicza&jednostka_miasto=&jednostka_wojewodztwo=&kierownik=&kierownik_plec=&kierownik_tytul=&status=&projekt=&kwotaprzyznanaod=8+375&kwotaprzyznanado=7+209+600&typkonkursu=&konkurs=&grupa=&panel=&slowokluczowe=&aparatura=&strona=2
  50 linków na stronie, nowych: 50, łącznie: 100
[PAGE] https://projekty.ncn.gov.pl/index.php?jednostka=Uniwersytet+Adama+Mickiewicza&jednostka_miasto=&jednostka_wojewodztwo=&kierownik=&kierownik_plec=&kierownik_tytul=&status=&projekt=&kwotaprzyznanaod=8+375&kwotaprzyznanado=7+209+600&typkonkursu=&konkurs=&grupa=&panel=&slowokluczowe=