In [None]:
import time
import requests
import pandas as pd
from tqdm import tqdm
from datetime import datetime, timedelta

BASE_URL = "https://api.hh.ru"

HEADERS = {
    "User-Agent": "HSE-DataAnalyst-Project/1.0 (tg: @oa_sht)"
}

MOSCOW_AREA_ID = 1


def hh_get(url: str, params: dict | None = None, max_retries: int = 6) -> dict:
    """Send a GET request to the HH API and return the decoded JSON body.

    Transient failures are retried: connection errors and timeouts raised by
    ``requests``, and HTTP 429/500/502/503/504 responses. The pause between
    attempts uses a numeric ``Retry-After`` header when the server sends one;
    otherwise it grows by 0.5 s per attempt, capped at 10 s.

    Args:
        url: Absolute URL to request.
        params: Optional query-string parameters.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The parsed JSON payload of the first HTTP 200 response.

    Raises:
        RuntimeError: On a non-retryable status code, or when every attempt
            failed (chained to the last network error, if any).
    """
    retryable_statuses = (429, 500, 502, 503, 504)
    last_error = None

    for attempt in range(1, max_retries + 1):
        backoff = min(10.0, 0.5 * attempt)
        wait = backoff

        try:
            r = requests.get(url, headers=HEADERS, params=params, timeout=30)
        except (requests.ConnectionError, requests.Timeout) as exc:
            # Network-level hiccups are as transient as a 5xx: retry them too.
            last_error = exc
        else:
            if r.status_code == 200:
                return r.json()
            if r.status_code not in retryable_statuses:
                raise RuntimeError(f"HH API error {r.status_code}: {r.text[:500]}")

            last_error = None
            retry_after = r.headers.get("Retry-After")
            if retry_after:
                try:
                    wait = float(retry_after)
                except ValueError:
                    # Retry-After may be an HTTP-date instead of seconds;
                    # fall back to our own backoff in that case.
                    wait = backoff
                # Guard against negative / NaN / inf, which time.sleep rejects.
                if not 0.0 <= wait < float("inf"):
                    wait = backoff

        # No point sleeping after the final attempt: we are about to give up.
        if attempt < max_retries:
            time.sleep(wait)

    raise RuntimeError(f"Failed after {max_retries} retries: {url}") from last_error


def iso(dt: datetime) -> str:
    """Format ``dt`` as ``YYYY-MM-DDTHH:MM:SS`` (second precision, no UTC offset)."""
    return f"{dt:%Y-%m-%dT%H:%M:%S}"


def iter_day_windows(start_dt: datetime, end_dt: datetime):
    """Yield consecutive ``(window_start, window_end)`` pairs covering the range.

    Each window is at most one day long; the final window is clamped so that
    it ends exactly at ``end_dt``. Nothing is yielded when ``start_dt`` is not
    earlier than ``end_dt``.
    """
    one_day = timedelta(days=1)
    window_start = start_dt
    while window_start < end_dt:
        window_end = window_start + one_day
        if window_end > end_dt:
            # Last (partial) window: do not run past the requested end.
            window_end = end_dt
        yield window_start, window_end
        window_start = window_end


def search_ids(date_from_dt: datetime, date_to_dt: datetime, area: int = MOSCOW_AREA_ID,
               per_page: int = 100, max_pages: int = 200) -> list[str]:
    """Collect the IDs of vacancies returned by the search endpoint for a time window.

    Result pages are requested one after another until a page comes back
    without items, the page count reported in the response is reached, or
    ``max_pages`` pages have been requested.

    Args:
        date_from_dt: Start of the window, sent as ``date_from``.
        date_to_dt: End of the window, sent as ``date_to``.
        area: Area identifier sent as the ``area`` filter.
        per_page: Number of items requested per page.
        max_pages: Upper bound on the number of pages to request.

    Returns:
        Vacancy IDs as strings, de-duplicated, in first-seen order.
    """
    endpoint = f"{BASE_URL}/vacancies"
    # A dict doubles as an insertion-ordered set of IDs.
    seen: dict[str, None] = {}

    page = 0
    while page < max_pages:
        query = {
            "text": " ",
            "area": area,
            "per_page": per_page,
            "page": page,
            "date_from": iso(date_from_dt),
            "date_to": iso(date_to_dt),
        }
        payload = hh_get(endpoint, params=query)

        page_items = payload.get("items", [])
        if not page_items:
            break

        for item in page_items:
            seen.setdefault(str(item["id"]), None)

        total_pages = payload.get("pages")
        if total_pages is not None and page + 1 >= total_pages:
            # That was the last page the response reports.
            break

        # Brief pause between page requests.
        time.sleep(0.2)
        page += 1

    return list(seen)


def fetch_vacancy(vacancy_id: str) -> dict:
    """Return the JSON payload of ``{BASE_URL}/vacancies/{vacancy_id}`` via ``hh_get``."""
    detail_url = f"{BASE_URL}/vacancies/{vacancy_id}"
    return hh_get(detail_url)


def flatten(v: dict) -> dict:
    """Convert a vacancy payload into a flat, single-level record.

    Nested objects (salary, area, employer, experience, employment, schedule)
    are reduced to selected scalar fields; missing or null sections yield
    ``None`` values. Professional roles and key skills are reduced to their
    non-empty names and joined with ``", "``.

    Args:
        v: Vacancy payload as a dict.

    Returns:
        A dict of scalar values, with keys in a fixed order.
    """

    def section(key: str) -> dict:
        # A missing key and an explicit null both collapse to an empty dict.
        return v.get(key) or {}

    def names_of(key: str) -> list:
        # Keep only entries that carry a non-empty "name".
        return [label for entry in (v.get(key) or []) if (label := entry.get("name"))]

    salary_info = section("salary")
    area_info = section("area")
    employer_info = section("employer")
    experience_info = section("experience")
    employment_info = section("employment")
    schedule_info = section("schedule")

    role_labels = names_of("professional_roles")
    skill_labels = names_of("key_skills")

    # The first listed role is treated as the main one.
    primary_role = next(iter(role_labels), None)
    roles_joined = ", ".join(role_labels) or None
    skills_joined = ", ".join(skill_labels) or None

    return {
        "id": v.get("id"),
        "published_at": v.get("published_at"),
        "name": v.get("name"),
        "area_id": area_info.get("id"),
        "area_name": area_info.get("name"),
        "employer_id": employer_info.get("id"),
        "employer_name": employer_info.get("name"),
        "salary_from": salary_info.get("from"),
        "salary_to": salary_info.get("to"),
        "salary_currency": salary_info.get("currency"),
        "salary_gross": salary_info.get("gross"),
        "experience": experience_info.get("id"),
        "employment": employment_info.get("id"),
        "schedule": schedule_info.get("id"),
        "professional_role_main": primary_role,
        "professional_roles_all": roles_joined,
        "key_skills": skills_joined,
        "key_skills_count": len(skill_labels),
        "description": v.get("description"),
        "alternate_url": v.get("alternate_url"),
    }


def main(start: datetime | None = None, end: datetime | None = None,
         out: str = "vacancies_moscow_last30days.csv") -> None:
    """Collect Moscow vacancies for a date range and save them to a CSV file.

    Vacancy IDs are gathered one day-window at a time with ``search_ids`` and
    de-duplicated. Each vacancy is then downloaded with ``fetch_vacancy``,
    flattened with ``flatten`` and written to ``out``. A failure on an
    individual vacancy is printed and counted but does not stop the run.

    Args:
        start: Beginning of the collection range. Defaults to 2026-01-20 00:00.
        end: End of the collection range. Defaults to 2026-02-03 00:00.
        out: Path of the CSV file to write. The default name is kept unchanged
            for backward compatibility, although the default range spans
            14 days rather than 30.
    """
    if start is None:
        start = datetime(2026, 1, 20, 0, 0, 0)
    if end is None:
        end = datetime(2026, 2, 3, 0, 0, 0)

    all_ids = []
    print("Collecting vacancy IDs day-by-day...")
    for d_from, d_to in iter_day_windows(start, end):
        ids = search_ids(d_from, d_to, area=MOSCOW_AREA_ID)
        all_ids.extend(ids)
        print(f"{d_from.date()} -> {len(ids)} ids (cumulative {len(all_ids)})")
        time.sleep(0.2)

    # The same vacancy can be returned for more than one window; keep the
    # first occurrence of each ID and preserve order.
    all_ids = list(dict.fromkeys(all_ids))
    print("Unique vacancy IDs:", len(all_ids))

    rows = []
    errors = 0
    for vid in tqdm(all_ids, desc="Downloading vacancy details"):
        try:
            rows.append(flatten(fetch_vacancy(vid)))
        except Exception as e:
            # Best-effort download: report the failing vacancy and move on.
            errors += 1
            print(f"\nFailed id={vid}: {e}")
        time.sleep(0.2)

    df = pd.DataFrame(rows)
    df.to_csv(out, index=False, encoding="utf-8-sig")
    print(f"\nSaved {df.shape[0]} rows, errors={errors} -> {out}")


# Entry point: run the full collection pipeline when this code is executed
# directly (``__name__`` is "__main__") rather than imported as a module.
if __name__ == "__main__":
    main()

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import shutil

# Copy the CSV from the current working directory into the MyDrive folder
# under /content/drive.
shutil.copy(
    "vacancies_moscow_last30days.csv",
    "/content/drive/MyDrive/vacancies_moscow_last30days.csv"
)


'/content/drive/MyDrive/vacancies_moscow_last30days.csv'