In [None]:
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urlparse

# Listing index pages on krisha.kz to scrape, one URL per entry. The last path
# segment of each URL (e.g. "almaty-alatauskij") is used as the output file
# name by scrape_district.
DISTRICT_URLS = [
    "https://krisha.kz/prodazha/kvartiry/almaty-alatauskij/",
    "https://krisha.kz/prodazha/kvartiry/almaty-almalinskij/",
    "https://krisha.kz/prodazha/kvartiry/almaty-aujezovskij/",
    "https://krisha.kz/prodazha/kvartiry/almaty-bostandykskij/",
    "https://krisha.kz/prodazha/kvartiry/almaty-zhetysuskij/",
    "https://krisha.kz/prodazha/kvartiry/almaty-medeuskij/",
    "https://krisha.kz/prodazha/kvartiry/almaty-nauryzbajskiy/",
    "https://krisha.kz/prodazha/kvartiry/almaty-turksibskij/",
]

# HTTP headers sent with every listing-page request: a desktop Chrome
# User-Agent string and a Russian Accept-Language preference.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/122.0.0.0 Safari/537.36",
    "Accept-Language": "ru-RU,ru;q=0.9",
}


def get_slug_from_url(url: str) -> str:
    """Return the last segment of the URL path.

    Leading and trailing slashes of the path are ignored, so
    ``https://host/a/b/`` yields ``"b"``. A URL whose path is empty or
    consists only of slashes yields an empty string.
    """
    trimmed_path = urlparse(url).path.strip("/")
    # rpartition keeps the whole string as the tail when no "/" is present.
    _, _, slug = trimmed_path.rpartition("/")
    return slug


def scrape_district(base_url: str, max_pages: int = 5):
    """Collect advert ids from one district's listing pages and save them to CSV.

    Pages ``?page=1`` .. ``?page=max_pages`` of ``base_url`` are requested in
    order. Every ``<div>`` carrying a non-empty ``data-product-id`` attribute
    contributes one id. The unique ids are written, sorted, to ``<slug>.csv``
    (single ``product_id`` column) in the current directory, where ``<slug>``
    is the last path segment of ``base_url``.

    Pagination stops early when:
      * a request fails (network error or non-2xx status) - the ids gathered
        so far are still saved instead of being lost with the exception;
      * a page contains no listing cards at all, so further pages are not
        requested.

    Args:
        base_url: Listing index URL of the district.
        max_pages: Upper bound on the number of pages to request.
    """
    slug = get_slug_from_url(base_url)
    print(f"\n== Район: {slug} ==")

    product_ids = set()

    for page in range(1, max_pages + 1):
        url = f"{base_url}?page={page}"

        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()
        except requests.RequestException as e:
            # Keep what was already collected; one bad page must not discard
            # the results of all the previous ones.
            print(f"  [WARN] страница {page}: {e} — прекращаем обход района")
            break

        soup = BeautifulSoup(resp.text, "html.parser")

        page_ids = set()
        for div in soup.find_all("div", attrs={"data-product-id": True}):
            pid = div.get("data-product-id")
            if pid:
                page_ids.add(pid)

        if not page_ids:
            # No listing cards on this page: nothing left to paginate through.
            break

        product_ids |= page_ids

    print(f"  Найдено объявлений: {len(product_ids)}")

    filename = f"{slug}.csv"
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["product_id"])
        writer.writerows([pid] for pid in sorted(product_ids))

    print(f"  Сохранено в файл: {filename}")


# Entry point: scrape every configured district, allowing up to 340 listing
# pages per district.
if __name__ == "__main__":
    for url in DISTRICT_URLS:
        scrape_district(url, max_pages=340)



== Район: almaty-turksibskij ==
  Парсим страницу: https://krisha.kz/prodazha/kvartiry/almaty-turksibskij/?page=1
  Парсим страницу: https://krisha.kz/prodazha/kvartiry/almaty-turksibskij/?page=2
  Парсим страницу: https://krisha.kz/prodazha/kvartiry/almaty-turksibskij/?page=3
  Парсим страницу: https://krisha.kz/prodazha/kvartiry/almaty-turksibskij/?page=4
  Парсим страницу: https://krisha.kz/prodazha/kvartiry/almaty-turksibskij/?page=5
  Парсим страницу: https://krisha.kz/prodazha/kvartiry/almaty-turksibskij/?page=6
  Парсим страницу: https://krisha.kz/prodazha/kvartiry/almaty-turksibskij/?page=7
  Парсим страницу: https://krisha.kz/prodazha/kvartiry/almaty-turksibskij/?page=8
  Парсим страницу: https://krisha.kz/prodazha/kvartiry/almaty-turksibskij/?page=9
  Парсим страницу: https://krisha.kz/prodazha/kvartiry/almaty-turksibskij/?page=10
  Парсим страницу: https://krisha.kz/prodazha/kvartiry/almaty-turksibskij/?page=11
  Парсим страницу: https://krisha.kz/prodazha/kvartiry/almaty-t

In [None]:
import csv
import glob
import time
import random
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# URL template of a single advert page; "{id}" is replaced with the advert id.
BASE_URL = "https://krisha.kz/a/show/{id}"

# HTTP headers sent with every advert-page request: a desktop Chrome
# User-Agent string and a Russian Accept-Language preference.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/125.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "ru-RU,ru;q=0.9",
}

# Single requests session shared by every advert-page request in this script.
SESSION = requests.Session()


def get_jsdata_raw(advert_id: str) -> str:
    """Download an advert page and return the text of its ``jsdata`` script.

    Args:
        advert_id: Advert id substituted into ``BASE_URL``.

    Returns:
        The text content of ``<script id="jsdata">``, or an empty string
        (after printing a warning) when the page has no such element.

    Raises:
        requests.HTTPError: The response status is not successful.
    """
    response = SESSION.get(
        BASE_URL.format(id=advert_id), headers=HEADERS, timeout=15
    )
    response.raise_for_status()

    page = BeautifulSoup(response.text, "html.parser")
    jsdata_tag = page.find("script", id="jsdata")
    if jsdata_tag is not None:
        return jsdata_tag.get_text()

    print(f"[WARN] jsdata не найдено для id={advert_id}")
    return ""


def process_csv(src_path: str):
    """Fetch raw ``jsdata`` for every advert id in a source CSV, resumably.

    Reads the ``product_id`` column of ``src_path`` and appends one
    ``id,jsdata`` row per advert to ``<stem>_detailed.csv`` next to it. Ids
    already present in the output file are skipped, so an interrupted run can
    simply be started again. Ids whose download raises are reported and left
    out of the output, which makes them eligible for the next run.

    Args:
        src_path: Path to a CSV file with a ``product_id`` column.
    """
    src_path = Path(src_path)
    dst_path = src_path.with_name(src_path.stem + "_detailed.csv")

    print(f"\n=== Обрабатываем {src_path.name} -> {dst_path.name} ===")

    # Ids written by previous runs.
    processed_ids = set()
    if dst_path.exists():
        with dst_path.open("r", newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                pid = row.get("id")
                if pid:
                    processed_ids.add(pid)

    # Pending ids in source order; `queued` prevents fetching an id twice when
    # the source file lists it more than once.
    ids_to_process = []
    queued = set()
    with src_path.open("r", newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            pid = (row.get("product_id") or "").strip()
            if not pid or pid in processed_ids or pid in queued:
                continue
            queued.add(pid)
            ids_to_process.append(pid)

    total = len(ids_to_process)
    if total == 0:
        print("Все id уже обработаны, пропускаем.")
        return

    print(f"Нужно обработать ещё {total} объявлений.")

    # A zero-byte output file (e.g. left by a killed run) still needs the
    # header; without it the next run would read the first data row as the
    # header and fail to recognise any processed id.
    needs_header = not dst_path.exists() or dst_path.stat().st_size == 0
    with dst_path.open("a", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["id", "jsdata"])
        if needs_header:
            writer.writeheader()

        processed_now = 0

        for pid in ids_to_process:
            try:
                jsdata = get_jsdata_raw(pid)
            except Exception as e:
                # Best-effort batch job: report the failure and move on.
                print(f"[ERROR] id={pid}: {e}")
                continue

            writer.writerow({"id": pid, "jsdata": jsdata})
            processed_now += 1

            if processed_now % 50 == 0:
                # Periodic flush bounds how much work a crash can lose.
                f_out.flush()
                print(
                    f"[{src_path.name}] обработано ещё 50; "
                    f"всего {processed_now} из {total}, последний id={pid}"
                )

            # Randomised pause between successful requests.
            time.sleep(random.uniform(0.5, 1.5))

        f_out.flush()
        print(
            f"Готово: файл {dst_path.name}, обработано {processed_now} из {total} "
            f"(включая старые уже было {len(processed_ids)})"
        )


def main():
    """Run ``process_csv`` on every base ``almaty-*.csv`` in the current directory.

    Files produced by the scraping steps themselves (``*_detailed.csv`` and
    ``*_params.csv``) also match the ``almaty-*.csv`` glob; they are excluded
    so that outputs are not fed back in as id sources.
    """
    derived_suffixes = ("_detailed.csv", "_params.csv")
    csv_files = sorted(
        p for p in glob.glob("almaty-*.csv") if not p.endswith(derived_suffixes)
    )
    if not csv_files:
        print("Нет файлов almaty-*.csv")
        return

    for path in csv_files:
        process_csv(path)


if __name__ == "__main__":
    main()



=== Обрабатываем almaty-medeuskij.csv -> almaty-medeuskij_detailed.csv ===
Нужно обработать ещё 3698 объявлений.
[almaty-medeuskij.csv] обработано ещё 50; всего 50 из 3698, последний id=1000649297
[almaty-medeuskij.csv] обработано ещё 50; всего 100 из 3698, последний id=1001061774
[almaty-medeuskij.csv] обработано ещё 50; всего 150 из 3698, последний id=1001697443
[almaty-medeuskij.csv] обработано ещё 50; всего 200 из 3698, последний id=1002127494
[almaty-medeuskij.csv] обработано ещё 50; всего 250 из 3698, последний id=1002600085
[almaty-medeuskij.csv] обработано ещё 50; всего 300 из 3698, последний id=1002860818
[almaty-medeuskij.csv] обработано ещё 50; всего 350 из 3698, последний id=1003321267
[almaty-medeuskij.csv] обработано ещё 50; всего 400 из 3698, последний id=1003565927
[almaty-medeuskij.csv] обработано ещё 50; всего 450 из 3698, последний id=1003873758
[almaty-medeuskij.csv] обработано ещё 50; всего 500 из 3698, последний id=1004073157
[almaty-medeuskij.csv] обработано ещё

In [None]:
import csv
import glob
import json
import random
import time
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# URL template of a single advert page; "{id}" is replaced with the advert id.
BASE_URL = "https://krisha.kz/a/show/{id}"

# HTTP headers sent with every advert-page request: a desktop Chrome
# User-Agent string and a Russian Accept-Language preference.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/125.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "ru-RU,ru;q=0.9",
}

# Single requests session shared by every advert-page request in this script.
SESSION = requests.Session()


def parse_offer_params(advert_id: str):
    """Download an advert page and extract its short and full parameter lists.

    Both results map a key to ``{"title": ..., "value": ...}``. The key is the
    element's ``data-name`` attribute when present, otherwise its title text;
    entries for which neither is available are dropped.

    Args:
        advert_id: Advert id substituted into ``BASE_URL``.

    Returns:
        ``(short_params, full_params)`` where ``short_params`` comes from the
        ``.offer__short-description .offer__info-item`` elements and
        ``full_params`` from the ``.offer__parameters dl`` elements.

    Raises:
        requests.HTTPError: The response status is not successful.
    """
    response = SESSION.get(
        BASE_URL.format(id=advert_id), headers=HEADERS, timeout=15
    )
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    # Short description: a missing title/value node is recorded as None
    # rather than skipping the item.
    short_params = {}
    for info_item in page.select(".offer__short-description .offer__info-item"):
        title_node = info_item.select_one(".offer__info-title")
        value_node = info_item.select_one(".offer__advert-short-info")

        title = None
        if title_node:
            title = title_node.get_text(strip=True)
        value = None
        if value_node:
            value = value_node.get_text(" ", strip=True)

        key = info_item.get("data-name") or title
        if not key:
            continue
        short_params[key] = {"title": title, "value": value}

    # Full parameter table: only <dl> elements with both <dt> and <dd> count.
    full_params = {}
    for definition in page.select(".offer__parameters dl"):
        term = definition.select_one("dt")
        description = definition.select_one("dd")
        if not (term and description):
            continue

        title = term.get_text(strip=True)
        value = description.get_text(" ", strip=True)

        key = term.get("data-name") or title
        if not key:
            continue
        full_params[key] = {"title": title, "value": value}

    return short_params, full_params


def process_csv(src_path: str):
    """Fetch offer parameters for every advert id in a source CSV, resumably.

    Reads the ``product_id`` column of ``src_path`` and appends one row per
    advert to ``<stem>_params.csv`` next to it, with the short and full
    parameter dictionaries serialised as JSON. Ids already present in the
    output file are skipped, so an interrupted run can simply be started
    again. Ids whose download or parsing raises are reported and left out of
    the output, which makes them eligible for the next run.

    Args:
        src_path: Path to a CSV file with a ``product_id`` column.
    """
    src_path = Path(src_path)
    dst_path = src_path.with_name(src_path.stem + "_params.csv")

    print(f"\n=== Обрабатываем {src_path.name} -> {dst_path.name} ===")

    # Ids written by previous runs.
    processed_ids = set()
    if dst_path.exists():
        with dst_path.open("r", newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                pid = row.get("id")
                if pid:
                    processed_ids.add(pid)

    # Pending ids in source order; `queued` prevents fetching an id twice when
    # the source file lists it more than once.
    ids_to_process = []
    queued = set()
    with src_path.open("r", newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            pid = (row.get("product_id") or "").strip()
            if not pid or pid in processed_ids or pid in queued:
                continue
            queued.add(pid)
            ids_to_process.append(pid)

    total = len(ids_to_process)
    if total == 0:
        print("Все id уже обработаны, пропускаем.")
        return

    print(f"Нужно обработать ещё {total} объявлений.")

    # A zero-byte output file (e.g. left by a killed run) still needs the
    # header; without it the next run would read the first data row as the
    # header and fail to recognise any processed id.
    needs_header = not dst_path.exists() or dst_path.stat().st_size == 0
    with dst_path.open("a", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(
            f_out,
            fieldnames=["id", "short_params_json", "full_params_json"],
        )
        if needs_header:
            writer.writeheader()

        processed_now = 0

        for pid in ids_to_process:
            try:
                short_params, full_params = parse_offer_params(pid)
            except Exception as e:
                # Best-effort batch job: report the failure and move on.
                print(f"[ERROR] id={pid}: {e}")
                continue

            writer.writerow(
                {
                    "id": pid,
                    # ensure_ascii=False keeps Cyrillic text readable in the CSV.
                    "short_params_json": json.dumps(
                        short_params, ensure_ascii=False
                    ),
                    "full_params_json": json.dumps(
                        full_params, ensure_ascii=False
                    ),
                }
            )
            processed_now += 1

            if processed_now % 50 == 0:
                # Periodic flush bounds how much work a crash can lose.
                f_out.flush()
                print(
                    f"[{src_path.name}] обработано ещё 50; "
                    f"всего {processed_now} из {total}, последний id={pid}"
                )

            # Randomised pause between successful requests.
            time.sleep(random.uniform(0.5, 1.5))

        f_out.flush()
        print(
            f"Готово: файл {dst_path.name}, обработано новых {processed_now} из {total} "
            f"(включая уже существующие было {len(processed_ids)})"
        )


def main():
    """Run ``process_csv`` on every base ``almaty-*.csv`` in the current directory.

    Files produced by the scraping steps themselves (``*_detailed.csv`` and
    ``*_params.csv``) also match the ``almaty-*.csv`` glob; they are excluded
    so that outputs are not fed back in as id sources.
    """
    derived_suffixes = ("_detailed.csv", "_params.csv")
    csv_files = sorted(
        p for p in glob.glob("almaty-*.csv") if not p.endswith(derived_suffixes)
    )
    if not csv_files:
        print("Файлы almaty-*.csv не найдены в текущей папке")
        return

    print("Найдено файлов:", ", ".join(Path(p).name for p in csv_files))

    for path in csv_files:
        process_csv(path)

    print("\nВсе файлы обработаны.")


if __name__ == "__main__":
    main()


Файлы almaty-*.csv не найдены в текущей папке


In [None]:
import pandas as pd
import glob
import os
import re

# Directory that is searched for the input CSV files and receives the merged
# output.
DATA_DIR = "."

# Every almaty-*.csv in DATA_DIR; the grouping step below sorts them into
# base / detailed / params files by name.
files = glob.glob(os.path.join(DATA_DIR, "almaty-*.csv"))
files  # bare expression: displays the matched paths as the notebook cell output

def clean_generic(value):
    """Normalise a cell to a single-line string.

    Returns ``None`` for missing values (as judged by ``pd.isna``). Any other
    value is converted with ``str``, stripped of surrounding whitespace, and
    has every remaining line-feed and carriage-return character removed.
    """
    if pd.isna(value):
        return None
    text = str(value).strip()
    # Delete (not replace with a space) embedded line breaks.
    return text.translate(str.maketrans("", "", "\n\r"))

def clean_jsdata(value):
    """Reduce a raw ``jsdata`` script text to its single-line payload.

    Returns ``None`` for missing values (as judged by ``pd.isna``). Otherwise
    the text is stripped, a leading ``window.data =`` assignment is removed,
    one trailing ``;`` is dropped, and all line-feed / carriage-return
    characters are deleted.
    """
    if pd.isna(value):
        return None

    payload = re.sub(r'^window\.data\s*=\s*', '', str(value).strip())
    # Only a single statement terminator is removed, never a run of them.
    payload = payload[:-1] if payload.endswith(";") else payload
    return payload.strip().replace("\n", "").replace("\r", "")

# district -> {"base" | "detailed" | "params": path}
groups = {}

# Splits a file name into the district part (text after "almaty-") and an
# optional "_detailed" / "_params" suffix; no suffix means the base id list.
pattern = re.compile(r"almaty-(.+?)(?:_(detailed|params))?\.csv$")

for path in files:
    name = os.path.basename(path)
    m = pattern.match(name)
    if not m:
        continue
    district = m.group(1)
    kind = m.group(2) or "base"

    groups.setdefault(district, {})[kind] = path

groups  # bare expression: displays the grouping as notebook cell output

# One merged DataFrame per district that has all three files.
all_rows = []

for district, kinds in groups.items():
    base_path     = kinds.get("base")
    detailed_path = kinds.get("detailed")
    params_path   = kinds.get("params")

    # A district is merged only when all three of its files are present.
    if not (base_path and detailed_path and params_path):
        print(f"Пропускаю {district}: не хватает одного из файлов")
        continue

    # The base file may name its key column either "product_id" or "id".
    base_df = pd.read_csv(base_path)
    if "product_id" in base_df.columns:
        base_df = base_df.rename(columns={"product_id": "id"})
    elif "id" in base_df.columns:
        pass
    else:
        print(f"{district}: в base-файле нет product_id/id, пропускаю")
        continue

    base_df = base_df[["id"]]

    detailed_df = pd.read_csv(detailed_path)
    detailed_df = detailed_df[["id", "jsdata"]]

    params_df = pd.read_csv(params_path)
    params_df = params_df[["id", "short_params_json", "full_params_json"]]

    # Inner joins: only ids present in all three files survive.
    # NOTE(review): an id that occurs more than once in any input would
    # multiply rows here - confirm the inputs are unique per id.
    merged = (
        base_df
        .merge(detailed_df, on="id", how="inner")
        .merge(params_df, on="id", how="inner")
    )

    merged["district"] = district

    # Normalise every column to single-line strings; jsdata additionally
    # loses its "window.data =" prefix and trailing ";".
    merged["id"] = merged["id"].apply(clean_generic)
    merged["district"] = merged["district"].apply(clean_generic)
    merged["jsdata"] = merged["jsdata"].apply(clean_jsdata)
    merged["short_params_json"] = merged["short_params_json"].apply(clean_generic)
    merged["full_params_json"] = merged["full_params_json"].apply(clean_generic)

    # Fix the output column order.
    merged = merged[["id", "district", "jsdata", "short_params_json", "full_params_json"]]

    all_rows.append(merged)

# Fall back to an empty frame with the expected columns when no district
# could be merged.
if all_rows:
    result_df = pd.concat(all_rows, ignore_index=True)
else:
    result_df = pd.DataFrame(columns=["id", "district", "jsdata",
                                      "short_params_json", "full_params_json"])

result_df.head()  # bare expression: notebook preview of the first rows


# Persist the combined table next to the inputs.
output_path = os.path.join(DATA_DIR, "almaty_merged.csv")
result_df.to_csv(output_path, index=False)
output_path




In [None]:
import pandas as pd
import json

# Merged table read by this cell (columns used below: id, district, jsdata,
# short_params_json, full_params_json).
INPUT_FILE = "almaty_merged.csv"
# Flattened, one-row-per-advert table written by this cell.
OUTPUT_FILE = "almaty_parsed.csv"

def safe_json_load(raw, col_name, row_id, err_dict):
    """Decode a JSON cell, recording failures instead of raising.

    Args:
        raw: Cell value: a JSON string, an already-decoded dict/list, or a
            missing value.
        col_name: Key of ``err_dict`` under which a decoding failure is logged.
        row_id: Identifier stored alongside the error message.
        err_dict: Mapping of error category to a list of ``(row_id, message)``
            tuples; the list under ``col_name`` is appended to on failure.

    Returns:
        * ``raw`` itself when it is already a dict or list;
        * ``{}`` when ``raw`` is missing, ``""`` or ``"0"``;
        * the decoded value when ``raw`` is valid JSON;
        * ``None`` when decoding fails (the failure is logged in ``err_dict``).
    """
    # Containers must be tested first: pd.isna() applied to a list returns an
    # element-wise array, and using that array in a boolean expression raises
    # ValueError.
    if isinstance(raw, (dict, list)):
        return raw
    # "0" is treated like an empty cell, same as "" and NaN/None.
    if pd.isna(raw) or raw == "" or raw == "0":
        return {}
    try:
        return json.loads(raw)
    except (TypeError, ValueError, RecursionError) as e:
        # TypeError: raw is not str/bytes; ValueError: malformed JSON
        # (json.JSONDecodeError is a ValueError); RecursionError: nesting
        # deeper than the decoder can handle.
        err_dict[col_name].append((row_id, str(e)))
        return None


def get_nested(d, path, default=None):
    """Follow ``path`` (an iterable of keys) through nested dicts.

    Returns the value reached after the last key, or ``default`` as soon as
    the current node is not a dict or lacks the next key. An empty ``path``
    returns ``d`` unchanged.
    """
    node = d
    for step in path:
        if not isinstance(node, dict) or step not in node:
            return default
        node = node[step]
    return node

# low_memory=False makes pandas read the file in one pass instead of in
# chunks.
df = pd.read_csv(INPUT_FILE, low_memory=False)

# Error buckets filled while parsing; each list holds (row_id, message) tuples.
errors = {
    "jsdata_parse": [],
    "short_params_parse": [],
    "full_params_parse": [],
    "adverts_len": [],
}

# Flattened output rows (one dict per successfully parsed advert) and the ids
# of rows rejected along the way.
valid_rows = []
bad_row_ids = set()

for idx, row in df.iterrows():
    # Fall back to the positional index when the frame has no "id" column.
    row_id = row.get("id", idx)

    # jsdata
    js = safe_json_load(row.get("jsdata"), "jsdata_parse", row_id, errors)
    if js is None:
        bad_row_ids.add(row_id)
        continue

    # Valid JSON is not necessarily an object: a list, number or string would
    # make the .get() calls below raise and abort the whole run, so reject
    # such rows as parse errors instead.
    if not isinstance(js, dict):
        errors["jsdata_parse"].append(
            (row_id, f"top-level JSON value is {type(js).__name__}, expected object")
        )
        bad_row_ids.add(row_id)
        continue

    # Exactly one entry is expected in jsdata["adverts"]; anything else is
    # rejected and counted separately.
    adverts = js.get("adverts")
    if not isinstance(adverts, list) or len(adverts) != 1:
        errors["adverts_len"].append((row_id, f"len(adverts) = {len(adverts) if isinstance(adverts, list) else 'not list'}"))
        bad_row_ids.add(row_id)
        continue

    # `or {}` also covers an explicit JSON null.
    advert_root = js.get("advert", {}) or {}
    advert0 = adverts[0] or {}

    short = safe_json_load(row.get("short_params_json"), "short_params_parse", row_id, errors)
    if short is None:
        bad_row_ids.add(row_id)
        continue

    full = safe_json_load(row.get("full_params_json"), "full_params_parse", row_id, errors)
    if full is None:
        bad_row_ids.add(row_id)
        continue

    # Only reachable for a repeated id: a row is also dropped when an earlier
    # row with the same id was rejected.
    if row_id in bad_row_ids:
        continue

    out = {}

    out["id"] = row.get("id")
    out["district"] = row.get("district")

    # Fields taken from jsdata["advert"].
    out["price"] = advert_root.get("price")
    out["title"] = advert_root.get("title")
    out["addressTitle"] = advert_root.get("addressTitle")
    out["square"] = advert_root.get("square")
    out["rooms"] = advert_root.get("rooms")
    out["ownerName"] = advert_root.get("ownerName")

    out["lat"] = get_nested(advert_root, ["map", "lat"])
    out["lon"] = get_nested(advert_root, ["map", "lon"])

    # Fields taken from the single entry of jsdata["adverts"].
    out["priceM2"] = advert0.get("priceM2")
    out["daysInLive"] = advert0.get("daysInLive")
    out["description"] = advert0.get("description")
    out["isOwner"] = get_nested(advert0, ["owner", "isOwner"])
    out["addedAt"] = advert0.get("addedAt")
    out["createdAt"] = advert0.get("createdAt")

    # Full parameters override short ones that share the same key.
    combined_params = {}
    if isinstance(short, dict):
        combined_params.update(short)
    if isinstance(full, dict):
        combined_params.update(full)

    # Each parameter becomes two columns: <key>_title and <key>_value.
    for key, val in combined_params.items():
        if isinstance(val, dict):
            out[f"{key}_title"] = val.get("title")
            out[f"{key}_value"] = val.get("value")

    valid_rows.append(out)

# Materialise the flattened rows and write them out.
parsed_df = pd.DataFrame(valid_rows)
parsed_df.to_csv(OUTPUT_FILE, index=False)

# Summary counters: every input row that did not make it into the output is
# counted as problematic.
total_rows = len(df)
valid_count = len(parsed_df)
bad_count = total_rows - valid_count

print(
    f"Всего строк во входном файле: {total_rows}",
    f"Успешно распарсено и добавлено в выходной файл: {valid_count}",
    f"Проблемных строк (исключены из результата): {bad_count}\n",
    sep="\n",
)

print(
    "Ошибки парсинга jsdata: " + str(len(errors["jsdata_parse"])),
    "Ошибки парсинга short_params_json: " + str(len(errors["short_params_parse"])),
    "Ошибки парсинга full_params_json: " + str(len(errors["full_params_parse"])),
    "Строки с len(jsdata.adverts) != 1: " + str(len(errors["adverts_len"])),
    sep="\n",
)

print("\nПервые 20 проблемных строк (id, причина):")
# Flatten the buckets into (category, row_id, message) tuples, keeping the
# category order used in the summary above.
all_errs = [
    (category,) + entry
    for category in (
        "jsdata_parse",
        "short_params_parse",
        "full_params_parse",
        "adverts_len",
    )
    for entry in errors[category]
]
for err in all_errs[:20]:
    kind, row_id, msg = err
    print(f"{kind}: id={row_id}, {msg}")

print("\nКоличество столбцов в almaty_parsed.csv:", len(parsed_df.columns))
