# Imports and config

In [None]:
import cloudscraper
import csv
import json
import pandas as pd
import os
import random
import re
import requests
import time
from bs4 import BeautifulSoup
from datetime import date
from pathlib import Path
from urllib.parse import urljoin

# (connect timeout, read timeout) in seconds; used as the default `timeout` for page fetches.
DEFAULT_TIMEOUT = (5, 30)

# Load private config (site URL, floorplans path, output CSV location).
CONFIG_PATH = Path("config.local.json")
if not CONFIG_PATH.exists():
    raise FileNotFoundError(
        "Missing config.local.json. Copy config.example.json -> config.local.json and fill in private values."
    )

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which is not UTF-8 everywhere.
cfg = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))

# Fail early with a message that names the config file and the missing keys,
# rather than a bare KeyError on the first lookup below.
_REQUIRED_KEYS = ("BASE_URL", "FLOORPLANS_PATH", "OUTPUT_CSV")
_missing_keys = [k for k in _REQUIRED_KEYS if k not in cfg]
if _missing_keys:
    raise KeyError(
        f"config.local.json is missing required key(s): {', '.join(_missing_keys)}"
    )

BASE_URL = cfg["BASE_URL"]
# NOTE(review): assumes FLOORPLANS_PATH starts with "/" — confirm against config.example.json.
FLOORPLANS_URL = BASE_URL.rstrip("/") + cfg["FLOORPLANS_PATH"]
OUTPUT_CSV = cfg["OUTPUT_CSV"]

# Shared HTTP session used by every fetch in this notebook.
scraper = cloudscraper.create_scraper(
    browser={"browser": "chrome", "platform": "darwin", "desktop": True}
)
# Keep whatever User-Agent the scraper picked; fall back to a generic one if none is set.
scraper.headers.update({"User-Agent": scraper.headers.get("User-Agent", "Mozilla/5.0")})

# Helper functions

In [None]:
def fetch_soup(url, timeout=DEFAULT_TIMEOUT, max_retries=3):
    """
    Fetch `url` with the shared `scraper` session and parse the body as HTML.

    Timeouts, connection errors and chunked-encoding errors are retried up to
    `max_retries` times, sleeping with a capped exponential backoff plus
    random jitter between attempts. Other exceptions (including the HTTPError
    raised by `raise_for_status()`) are not caught here and propagate to the
    caller on the first occurrence.

    Args:
        url: Page to fetch.
        timeout: Passed straight to `scraper.get()`; defaults to
            DEFAULT_TIMEOUT, a (connect, read) tuple.
        max_retries: Maximum number of attempts.

    Returns:
        BeautifulSoup document parsed with "html.parser".

    Raises:
        RuntimeError: if every attempt failed with a retryable error. The
            last underlying exception is attached as the cause.
    """
    last_err = None

    for attempt in range(1, max_retries + 1):
        try:
            resp = scraper.get(
                url,
                timeout=timeout,
            )
            resp.raise_for_status()
            return BeautifulSoup(resp.text, "html.parser")

        except (requests.exceptions.Timeout,
                requests.exceptions.ConnectionError,
                requests.exceptions.ChunkedEncodingError) as e:
            last_err = e
            if attempt >= max_retries:
                # No further attempt will follow, so sleeping would only delay the error.
                print(f"[fetch_soup] attempt {attempt}/{max_retries} failed for {url}: {e}. Giving up.")
                break
            # bounded retries + backoff so you don't hammer the site
            sleep_s = min(2 ** attempt, 10) + random.random()
            print(f"[fetch_soup] attempt {attempt}/{max_retries} failed for {url}: {e}. Sleeping {sleep_s:.1f}s")
            time.sleep(sleep_s)

    # Chain the last network error so the traceback shows the root cause.
    raise RuntimeError(
        f"fetch_soup failed after {max_retries} attempts for {url}: {last_err}"
    ) from last_err


def get_floorplan_urls() -> dict:
    """
    From the main floorplans page, find each div.fp-container block, take the
    floorplan code from the first word of its <h3> (lower-cased), and build
    that floorplan's page URL by appending the code to FLOORPLANS_URL.

    Blocks with no <h3>, or whose <h3> contains no text, are skipped.

    Returns: {floorplan_code: url}
    """
    soup = fetch_soup(FLOORPLANS_URL)
    # Avoid a double slash if the configured URL already ends with "/".
    base = FLOORPLANS_URL.rstrip("/")
    floorplan_urls = {}

    # Each floorplan is in a div.fp-container
    for fp_div in soup.find_all("div", class_="fp-container"):
        # Get floorplan name/code from the <h3>
        h3 = fp_div.find("h3")
        if h3 is None:
            continue

        # An empty heading has no first word; skip it instead of raising IndexError.
        words = (h3.get_text() or "").lower().split()
        if not words:
            continue

        code = words[0]
        floorplan_urls[code] = f"{base}/{code}"

    return floorplan_urls


# Cell text has the form "Label:<punctuation>VALUE" (e.g. "Apartment:#PBA-1302",
# "Rent:$2,535"). Both patterns skip the colon and any non-word characters
# after it, then capture the value; the second also allows commas.
_UNIT_ID_RE = re.compile(r':\W*([A-Za-z0-9-]+)')
_CELL_VALUE_RE = re.compile(r':\W*([A-Za-z0-9,-]+)')


def _extract_cell_value(card, css_class, pattern):
    """Return the text captured by `pattern` from the card's <td class=css_class> cell."""
    cell = card.find("td", class_=css_class)
    if cell is None:
        raise RuntimeError(f"Expected <td class='{css_class}'> not found — CSS may have changed")

    raw = cell.get_text(strip=True)
    match = pattern.search(raw)
    if match is None:
        # Report the offending text instead of failing with an AttributeError on None.
        raise RuntimeError(
            f"Could not parse a value from <td class='{css_class}'> text {raw!r} — page format may have changed"
        )
    return match.group(1)


def parse_units_from_floorplan(code: str, url: str):
    """
    On a floorplan page like /floorplans/s01, extract one row per
    <tr class="unit-container"> with:
      - unit id, e.g. "PBA-1203"
      - square footage, e.g. "513"
      - rent, e.g. "2535"
      - availability text

    Values are kept as strings; commas are removed from sqft, rent and
    availability.

    Args:
        code: Floorplan code, stored in each row's "floorplan" field.
        url: Floorplan page to fetch, stored in each row's "url" field.

    Returns:
        List of dicts with keys: date, floorplan, unit, price, availability,
        url, sqft.

    Raises:
        RuntimeError: if an expected cell is missing or its text does not
            match the expected "Label:value" shape.
    """
    soup = fetch_soup(url)
    # Every row from this page shares one scrape date.
    scraped_on = date.today().isoformat()
    units = []

    # Each unit listing = <tr class="unit-container">
    for card in soup.find_all("tr", class_="unit-container"):
        # 1. UNIT ID — "Apartment:#PBA-1302" -> "PBA-1302"
        unit_id = _extract_cell_value(card, "td-card-name", _UNIT_ID_RE)

        # 2. SQ FT — "Sq. Ft.:513" -> "513"
        sqft_val = _extract_cell_value(card, "td-card-sqft", _CELL_VALUE_RE).replace(",", "")

        # 3. PRICE — "Rent:$2,535" -> "2535"
        rent_val = _extract_cell_value(card, "td-card-rent", _CELL_VALUE_RE).replace(",", "")

        # 4. AVAILABILITY
        # NOTE(review): the capture stops at the first character outside
        # [A-Za-z0-9,-], so a value containing "/" or spaces would be
        # truncated — confirm against the live page's availability format.
        availability_val = _extract_cell_value(
            card, "td-card-available", _CELL_VALUE_RE
        ).replace(",", "")

        units.append(
            {
                "date": scraped_on,
                "floorplan": code,
                "unit": unit_id,
                "price": rent_val,
                "availability": availability_val,
                "url": url,
                "sqft": sqft_val,
            }
        )

    return units

def ensure_newline(csv_path):
    """
    Make sure an existing, non-empty file ends with a newline byte.

    Does nothing if `csv_path` does not exist or the file is empty; otherwise
    appends a single b"\\n" when the last byte is not already b"\\n".

    Args:
        csv_path: Path of the file to check.
    """
    if not os.path.exists(csv_path):
        return
    with open(csv_path, "rb+") as f:
        # An empty file has no last byte: seeking to -1 from the end would
        # raise OSError, and there is no partial line to terminate anyway.
        if f.seek(0, os.SEEK_END) == 0:
            return
        f.seek(-1, os.SEEK_END)
        last = f.read(1)
        if last != b"\n":
            f.write(b"\n")


def append_to_csv(rows, csv_path=OUTPUT_CSV):
    """
    Append scraped rows to the CSV at `csv_path`, creating it if needed.

    A header row is written only when the file is missing or empty. If the
    file already has content, a missing trailing newline is repaired first so
    the new rows start on their own line.

    Args:
        rows: List of dicts with exactly the keys date, floorplan, unit,
            price, availability, url, sqft. Extra keys raise ValueError via
            `extrasaction="raise"`.
        csv_path: Destination file; defaults to OUTPUT_CSV.
    """
    if not rows:
        print("No rows to append.")
        return

    fieldnames = ["date", "floorplan", "unit", "price", "availability", "url", "sqft"]

    # Treat a zero-byte file like a missing one: it still needs a header.
    has_content = os.path.exists(csv_path) and os.path.getsize(csv_path) > 0
    if has_content:
        # Repair the trailing newline BEFORE opening the append handle, so the
        # check never runs while this function's own writes are still
        # buffered and never sees a just-created zero-byte file.
        ensure_newline(csv_path)

    with open(csv_path, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="raise")
        if not has_content:
            writer.writeheader()
        writer.writerows(rows)

    print(f"Appended {len(rows)} rows to {csv_path}")


# Run scrape and see results

In [None]:
from pprint import pprint

# Step 1: discover the floorplan pages and show the code -> URL mapping.
floorplan_urls = get_floorplan_urls()
print("Found floorplans and URLs:")
pprint(floorplan_urls)

# Step 2: visit each floorplan page in turn and pool its unit rows.
all_rows = []
for code in floorplan_urls:
    url = floorplan_urls[code]
    print(f"\nScraping units for floorplan {code} → {url}")
    units = parse_units_from_floorplan(code, url)
    time.sleep(1)  # fixed pause after each floorplan page
    print(f"  Found {len(units)} units")
    all_rows += units

# Step 3: report the total and append everything to the CSV.
print(f"\nTotal rows collected: {len(all_rows)}")
append_to_csv(all_rows)


# Explore data

In [None]:
# Tabulate the rows collected in this run (all_rows) for a quick look.
df = pd.DataFrame(all_rows)
# Bare expression as the last line of the cell, so the notebook renders the frame.
df

# Plot

In [None]:
import plotly.express as px

In [None]:
# Load the saved CSV and drop rows that are exact duplicates of an earlier row.
# NOTE(review): this absolute path is machine-specific; presumably it is the
# same file the scrape appends to (OUTPUT_CSV) — confirm, and consider reading
# OUTPUT_CSV here instead.
df_all = pd.read_csv("/Users/Paul/Projects/GitHub/personal-projects/projects/apt_scrape/park_bayonne_prices.csv", index_col=False).drop_duplicates()
# Show the last few rows.
df_all.tail()

In [None]:
# Convert the "date" column from text to pandas datetimes.
df_all["date"] = pd.to_datetime(df_all["date"])

In [None]:
# Price over time, one coloured trace per unit, drawn with both lines and markers.
fig = px.line(df_all, x="date", y="price", color="unit")
fig.update_traces(mode='lines+markers')
fig