In [None]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Forecast page on surf-report.com for the Lacanau spot; this is the page
# downloaded by scrape_to_dataframe().
URL = "https://www.surf-report.com/meteo-surf/lacanau-s1043.html"

# Matches a French weekday name as a whole word, case-insensitively. Used to
# recognise the text nodes that carry a day heading such as "Samedi 22 Octobre".
DAY_RE = re.compile(r"\b(Lundi|Mardi|Mercredi|Jeudi|Vendredi|Samedi|Dimanche)\b", re.IGNORECASE)

def clean(txt: str) -> str:
    """Collapse each run of whitespace in *txt* to one space and trim both ends.

    A falsy *txt* (``None`` or ``""``) yields the empty string.
    """
    raw = txt if txt else ""
    collapsed = re.sub(r"\s+", " ", raw)
    return collapsed.strip()

# Column order of the returned frame. Passed explicitly to the DataFrame
# constructor so that an empty scrape still produces a frame (and a CSV
# header) with the expected schema.
_COLUMNS = ["date", "time", "wave_size", "wind_speed_kmh", "wind_direction"]


def _find_day_blocks(soup):
    """Return ``(date_text, content_tag)`` pairs, one per day heading in *soup*."""
    day_blocks = []
    seen = set()

    for node in soup.find_all(string=True):
        date_txt = clean(node)
        # Day headings look like "Samedi 22 Octobre"; the length window drops
        # longer sentences that merely mention a weekday.
        if not (DAY_RE.search(date_txt) and 10 <= len(date_txt) <= 40):
            continue
        if date_txt in seen:
            continue

        # Climb from the heading's parent through at most 8 ancestor levels
        # until one of them contains the hourly table (div.content).
        container = node.parent
        content = None
        for _ in range(8):
            if container is None:
                break
            content = container.find("div", class_="content")
            if content is not None:
                break
            container = container.parent

        if content is None:
            # Deliberately NOT marked as seen: the same text may occur again
            # later (e.g. as the real heading) next to an actual table.
            continue

        seen.add(date_txt)
        day_blocks.append((date_txt, content))

    if not day_blocks:
        # Fallback if the page structure changed: keep every table, undated.
        day_blocks = [("UNKNOWN_DATE", content) for content in soup.select("div.content")]

    return day_blocks


def _parse_line(line):
    """Return the time/waves/wind fields of one ``div.line`` row, or ``None`` to skip it."""
    # Skip the header row and the tide rows.
    if line.select_one(".entetes") or "tides" in (line.get("class") or []):
        return None

    time_cell = line.select_one("div.cell.date")
    waves_cell = line.select_one("div.cell.waves")
    wind_cell = line.select_one("div.cell.large-bis-bis.with-border")  # "Vent" (wind) column

    if not (time_cell and waves_cell and wind_cell):
        return None

    # Wind speed: text of the first span under div.wind, kept as a string (e.g. "51").
    speed_span = wind_cell.select_one("div.wind span")
    wind_speed = clean(speed_span.get_text()) if speed_span else None

    # Wind direction: taken from the arrow image's alt text, minus its fixed label.
    wind_direction = None
    img = wind_cell.select_one("div.wind.img img")
    if img and img.get("alt"):
        wind_direction = clean(img["alt"]).replace("Orientation vent", "").strip()

    return {
        "time": clean(time_cell.get_text()),
        # Expected form "0.8m - 0.7m"; all spaces are removed.
        "wave_size": clean(waves_cell.get_text()).replace(" ", ""),
        "wind_speed_kmh": wind_speed,
        "wind_direction": wind_direction,
    }


def scrape_to_dataframe(url=None):
    """Download a surf forecast page and return its hourly rows as a DataFrame.

    Args:
        url: Page to scrape. Defaults to the module-level ``URL`` when ``None``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``date``, ``time``,
        ``wave_size``, ``wind_speed_kmh`` and ``wind_direction``, one row per
        hourly line. All values are strings as found on the page (or ``None``
        for a missing wind value). The columns are present even when no row
        could be extracted.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    target = URL if url is None else url
    response = requests.get(target, timeout=30, headers={"User-Agent": "Mozilla/5.0"})
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    rows = []
    for date_txt, content in _find_day_blocks(soup):
        for line in content.select("div.line"):
            fields = _parse_line(line)
            if fields is not None:
                rows.append({"date": date_txt, **fields})

    return pd.DataFrame(rows, columns=_COLUMNS)

# ========= RUN =========
# Fetch and parse the forecast once; `df` is kept as a module-level name.
df = scrape_to_dataframe()

# Preview, as shown in the assignment statement: first 40 rows, then the shape.
print(df.head(n=40))
print("\nShape:", df.shape)

# Export the full table as a CSV file.
df.to_csv("surf.csv", encoding="utf-8", index=False)