In [31]:
import os
import csv
import re
import requests
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime
import sys

# -------------------- CONFIG --------------------
headers = {
    "Content-Type": "application/json",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
    "Access-Control-Allow-Origin": "*"
}

input_csv = "data/annonces_link.csv"    
output_csv = "data/results.csv"          

# -------------------- UTILITIES --------------------
def remove_whitespace(s: str, keep_spaces: bool = True) -> str:
    if not s:
        return ""
    s = re.sub(r"[^\S ]+", " ", s)
    s = re.sub(r" +", " ", s)
    s = re.sub(r"\\", "", s)
    return s.strip()

def get_specific_element(regex: str, text: str):
    if not text:
        return None
    m = re.search(regex, text)
    return m.group(0) if m else None

def safe_text(element, keep_spaces=False):
    if not element:
        return ""
    try:
        e = element.text
    except AttributeError:
        e = element
    return remove_whitespace(e, keep_spaces)

def stringToTimeStamp(date_str:str):
    dt = datetime.strptime(date_str, "%d/%m/%Y %H:%M")
    return int(dt.timestamp())
    

def safe_find_text(soup, tag, attrs=None, keep_spaces=False):
    element = soup.find(tag, attrs=attrs)
    return safe_text(element, keep_spaces) if element else ""

def scrape_annonce(url, htmlContent):
    """
    Scrapes a ParuVendu annonce page:
    - Selenium: handles dynamic content & clicks 'Lire plus'
    - BeautifulSoup: parses static content
    """
    # ---------- SELENIUM ----------
    opt = Options()
    opt.add_argument("start-maximized")
    opt.add_argument("--lang=en-US")

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=opt
    )

    driver.get(url)
    sleep(2)

    # Accept cookies
    try:
        btn = driver.find_element(By.XPATH, "//button[text()='Accepter']")
        driver.execute_script("arguments[0].click();", btn)
        sleep(1)
    except:
        pass

    # Cliquer sur  "Lire plus" si ca exite 
    try:
        read_more = driver.find_element(By.ID, "linkAnnonceTrunc")
        driver.execute_script("arguments[0].click();", read_more)
        sleep(1)
    except:
        pass

    # Extraire la description
    try:
        element = driver.find_element(By.XPATH, "//div[@class='im12_txt_ann im12_txt_ann_auto']")
        description = element.text
    except:
        description = ""

    driver.quit()

    # ---------- BEAUTIFULSOUP ----------
    data = {}
    if htmlContent:
        soup = BeautifulSoup(htmlContent, 'html.parser')
        data["title"] = safe_find_text(soup, "span", {"id": "detail_h1"}, True)
        data["location"] = safe_find_text(soup, "span", {"id": "detail_loc"})
        data["descriptionTitle"] = safe_find_text(soup, "h2", {"class": "autodetail-titre sepdetail14-ssbordure"})
        data["description"] = safe_text(description)
        data["nbp"] = get_specific_element(r"\d+", safe_find_text(soup, "li", {"class": "nbp"}))
        data["surf"] = get_specific_element(r"\d+", safe_find_text(soup, "li", {"class": "surf"}))
        data["prix"] = get_specific_element(r"\d+(?: \d+)*", safe_find_text(soup, "div", {"class": "prixactionalerte-box"}))
        data["enseigneInfosvendeur"] = safe_find_text(soup, "p", {"class": "enseigne-infosvendeur"})
        # data["enseigneInfosvendeur"] = safe_find_text(soup, "p", {"class": "enseigne-infosvendeur"})

        uls = soup.find_all("ul", {"class": "crit-alignbloc"})
        
        for idx, ul in enumerate(uls, 1):
            datas = {}
            li_items = ul.find_all("li")
        
            for li in li_items:
        
                strong = li.find("strong")
                if strong:  
                    # category is inside <strong>
                    category = strong.get_text(strip=True)
                else:
                    # category is plain text before spans
                    category = li.get_text(" ", strip=True).split(" ")[0]
        
                values = [span.get_text(strip=True) for span in li.find_all("span")]
                datas[category] = values
        
        data["agencement"] = ",".join(datas.get("Agencement", []))
        data["general"] = ",".join(datas.get("Général", []))
        data["annexe"] = ",".join(datas.get("Annexes", []))

        data["dependance"] = ",".join(datas.get("Dépendance", []))

        # Tranformer la date en timestamp
        raw_date = datas.get("Mise", [])
        if raw_date:
            date_str = raw_date[0].replace(" à ", " ")
            data["publishedAt"] = stringToTimeStamp(date_str)
        else:
            data["publishedAt"] = None


        data["reference"] =  safe_text(datas.get("Réf.")[0],[])

    return data


In [32]:
import json

errors_csv = "data/results_errors.csv"

# Charger les liens déjà traités depuis results.csv et results_errors.csv
processed_links = set()
results_count = 0  # Nombre réel de lignes dans results.csv

if os.path.exists(output_csv):
    with open(output_csv, "r", encoding="utf-8") as f:
        reader = csv.reader(f)
        for row in reader:
            if row:
                processed_links.add(row[0].strip())
                results_count += 1

# Charger aussi les liens déjà en erreur pour ne pas les retraiter en boucle
errors_count = 0
if os.path.exists(errors_csv):
    with open(errors_csv, "r", encoding="utf-8") as f:
        reader = csv.reader(f)
        for row in reader:
            if row:
                processed_links.add(row[0].strip())
                errors_count += 1

# Charger tous les liens à scraper (on enlève aussi les doublons tout en gardant l'ordre)
if os.path.exists(input_csv):
    with open(input_csv, "r", encoding="utf-8") as f:
        all_links = [line.strip() for line in f if line.strip()]
        all_links = list(dict.fromkeys(all_links))  # supprime les doublons
else:
    print("Fichier CSV d'entrée non trouvé.")
    all_links = []

# Filtrer les liens non traités
unprocessed_links = [link for link in all_links if link not in processed_links]

print(f"Trouvé {results_count} lignes dans results.csv")
print(f"Trouvé {errors_count} erreurs dans results_errors.csv")
print(f"Total: {len(processed_links)} liens uniques déjà traités (succès + erreurs).")
print(f"Reprise avec {len(unprocessed_links)} liens restants à traiter.")

# Ouvrir results CSV en mode ajout
with open(output_csv, "a", encoding="utf-8", newline="") as out_file, \
     open(errors_csv, "a", encoding="utf-8", newline="") as err_file:
    
    writer = csv.writer(out_file)
    err_writer = csv.writer(err_file)

    # Le compteur commence au nombre réel de lignes dans results.csv
    processed_count = results_count

    for link in unprocessed_links:
        processed_count += 1

        try:
            print(f"Traitement: {link}  ({processed_count}/{len(all_links)})")
            response = requests.get(link, headers=headers)
            if response.status_code != 200:
                print(f"Erreur lors du chargement de la page: {response.status_code}")
                # On enregistre l'erreur une seule fois pour ne pas la retraiter
                err_writer.writerow([link, response.status_code, "http_error"])
                err_file.flush()
                processed_links.add(link)
                continue

            content = response.text
            data = scrape_annonce(link, content)

            # Sérialiser les données scrapées en JSON pour éviter les problèmes de séparation de colonnes CSV
            writer.writerow([link, json.dumps(data, ensure_ascii=False)])
            out_file.flush()

            # Marquer comme traité
            processed_links.add(link)

            sleep(2)

        except Exception as e:
            print(f"Erreur lors du traitement {link}: {e}")
            # On log aussi les erreurs Python pour ne pas boucler dessus
            err_writer.writerow([link, "exception", str(e)])
            err_file.flush()
            processed_links.add(link)


Trouvé 1338 lignes dans results.csv
Trouvé 1 erreurs dans results_errors.csv
Total: 1333 liens uniques déjà traités (succès + erreurs).
Reprise avec 14387 liens restants à traiter.
Traitement: https://www.paruvendu.fr/immobilier/vente/appartement/1265798356A1KIVHAP000  (1339/14926)
Traitement: https://www.paruvendu.fr/immobilier/vente/appartement/1286219307A1KIVHAP000  (1340/14926)
Traitement: https://www.paruvendu.fr/immobilier/vente/appartement/1284377599A1KIVHAP000  (1341/14926)
Traitement: https://www.paruvendu.fr/immobilier/vente/appartement/1286524608A1KIVHAP000  (1342/14926)
Traitement: https://www.paruvendu.fr/immobilier/vente/maison/1283970845A1KIVHMN000  (1343/14926)
Traitement: https://www.paruvendu.fr/immobilier/prestige/maison/1286328857A1KIVHMN000  (1344/14926)
Traitement: https://www.paruvendu.fr/immobilier/vente/appartement/1286259065A1KIVHAP000  (1345/14926)
Traitement: https://www.paruvendu.fr/immobilier/vente/appartement/1285652231A1KIVHAP000  (1346/14926)


KeyboardInterrupt: 