In [17]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import cloudscraper
import sys

In [18]:
from bs4 import BeautifulSoup
import os
from tqdm import tqdm
from selenium.common.exceptions import TimeoutException, WebDriverException

In [19]:
def setup_headless_driver():
    """
    Build a headless Chrome WebDriver configured for scraping.

    The driver is started with a desktop user agent, a Spanish-first language
    argument, a fixed 1920x1080 window and several switches intended to make
    the automated browser less conspicuous.

    Returns:
        A started ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for calling ``quit()``.

    Raises:
        Whatever Selenium raises if Chrome cannot be started or the
        post-start setup fails; in the latter case the browser is closed
        before the exception propagates.
    """
    chrome_options = Options()

    # Headless mode - no browser popup
    chrome_options.add_argument('--headless')

    # User agent and language settings
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36')
    # NOTE(review): confirm Chrome honours this switch name; if it does not,
    # the Accept-Language header will not be affected by it.
    chrome_options.add_argument('--accept-language=es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3')

    # Anti-detection settings
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    # Performance settings
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--window-size=1920,1080')

    # Proxy settings (uncomment if needed)
    # chrome_options.add_argument('--proxy-server=http://42.118.0.24:16000')

    driver = webdriver.Chrome(options=chrome_options)

    try:
        # Hide the webdriver property. A plain execute_script() only patches
        # the document that is loaded right now, so the override would vanish
        # on the first driver.get(). Registering the snippet through CDP makes
        # Chrome evaluate it in every new document before page scripts run.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
        )
    except Exception:
        # Do not leave an orphaned Chrome process behind if setup fails.
        driver.quit()
        raise

    return driver

In [20]:
def scrape_page(driver, url, wait_time=3):
    """
    Load ``url`` in the given browser and parse the rendered HTML.

    The function navigates to the page, sleeps for ``wait_time`` seconds,
    then waits up to 10 seconds for a ``<body>`` element before reading the
    page source.

    Args:
        driver: Selenium WebDriver instance used for navigation
        url: Address of the page to fetch
        wait_time: Fixed pause (seconds) after navigation, before the
            explicit wait for ``<body>``

    Returns:
        BeautifulSoup object built with ``html.parser``, or None when any
        error occurs (the error is printed, not raised)
    """
    try:
        driver.get(url)

        # Fixed pause first, then an explicit wait for the body element.
        time.sleep(wait_time)
        body_locator = (By.TAG_NAME, "body")
        WebDriverWait(driver, 10).until(EC.presence_of_element_located(body_locator))

        return BeautifulSoup(driver.page_source, 'html.parser')

    # Handler order matters: the most specific exception is listed first.
    except TimeoutException:
        failure = f"✗ Timeout while loading: {url}"
    except WebDriverException as err:
        failure = f"✗ WebDriver error for {url}: {err}"
    except Exception as err:
        failure = f"✗ Unexpected error for {url}: {err}"

    # Every failure path reports the problem and signals it with None.
    print(failure)
    return None

In [36]:
def save_html_to_file(content, section, page_num, output_dir="film_reviews"):
    """
    Save HTML content to ``<output_dir>/<section>_<page_num>.html`` as UTF-8.

    Args:
        content: The HTML to store. Accepted forms, checked in this order:
            a ``str``; raw ``bytes``/``bytearray`` (decoded as UTF-8, with
            undecodable bytes replaced); an object with a ``text`` attribute;
            an object with a ``content`` bytes attribute and an optional
            ``encoding`` attribute; anything else is converted with ``str()``.
        section: Section identifier, used as the first part of the file name
        page_num: Page number, used as the second part of the file name
        output_dir: Directory to save files (created if missing)

    Returns:
        File path where HTML was saved, or None if extracting or writing the
        content failed (the error is printed).
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    file_path = os.path.join(output_dir, f"{section}_{page_num}.html")

    try:
        if isinstance(content, str):
            # Plain strings are checked first so they are never mistaken for
            # a response-like object.
            html_content = content
        elif isinstance(content, (bytes, bytearray)):
            # str(b"...") would store the Python repr ("b'...'"), not the HTML.
            html_content = bytes(content).decode('utf-8', errors='replace')
        elif hasattr(content, 'text'):
            html_content = content.text
        elif hasattr(content, 'content'):
            # The encoding attribute may exist but be None; fall back to
            # UTF-8 instead of failing inside decode().
            encoding = getattr(content, 'encoding', None) or 'utf-8'
            html_content = content.content.decode(encoding)
        else:
            html_content = str(content)

        # Save HTML content
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(html_content)

        print(f"💾 Saved HTML to: {file_path}")
        return file_path

    except Exception as e:
        print(f"❌ Error saving file: {str(e)}")
        return None

In [None]:
def scrape_with_cloudscraper(url, timeout=15):
    """
    Fetch ``url`` with a freshly created cloudscraper session.

    Args:
        url: Address to request with HTTP GET
        timeout: Seconds to wait for the server before giving up
            (forwarded as the ``timeout`` keyword of the request)

    Returns:
        The response object returned by the session's ``get``. The HTTP
        status is NOT checked here; callers inspect ``status_code``.

    Raises:
        Network errors raised by the underlying session (for example a
        connection reset or a timeout) propagate to the caller.
    """
    scraper = cloudscraper.create_scraper(
        browser={
            'browser': 'chrome',
            'platform': 'windows',
            'mobile': False
        },
        # Per the cloudscraper README, `delay` is the wait before a Cloudflare
        # challenge answer is submitted; it is not a pause between requests.
        delay=10
    )

    # Add headers to the session
    scraper.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept-Language': 'es-ES,es;q=0.9,en;q=0.3',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
    })

    # The timeout must be passed by keyword: Session.get(url, **kwargs) takes
    # no further positional arguments, so get(url, 15) raises TypeError.
    return scraper.get(url, timeout=timeout)

In [31]:
def scrape_filmaffinity_sections_and_pages(film_links, base_url_template, save_html=True, output_dir="film_reviews"):
    """
    Download every review page of every film in ``film_links``.

    For each film the review pages are requested one after another (page 1,
    2, 3, ...) until the server answers 404. The film id is then appended to
    ``<output_dir>/end_reviews`` so the film is skipped on later runs. Pages
    already present on disk are not requested again, which makes the run
    resumable.

    Args:
        film_links: Iterable of film URLs (or file names). The film id is the
            last path component without its extension, e.g. ``film123456``.
        base_url_template: URL template with ``{section}`` and ``{page}``
            placeholders; ``{section}`` receives the film id without its
            ``film`` prefix and ``{page}`` the review page number.
        save_html: Whether to save each downloaded page to disk. When False
            nothing is cached, so an interrupted film restarts from page 1.
        output_dir: Directory holding the saved pages and the
            ``end_reviews`` bookkeeping file.

    Returns:
        None. Results are only written to disk.

    Raises:
        SystemExit: when a page whose <h1> reads "Too many requests" is seen
            (the saved copy of that page is deleted first). Any other
            exception is printed and ends the run without being re-raised.
    """
    os.makedirs(output_dir, exist_ok=True)
    end_reviews_path = os.path.join(output_dir, "end_reviews")

    # Films whose last review page was already reached in a previous run.
    # A set gives constant-time lookups; the file may not exist on a first run.
    finished_films = set()
    if os.path.exists(end_reviews_path):
        with open(end_reviews_path, 'r') as file_end:
            finished_films = set(file_end.read().split('\n'))

    try:
        # Outer loop: one iteration per film
        for link in (pbar := tqdm(list(film_links))):

            film_id = str(link).split('/')[-1].split('.')[0]
            pbar.set_postfix_str(film_id)

            if film_id in finished_films:
                continue

            # Inner loop: walk the review pages of this film until a 404
            review_page_id = 1
            while True:
                page_path = os.path.join(output_dir, f"{film_id}_{review_page_id}.html")

                if os.path.exists(page_path):
                    # Page was downloaded earlier: reuse the cached copy.
                    with open(page_path, 'r', encoding="utf-8") as file:
                        html = file.read()
                else:
                    url = base_url_template.format(section=film_id.removeprefix("film"), page=review_page_id)
                    response = scrape_with_cloudscraper(url)

                    # Error 404, no more reviews for this film
                    if response.status_code == 404:
                        with open(end_reviews_path, 'a') as file_end:
                            file_end.write(f'{film_id}\n')
                        finished_films.add(film_id)
                        break

                    html = response.text

                    # Optionally save HTML to file
                    if save_html:
                        save_html_to_file(html, film_id, review_page_id, output_dir=output_dir)

                soup = BeautifulSoup(html, 'html.parser')
                too_many_requests = any(
                    h1.get_text().strip() == "Too many requests"
                    for h1 in soup.find_all("h1")
                )

                if too_many_requests:
                    print(f"INFO: Too many requests\n")
                    # Delete the rate-limit page so it is not mistaken for a
                    # real review page next time. It only exists on disk when
                    # it was saved or came from the cache.
                    if os.path.exists(page_path):
                        os.remove(page_path)
                    sys.exit()

                review_page_id += 1

    except Exception as e:
        print(f"❌ Error during scraping: {str(e)}")

    finally:
        # No browser is opened by this function; the message is kept as-is.
        print("🔚 Closing browser...")
    

In [24]:
# Scraping inputs - adjust these for your own run

# One link per entry found in the local htmls_film_info directory; the entry
# name is appended to the FilmAffinity base address as-is.
film_links = [
    f"https://www.filmaffinity.com/es/{file_name}"
    for file_name in os.listdir('htmls_film_info')
]
print(f"Number of films: {len(film_links)}")

# Review-page URL template. {section} and {page} are filled in by the scraper.
base_url_template = "https://www.filmaffinity.com/es/reviews/{page}/{section}.html"

# Other templates kept for reference (not used by this run):
# base_url_template = "https://www.filmaffinity.com/es/userratings.php?user_id={section}&p={page}&orderby=rating-date&chv=grid"
# base_url_template = "https://www.filmaffinity.com/es/films.php?section={section}&p={page}"
# base_url_template = "https://www.filmaffinity.com/es/search.php?category={section}&p={page}"


Number of films: 24367


In [39]:
# Launch the crawl over every film link using the review-page URL template.
# NOTE(review): the scraping function defined in this notebook has no return
# statement, so scraped_data is expected to be None - confirm before using it.
print("🚀 Starting the scraping process...", "=" * 50, sep="\n")

scraped_data = scrape_filmaffinity_sections_and_pages(film_links, base_url_template, save_html=True)

🚀 Starting the scraping process...


 52%|█████▏    | 12770/24367 [00:26<00:24, 479.35it/s, film576352] 

❌ Error during scraping: ('Connection aborted.', ConnectionResetError(10054, 'Se ha forzado la interrupción de una conexión existente por el host remoto', None, 10054, None))
🔚 Closing browser...



