In [None]:
import cloudscraper
import sys
from bs4 import BeautifulSoup
import os
from tqdm import tqdm

In [5]:
def save_html_to_file(content, section, page_num, output_dir="film_reviews"):
    """
    Save HTML content to a file, handling different content types and encodings
    
    Args:
        content: HTML content (string) or response object
        section: Section identifier
        page_num: Page number
        output_dir: Directory to save files
    
    Returns:
        File path where HTML was saved
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Create filename
    filename = f"{section}_{page_num}.html"
    file_path = os.path.join(output_dir, filename)
    
    try:
        # If content is a response object, get the text with proper encoding
        if hasattr(content, 'text'):
            html_content = content.text
        elif hasattr(content, 'content'):
            # Try to decode content with the response's encoding or utf-8
            encoding = content.encoding if hasattr(content, 'encoding') else 'utf-8'
            html_content = content.content.decode(encoding)
        else:
            # If it's already a string
            html_content = str(content)
        
        # Save HTML content
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(html_content)
        
        print(f"💾 Saved HTML to: {file_path}")
        return file_path
        
    except Exception as e:
        print(f"❌ Error saving file: {str(e)}")
        return None

In [6]:
def scrape_with_cloudscraper(url):
    scraper = cloudscraper.create_scraper(
        browser={
            'browser': 'chrome',
            'platform': 'windows',
            'mobile': False
        },
        delay=10  # 10 seconds delay between requests
    )

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept-Language': 'es-ES,es;q=0.9,en;q=0.3',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
    }
    
    # Add headers to the session
    scraper.headers.update(headers)
    
    # Make the request with timeout
    response = scraper.get(url, timeout=15)  # 10 seconds timeout for the request

    return response

In [None]:
def scrape_filmaffinity_sections_and_pages(film_links, base_url_template, save_html=True):
    """
    Scrape multiple sections and pages from FilmAffinity
    
    Args:
        sections: List of section identifiers
        pages_per_section: Number of pages to scrape per section
        base_url_template: URL template with placeholders for section and page
        save_html: Whether to save HTML files to disk
    
    Returns:
        Dictionary with scraped data organized by section and page
    """
    
    
    list_end_reviews = open('film_reviews/end_reviews', 'r').read().split('\n')

    try:
        # Outer loop: iterate through sections
        for link in (pbar:=tqdm(list(film_links))):

            film_id = str(link).split('/')[-1].split('.')[0]
            review_page_id = 1

            pbar.set_postfix_str(film_id)

            if film_id in list_end_reviews:
                continue

            # Inner loop: iterate through pages for each section
            while True:
                
                if not os.path.exists(os.path.join(os.getcwd(), "film_reviews", film_id + "_" + str(review_page_id) + ".html")):
                    
                    # Build URL for current section and page
                    url = base_url_template.format(section=film_id.removeprefix("film"), page=review_page_id)

                    # Scrape the page
                    # soup = scrape_page(driver, url)
                    response = scrape_with_cloudscraper(url)

                    soup = BeautifulSoup(response.text)

                    # Error 404, no more reviews for this film
                    if response.status_code == 404:
                        with open("film_reviews/end_reviews", 'a') as file_end:
                            file_end.write(f'{film_id}\n')

                        break


                    # Optionally save HTML to file
                    if save_html:
                        save_html_to_file(response.text, film_id, review_page_id)


                else:
                    with open(f'film_reviews/{film_id}_{review_page_id}.html', 'r', encoding="utf-8") as file:
                        response = file.read()
                    soup = BeautifulSoup(response)


                h1_s = soup.find_all("h1")

                too_many_requests = False
                for h1 in h1_s:
                    # If too many request get a proxy and continue requesting films
                    if h1.get_text() == "Too many requests":
                        too_many_requests = True
                        print(f"INFO: Too many requests\n")
                

                # Delete the 4KB response
                if too_many_requests:
                    os.remove(f'film_reviews/{film_id}_{review_page_id}.html')
                    sys.exit()
                    continue

                review_page_id += 1                    


    except Exception as e:
        print(f"❌ Error during scraping: {str(e)}")
        
    finally:
        # Always close the driver
        print("🔚 Closing browser...")


In [8]:
# Example usage - customize these parameters for your needs

# Define the sections you want to scrape (user IDs or section identifiers)
film_links = ["https://www.filmaffinity.com/es/" + file_name for file_name in os.listdir('htmls_film_info')]
print(f"Number of films: {len(film_links)}")

# URL template - modify this based on the actual FilmAffinity URL structure
# Use {section} and {page} as placeholders
# base_url_template = "https://www.filmaffinity.com/es/userratings.php?user_id={section}&p={page}&orderby=rating-date&chv=grid"
base_url_template = "https://www.filmaffinity.com/es/reviews/{page}/{section}.html"

# Alternative URL templates you might need:
# base_url_template = "https://www.filmaffinity.com/es/films.php?section={section}&p={page}"
# base_url_template = "https://www.filmaffinity.com/es/search.php?category={section}&p={page}"


Number of films: 24367


In [11]:
# Run the scraping process
print("🚀 Starting the scraping process...")
print("=" * 50)

scraped_data = scrape_filmaffinity_sections_and_pages(
    film_links=film_links,
    base_url_template=base_url_template,
    save_html=True
)

🚀 Starting the scraping process...


 55%|█████▍    | 13351/24367 [00:06<00:06, 1735.62it/s, film598949]

💾 Saved HTML to: film_reviews\film598949_22.html
💾 Saved HTML to: film_reviews\film598949_23.html
💾 Saved HTML to: film_reviews\film598949_24.html


 55%|█████▍    | 13351/24367 [00:11<00:06, 1735.62it/s, film598973]

💾 Saved HTML to: film_reviews\film598973_1.html


 55%|█████▍    | 13351/24367 [00:12<00:06, 1735.62it/s, film598983]

💾 Saved HTML to: film_reviews\film598983_1.html


 55%|█████▍    | 13351/24367 [00:13<00:06, 1735.62it/s, film599071]

💾 Saved HTML to: film_reviews\film599071_1.html


 55%|█████▍    | 13351/24367 [00:14<00:06, 1735.62it/s, film599098]

💾 Saved HTML to: film_reviews\film599098_1.html
💾 Saved HTML to: film_reviews\film599098_2.html
💾 Saved HTML to: film_reviews\film599098_3.html
💾 Saved HTML to: film_reviews\film599098_4.html
💾 Saved HTML to: film_reviews\film599098_5.html


 55%|█████▍    | 13351/24367 [00:18<00:06, 1735.62it/s, film599098]

💾 Saved HTML to: film_reviews\film599098_6.html
💾 Saved HTML to: film_reviews\film599098_7.html
💾 Saved HTML to: film_reviews\film599098_8.html
💾 Saved HTML to: film_reviews\film599098_9.html
💾 Saved HTML to: film_reviews\film599098_10.html
💾 Saved HTML to: film_reviews\film599098_11.html
💾 Saved HTML to: film_reviews\film599098_12.html
💾 Saved HTML to: film_reviews\film599098_13.html
💾 Saved HTML to: film_reviews\film599098_14.html
💾 Saved HTML to: film_reviews\film599098_15.html
💾 Saved HTML to: film_reviews\film599098_16.html


 55%|█████▌    | 13412/24367 [00:25<07:12, 25.33it/s, film599100]  

💾 Saved HTML to: film_reviews\film599100_1.html


 55%|█████▌    | 13413/24367 [00:26<07:43, 23.66it/s, film599168]

💾 Saved HTML to: film_reviews\film599168_1.html


 55%|█████▌    | 13413/24367 [00:27<07:43, 23.66it/s, film599174]

💾 Saved HTML to: film_reviews\film599174_1.html
💾 Saved HTML to: film_reviews\film599174_2.html


 55%|█████▌    | 13413/24367 [00:28<07:43, 23.66it/s, film599209]

💾 Saved HTML to: film_reviews\film599209_1.html
💾 Saved HTML to: film_reviews\film599209_2.html
💾 Saved HTML to: film_reviews\film599209_3.html
💾 Saved HTML to: film_reviews\film599209_4.html
💾 Saved HTML to: film_reviews\film599209_5.html


 55%|█████▌    | 13413/24367 [00:31<07:43, 23.66it/s, film599242]

💾 Saved HTML to: film_reviews\film599242_1.html


 55%|█████▌    | 13413/24367 [00:32<07:43, 23.66it/s, film599287]

💾 Saved HTML to: film_reviews\film599287_1.html
💾 Saved HTML to: film_reviews\film599287_2.html


 55%|█████▌    | 13413/24367 [00:34<07:43, 23.66it/s, film599359]

💾 Saved HTML to: film_reviews\film599359_1.html


 55%|█████▌    | 13413/24367 [00:35<07:43, 23.66it/s, film599360]

💾 Saved HTML to: film_reviews\film599360_1.html
💾 Saved HTML to: film_reviews\film599360_2.html
💾 Saved HTML to: film_reviews\film599360_3.html
💾 Saved HTML to: film_reviews\film599360_4.html


 55%|█████▌    | 13413/24367 [00:38<07:43, 23.66it/s, film599360]

💾 Saved HTML to: film_reviews\film599360_5.html
💾 Saved HTML to: film_reviews\film599360_6.html
💾 Saved HTML to: film_reviews\film599360_7.html
💾 Saved HTML to: film_reviews\film599360_8.html
💾 Saved HTML to: film_reviews\film599360_9.html
💾 Saved HTML to: film_reviews\film599360_10.html
💾 Saved HTML to: film_reviews\film599360_11.html
💾 Saved HTML to: film_reviews\film599360_12.html
💾 Saved HTML to: film_reviews\film599360_13.html
💾 Saved HTML to: film_reviews\film599360_14.html
💾 Saved HTML to: film_reviews\film599360_15.html
💾 Saved HTML to: film_reviews\film599360_16.html
💾 Saved HTML to: film_reviews\film599360_17.html
💾 Saved HTML to: film_reviews\film599360_18.html
💾 Saved HTML to: film_reviews\film599360_19.html
💾 Saved HTML to: film_reviews\film599360_20.html
💾 Saved HTML to: film_reviews\film599360_21.html
💾 Saved HTML to: film_reviews\film599360_22.html
💾 Saved HTML to: film_reviews\film599360_23.html
💾 Saved HTML to: film_reviews\film599360_24.html
💾 Saved HTML to: film_rev

 55%|█████▌    | 13420/24367 [00:52<26:36,  6.86it/s, film599408]

💾 Saved HTML to: film_reviews\film599408_1.html
💾 Saved HTML to: film_reviews\film599408_2.html
💾 Saved HTML to: film_reviews\film599408_3.html
💾 Saved HTML to: film_reviews\film599408_4.html
💾 Saved HTML to: film_reviews\film599408_5.html
💾 Saved HTML to: film_reviews\film599408_6.html
💾 Saved HTML to: film_reviews\film599408_7.html
💾 Saved HTML to: film_reviews\film599408_8.html
💾 Saved HTML to: film_reviews\film599408_9.html
💾 Saved HTML to: film_reviews\film599408_10.html
💾 Saved HTML to: film_reviews\film599408_11.html


 55%|█████▌    | 13420/24367 [00:59<00:48, 225.00it/s, film599408]

💾 Saved HTML to: film_reviews\film599408_12.html
INFO: Too many requests

🔚 Closing browser...





SystemExit: 