In [None]:
!pip install selenium


In [None]:
import os
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
import logging
import re

# Set up logging: every record goes both to the file "imdb_scraper.log"
# (relative to the working directory) and to the console stream.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("imdb_scraper.log"),
        logging.StreamHandler()
    ]
)

# Constants
# CSV files that scrape_content reads (to resume) and rewrites after each batch.
MOVIES_CSV = 'indian_movies_ids.csv'
TV_SHOWS_CSV = 'indian_tv_shows_ids.csv'
# Lower/upper bound, in seconds, of the random pause between extraction passes.
DELAY_MIN = 5  
DELAY_MAX = 10 
# Seconds allowed for a page load and for the initial wait on the listing page.
PAGE_LOAD_TIMEOUT = 30  
# Seconds to wait for a "load more" button to become clickable.
LOAD_MORE_TIMEOUT = 15  

def setup_driver():
    """Build a headless Chrome WebDriver with the scraper's standard options.

    Returns:
        A Chrome WebDriver whose page-load timeout is PAGE_LOAD_TIMEOUT seconds.
    """
    # Command-line switches, applied in this order.
    switches = (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        # Additional switches intended to improve stability
        "--disable-extensions",
        "--disable-infobars",
        "--disable-notifications",
    )
    experimental = (
        ("excludeSwitches", ["enable-automation"]),
        ("useAutomationExtension", False),
    )

    opts = Options()
    for switch in switches:
        opts.add_argument(switch)
    for option_name, option_value in experimental:
        opts.add_experimental_option(option_name, option_value)

    browser = webdriver.Chrome(options=opts)
    browser.set_page_load_timeout(PAGE_LOAD_TIMEOUT)
    return browser

def extract_imdb_id(href):
    """Return the IMDb title ID (``tt`` followed by digits) found in *href*.

    Patterns are tried from most to least specific, and the first one that
    matches anywhere in the string wins. Returns None for an empty/None href
    or when no pattern matches.
    """
    if not href:
        return None

    patterns = (
        r'/title/(tt\d+)/',   # standard form: "/title/tt0111161/"
        r'/title/(tt\d+)',    # no trailing slash, e.g. "/title/tt0111161?ref_=..."
        r'(tt\d+)',           # last resort: any "tt" followed by digits
    )
    for pattern in patterns:
        found = re.search(pattern, href)
        if found:
            return found.group(1)

    return None

def extract_imdb_ids(driver):
    """Collect the distinct IMDb IDs linked from the page currently loaded.

    Scans every anchor whose href contains '/title/' and returns the IDs in
    first-seen order. A failure on a single link is logged and skipped; a
    failure of the whole lookup is logged and yields an empty list.
    """
    try:
        anchors = driver.find_elements(By.CSS_SELECTOR, "a[href*='/title/']")

        # Dict keys keep insertion order and give de-duplication for free.
        ordered_ids = {}
        for anchor in anchors:
            try:
                title_id = extract_imdb_id(anchor.get_attribute("href"))
                if title_id and title_id not in ordered_ids:
                    ordered_ids[title_id] = True
            except Exception as e:
                logging.error(f"Error processing link: {e}")

        return list(ordered_ids)
    except Exception as e:
        logging.error(f"Error extracting IMDb IDs: {e}")
        return []

def get_current_ids_count(driver):
    """Return how many distinct IMDb IDs are linked from the current page.

    Args:
        driver: Selenium WebDriver with the listing page loaded.

    Returns:
        int: number of unique IDs found among anchors whose href contains
        '/title/'; 0 if the lookup as a whole fails (the error is logged).
    """
    unique_ids = set()

    try:
        links = driver.find_elements(By.CSS_SELECTOR, "a[href*='/title/']")
        for link in links:
            try:
                imdb_id = extract_imdb_id(link.get_attribute("href"))
            except Exception:
                # Best effort: a link that cannot be read (e.g. it went stale)
                # is skipped. Catching Exception rather than using a bare
                # except keeps Ctrl-C / SystemExit working mid-scrape.
                continue
            if imdb_id:
                unique_ids.add(imdb_id)

        return len(unique_ids)
    except Exception as e:
        logging.error(f"Error counting IDs: {e}")
        return 0

def scrape_content(driver, content_type):
    """
    Scrape IMDb IDs for the given content type (movie or tv).

    Loads the IMDb search listing for the content type, then loops: harvest the
    title IDs on the page, append unseen ones to the tracking CSV, and try to
    reveal more results (button click first, scrolling as a fallback).
    Continues until no more content can be loaded.

    Args:
        driver: Selenium WebDriver used to load and interact with the page.
        content_type: 'movie' selects the feature-film search and MOVIES_CSV;
            any other value selects the TV search and TV_SHOWS_CSV.

    Returns:
        int: number of rows in the tracking dataframe (IDs loaded from an
        existing CSV plus IDs added during this run). Errors raised while
        scraping are logged, not propagated.
    """
    if content_type == 'movie':
        csv_file = MOVIES_CSV
        url = 'https://www.imdb.com/search/title/?title_type=feature&release_date=2023-09-01,&num_votes=49999,&sort=num_votes,desc,&view=simple'
    else:
        csv_file = TV_SHOWS_CSV
        url = 'https://www.imdb.com/search/title/?title_type=tv_series,tv_miniseries&release_date=2018-01-01,&num_votes=8000,&sort=num_votes,desc,&view=simple'

    # Load or create dataframe for tracking progress
    if os.path.exists(csv_file):
        df = pd.read_csv(csv_file)
        logging.info(f"Found existing file {csv_file} with {len(df)} entries")
    else:
        df = pd.DataFrame(columns=['imdb_id'])
        logging.info(f"Starting new scrape for {content_type}")

    # Start the scraping process
    try:
        driver.get(url)
        WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".lister-list, body"))
        )

        last_count = 0
        no_new_content_count = 0

        while True:
            # Extract IMDb IDs from current page
            logging.info("Extracting IMDb IDs from current page view")
            new_imdb_ids = extract_imdb_ids(driver)

            if new_imdb_ids:
                # Create a set of IDs we already have
                existing_ids = set(df['imdb_id']) if not df.empty else set()
                new_ids = [imdb_id for imdb_id in new_imdb_ids if imdb_id not in existing_ids]

                if new_ids:
                    new_df = pd.DataFrame({'imdb_id': new_ids})
                    df = pd.concat([df, new_df], ignore_index=True)
                    df = df.drop_duplicates(subset=['imdb_id'])
                    # Persist after every batch so an interrupted run can resume.
                    df.to_csv(csv_file, index=False)
                    logging.info(f"Saved {len(df)} entries to {csv_file} (added {len(new_ids)} new IDs)")
                else:
                    logging.info("No new IMDb IDs found in this batch")
            else:
                logging.warning("No IMDb IDs found in current view")

            # Get current count of IDs on page
            current_count = get_current_ids_count(driver)
            logging.info(f"Current unique IDs on page: {current_count}")

            # Check if we've reached the end: three consecutive passes with an
            # unchanged ID count means nothing more is being loaded.
            if current_count == last_count:
                no_new_content_count += 1
                if no_new_content_count >= 3:
                    logging.info("No new content after multiple attempts, reached the end")
                    break
            else:
                no_new_content_count = 0
                last_count = current_count

            # Try to load more content.
            # Must be reset on every pass: previously it was only assigned when
            # a "50 more" button was clicked or an exception occurred, so a
            # clickable button without that text left it unbound (first pass)
            # or carrying the previous pass's value.
            load_more_found = False
            try:
                # First, look for the specific "50 more" button
                try:
                    WebDriverWait(driver, LOAD_MORE_TIMEOUT).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, "button.ipc-btn.ipc-see-more__button"))
                    )
                    # Find the button containing the text "50 more"
                    buttons = driver.find_elements(By.CSS_SELECTOR, "button.ipc-btn.ipc-see-more__button")
                    for button in buttons:
                        if "50 more" in button.text:
                            driver.execute_script("arguments[0].scrollIntoView(true);", button)
                            time.sleep(1)  # Give time for scroll to complete
                            # Click via JavaScript rather than WebElement.click()
                            driver.execute_script("arguments[0].click();", button)
                            logging.info("Clicked '50 more' button")
                            time.sleep(5)  # Wait for content to load
                            load_more_found = True
                            break
                except Exception as e:
                    logging.warning(f"Could not find the specific '50 more' button: {e}")
                    load_more_found = False

                # If the specific button wasn't found, try alternative selectors
                if not load_more_found:
                    # "button:contains('50 more')" was dropped from this list:
                    # :contains() is jQuery syntax, not CSS, so it could never
                    # match through By.CSS_SELECTOR.
                    load_more_selectors = [
                        "span.ipc-see-more button",
                        ".ipc-see-more__button",
                        ".single-page-see-more-button button",
                        "button.ipc-btn--single-padding",
                        "button.ipc-see-more__button"
                    ]

                    for selector in load_more_selectors:
                        try:
                            load_more = WebDriverWait(driver, LOAD_MORE_TIMEOUT).until(
                                EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
                            )
                            driver.execute_script("arguments[0].scrollIntoView(true);", load_more)
                            time.sleep(1)  # Give time for scroll to complete
                            driver.execute_script("arguments[0].click();", load_more)
                            load_more_found = True
                            logging.info(f"Clicked 'Load more' button using selector: {selector}")

                            # Wait for new content to load
                            time.sleep(5)
                            break
                        except Exception:
                            # This selector did not yield a clickable element
                            # in time; move on to the next candidate.
                            continue

                if not load_more_found:
                    logging.info("Could not find 'Load more' button, trying JavaScript scroll")
                    # Try scrolling to bottom to trigger lazy loading
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(5)

                    # Check if more content loaded after scroll
                    pre_scroll_count = current_count
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(5)
                    post_scroll_count = get_current_ids_count(driver)

                    if post_scroll_count <= pre_scroll_count:
                        logging.info("No more content loaded after scrolling, attempting one more aggressive scroll")
                        # Try one more aggressive scroll
                        for _ in range(5):
                            driver.execute_script("window.scrollBy(0, 1000);")
                            time.sleep(1)

                        final_count = get_current_ids_count(driver)
                        if final_count <= post_scroll_count:
                            logging.info("No more content loaded after aggressive scrolling, stopping")
                            break
            except Exception as e:
                logging.error(f"Error attempting to load more content: {e}")
                if no_new_content_count >= 2:
                    logging.info("Multiple failures to load new content, stopping")
                    break
            # Random delay between attempts
            delay = random.uniform(DELAY_MIN, DELAY_MAX)
            logging.info(f"Waiting {delay:.2f} seconds before next extraction")
            time.sleep(delay)

    except Exception as e:
        logging.error(f"Error during content scraping: {e}")

    return len(df)

def main():
    """Run the ID scraper: start Chrome, scrape movies, always quit the driver.

    Any exception (driver start-up, scraping, or driver shutdown) is logged
    rather than propagated.
    """
    def _run(browser):
        # Scrape movies
        logging.info("Starting to scrape Indian movies")
        movie_total = scrape_content(browser, 'movie')

        # TV-show scraping is currently disabled; it would call
        # scrape_content(browser, 'tv') and log its count here.

        # Final summary
        logging.info(f"Completed scraping:")
        logging.info(f"Movies: {movie_total} unique IMDb IDs")

    logging.info("Starting IMDb scraper for Indian content")

    try:
        browser = setup_driver()
        try:
            _run(browser)
        finally:
            # Always close the driver when done
            browser.quit()
    except Exception as exc:
        logging.error(f"Unhandled exception in main: {exc}")

# Entry point: run the ID scraper when this code is executed as the main
# module (which includes executing the cell in a notebook kernel).
if __name__ == "__main__":
    main()

In [6]:

def compare_and_add_unique_ids(csv1_path, csv2_path, output_csv_path):
    """
    Write the 'imdb_id' values that appear in the second CSV but not in the first.

    Each distinct ID is written once, in the order it first appears in the
    second file, so the output is reproducible between runs (iterating a set
    of strings is not).

    Args:
        csv1_path: CSV of already-known IDs; must have an 'imdb_id' column.
        csv2_path: CSV of candidate IDs; must have an 'imdb_id' column.
        output_csv_path: Destination CSV with a single 'imdb_id' column.

    Raises:
        KeyError: if either input lacks an 'imdb_id' column.
        Whatever pandas.read_csv / DataFrame.to_csv raise on I/O problems.
    """
    # Read the CSV files into pandas DataFrames
    df1 = pd.read_csv(csv1_path)
    df2 = pd.read_csv(csv2_path)

    # A set gives O(1) membership tests for the already-known IDs
    known_ids = set(df1['imdb_id'])

    # De-duplicate the candidates, keeping their first-seen order
    candidates = df2['imdb_id'].drop_duplicates()
    unique_ids = candidates[~candidates.isin(known_ids)]

    # Save the unique IDs to a new CSV file
    unique_ids_df = pd.DataFrame({'imdb_id': unique_ids.tolist()})
    unique_ids_df.to_csv(output_csv_path, index=False)


# Example usage (runs as soon as the cell executes): writes the IDs that are
# in indian_tv_shows_ids.csv but not in tv_datasets.csv to 'uniqu_ids.csv'
# in the current working directory.
compare_and_add_unique_ids('/content/tv_datasets.csv', '/content/indian_tv_shows_ids.csv', 'uniqu_ids.csv')


In [None]:
import csv
import time
import random
import re
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Constants
# Site root; title pages are built as f"{BASE_URL}/title/{imdb_id}/".
BASE_URL = "https://www.imdb.com"
# Default number of page-load attempts in get_page_with_retry.
MAX_RETRIES = 3
# (min, max) seconds for the random pause between titles in main().
DELAY_BETWEEN_REQUESTS = (2, 5)  
# Keys of the per-title details dict and columns of the output CSV.
COLUMNS = [
    "imdb_id", "title", "vote_average", "vote_count", "release_date",
    "original_language", "overview", "popularity", "genres",
    "production_companies", "release_year", "cast",
    "budget", "collection", "directors"
]

def try_multiple_selectors(driver, selectors, attribute=None):
    """Return results for the first CSS selector that matches any element.

    With *attribute* set, the result is the list of truthy attribute values;
    otherwise it is the list of matched elements whose text is non-blank.
    The first selector that matches decides the result, even when filtering
    leaves it empty. Selectors that raise are skipped. Returns [] when no
    selector matches.
    """
    for css in selectors:
        try:
            found = driver.find_elements(By.CSS_SELECTOR, css)
            if not found:
                continue

            if attribute:
                values = []
                for node in found:
                    if node.get_attribute(attribute):
                        values.append(node.get_attribute(attribute))
                return values

            return [node for node in found if node.text.strip()]
        except Exception as e:
            continue
    return []

def get_element_text(driver, selectors, first_only=True):
    """Return stripped text from the elements found by try_multiple_selectors.

    With first_only=True the result is a single string ("" when nothing
    matched); otherwise it is a list of strings ([] when nothing matched).
    """
    matches = try_multiple_selectors(driver, selectors)

    if first_only:
        return matches[0].text.strip() if matches else ""

    return [match.text.strip() for match in matches]

def get_page_with_retry(driver, url, max_retries=MAX_RETRIES):
    """Load *url* in *driver*, retrying on failure.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address to open.
        max_retries: Maximum number of load attempts.

    Returns:
        bool: True as soon as one attempt loads the page and a <body> element
        is present; False when every attempt failed (or max_retries <= 0).
    """
    for attempt in range(1, max_retries + 1):
        try:
            driver.get(url)
            # Short random settle time before checking for the body element
            time.sleep(random.uniform(2, 4))
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            return True
        except Exception as e:
            print(f"Attempt {attempt} failed for {url}: {e}")
            # Back off before the next attempt; sleeping after the final
            # attempt would only delay the failure result.
            if attempt < max_retries:
                time.sleep(random.uniform(5, 10))  # Longer wait after failure
    return False

def extract_numeric_value(text):
    """Convert a count such as "1.2K", "3.5M" or "12,345" to a digit string.

    Args:
        text: Raw text scraped from the page (may be None or empty).

    Returns:
        str: "" for empty input; for a number followed by a K or M suffix, the
        expanded integer as a string ("1.2K" -> "1200"); for text containing
        neither "K" nor "M", the stripped text with commas removed; "" when
        the text contains "K" or "M" but is not a suffixed number
        (e.g. "Metascore"), so callers can fall back to another source
        instead of hitting a ValueError.
    """
    if not text:
        return ""
    text = text.strip()

    if "K" not in text and "M" not in text:
        return text.replace(",", "")

    match = re.fullmatch(r'(\d[\d,]*(?:\.\d+)?|\.\d+)\s*([KM])', text)
    if not match:
        return ""

    number = float(match.group(1).replace(",", ""))
    multiplier = 1_000 if match.group(2) == "K" else 1_000_000
    # round(), not int(): binary float error can leave the product a hair
    # below the intended integer, and int() would truncate it.
    return str(round(number * multiplier))

def clean_money_value(value):
    """Strip parenthesised notes from a money string and validate it.

    Returns the cleaned string when it looks like an amount (a currency
    symbol followed by digits, or digits followed by a three-letter code or
    a currency symbol); otherwise returns "".
    """
    if value:
        # Drop parenthesised qualifiers such as "(estimated)"
        stripped = re.sub(r'\(.*?\)', '', value).strip()
        looks_monetary = re.search(
            r'[₹$€£][\d,.]+|[\d,.]+\s*(?:[a-zA-Z]{3}|[₹$€£])', stripped
        )
        return stripped if looks_monetary else ""

    return ""

def get_title_details(imdb_id, driver):
    """Scrape details for a specific title

    Loads the IMDb title page for *imdb_id* in *driver* and fills a dict keyed
    by COLUMNS. Every field starts as "" and stays "" when it cannot be found.

    Args:
        imdb_id: IMDb title identifier, interpolated into the title URL.
        driver: Selenium WebDriver used to load and query the page.

    Returns:
        dict: one entry per name in COLUMNS. If the page cannot be loaded, only
        "imdb_id" is populated. If an unexpected error occurs mid-extraction it
        is printed and the fields gathered so far are returned.
    """
    details = {col: "" for col in COLUMNS}
    details["imdb_id"] = imdb_id
    title_url = f"{BASE_URL}/title/{imdb_id}/"

    success = get_page_with_retry(driver, title_url)
    if not success:
        return details

    # Extract basic details from the current page
    try:
        # NOTE(review): selectors ending in "-*" (here and below) do not look
        # like valid CSS; presumably they raise inside try_multiple_selectors
        # and are skipped there - confirm.
        title_selectors = [
            'h1[data-testid="hero-title-block__title"]',
            'h1.TitleHeader__TitleText-sc-*',
            'h1.hero__title',
            'span[class*="hero-title-block__title"]'
        ]
        details["title"] = get_element_text(driver, title_selectors)

        # If title not found, look for it in page title
        # (expected shape: "<title> (<year>) - IMDb", year optional)
        if not details["title"]:
            try:
                page_title = driver.title
                title_match = re.match(r'(.+?)(?:\s*\([0-9]{4}\))?\s*-\s*IMDb', page_title)
                if title_match:
                    details["title"] = title_match.group(1).strip()
            except:
                pass

        # Rating (vote_average) - fix formatting issues
        rating_selectors = [
            'div[data-testid="hero-rating-bar__aggregate-rating__score"] span:first-child',
            'span[class*="RatingScore"]',
            'span.ipc-rating-star--imdb',
            'span.AggregateRatingButton__RatingScore-*'
        ]
        rating = get_element_text(driver, rating_selectors)

        # Clean up rating (remove "/10" if present)
        if rating:
            rating = rating.replace('/10', '').strip()
            # Make sure it's a valid number; keep the original string form
            try:
                float(rating)
                details["vote_average"] = rating
            except:
                details["vote_average"] = ""

        # Vote count - fix empty vote counts
        vote_count_selectors = [
            'div[class*="AggregateRatingButton__TotalRatingAmount"]',
            'div.sc-d541859f-3',
            'div.imdbRating strong',
            'span.ipc-rating-star--imdb + div',
            'a[href*="ratings"] span'
        ]
        vote_count = get_element_text(driver, vote_count_selectors)
        # NOTE(review): extract_numeric_value raises ValueError for text that
        # contains "K"/"M" but is not a number; that would jump to the outer
        # except and skip every field below.
        details["vote_count"] = extract_numeric_value(vote_count)

        # If vote count still empty, try another approach:
        # the first element whose text contains "votes"
        if not details["vote_count"]:
            try:
                ratings_text = driver.find_element(By.XPATH, "//*[contains(text(), 'votes')]").text
                votes_match = re.search(r'([\d,.]+)\s*votes', ratings_text)
                if votes_match:
                    details["vote_count"] = votes_match.group(1).replace(',', '')
            except:
                pass

        # Release date (stored as the raw text shown on the page)
        release_date_selectors = [
            'li[data-testid="title-details-releasedate"] .ipc-metadata-list-item__list-content-item',
            'a[href*="releaseinfo"]',
            'span[class*="TitleBlockMetaData__ListItemText"]'
        ]
        release_date = get_element_text(driver, release_date_selectors)
        details["release_date"] = release_date

        # Release year: first four-digit run in the release date text
        if release_date:
            year_match = re.search(r'(\d{4})', release_date)
            if year_match:
                details["release_year"] = year_match.group(1)

        # If year not found in release date, try other selectors
        if not details["release_year"]:
            year_selectors = [
                'span.TitleBlockMetaData__ListItemText-*',
                'a[href*="releaseinfo"]',
                'span.TitleBlockMetaData__ReleaseYear-*',
                'a.ipc-link[href*="releaseinfo"]'
            ]
            year_text = get_element_text(driver, year_selectors)
            if year_text:
                year_match = re.search(r'(\d{4})', year_text)
                if year_match:
                    details["release_year"] = year_match.group(1)

        # If still not found, check page title for "(YYYY)"
        if not details["release_year"]:
            try:
                page_title = driver.title
                year_match = re.search(r'\((\d{4})\)', page_title)
                if year_match:
                    details["release_year"] = year_match.group(1)
            except:
                pass

        # Overview/Plot
        try:
            plot_elem = driver.find_element(By.CSS_SELECTOR, 'span[class*="GenresAndPlot__TextContainerBreakpointXL"]')
            details["overview"] = plot_elem.text.strip()
        except NoSuchElementException:
            # Try alternative selectors
            try:
                plot_elem = driver.find_element(By.CSS_SELECTOR, 'p[data-testid="plot"]')
                details["overview"] = plot_elem.text.strip()
            except:
                pass

        # Genres
        genre_selectors = [
            'div[data-testid="genres"] span.ipc-chip__text',
            'a.GenresAndPlot__GenreChip-*',
            '.ipc-chip-list--baseAlt span.ipc-chip__text',
            'a[href*="genres="] span.ipc-chip__text'
        ]

        # Use the first selector that yields at least one non-blank chip;
        # chips whose text contains "more" are dropped.
        genres = []
        for selector in genre_selectors:
            try:
                elements = driver.find_elements(By.CSS_SELECTOR, selector)
                if elements:
                    genres = [elem.text.strip() for elem in elements if elem.text.strip() and not "more" in elem.text.lower()]
                    if genres:
                        break
            except:
                continue

        details["genres"] = ", ".join(genres)

        # Language (only the first matching element's text is kept)
        lang_selectors = [
            'li[data-testid="title-details-languages"] .ipc-metadata-list-item__list-content-item',
            'a[href*="primary_language"]'
        ]
        details["original_language"] = get_element_text(driver, lang_selectors)

        # Production companies
        company_selectors = [
            'li[data-testid="title-details-companies"] .ipc-metadata-list-item__list-content-item',
            'a[href*="company"]'
        ]

        companies = []
        for selector in company_selectors:
            try:
                elements = driver.find_elements(By.CSS_SELECTOR, selector)
                if elements:
                    companies = [elem.text.strip() for elem in elements if elem.text.strip()]
                    if companies:
                        break
            except:
                continue

        details["production_companies"] = ", ".join(companies)

        # Cast (at most the first 10 matched elements are considered)
        # NOTE(review): the first and third selectors are identical.
        cast_selectors = [
            'a[data-testid="title-cast-item__actor"]',
            'td.primary_photo + td a',
            'a[data-testid="title-cast-item__actor"]'
        ]

        cast = []
        for selector in cast_selectors:
            try:
                elements = driver.find_elements(By.CSS_SELECTOR, selector)
                if elements:
                    cast = [elem.text.strip() for elem in elements[:10] if elem.text.strip()]
                    if cast:
                        break
            except:
                continue

        details["cast"] = ", ".join(cast)


        # NOTE(review): this nested definition shadows the module-level
        # clean_money_value for the budget/collection calls below.
        def clean_money_value(value):
          """Normalise a money string to a rupee-formatted amount.

          Parenthesised notes are removed. A leading '$' is converted with a
          fixed multiplier of 83; a leading '₹' is kept as is; values starting
          with '€' or '£' are returned unconverted. Any other text that
          contains a number is formatted as rupees with multiplier 1. Returns
          "" for empty input or when no number is found.
          """
          if not value:
              return ""

          value = value.strip()
          value = re.sub(r'\(.*?\)', '', value).strip()
          multiplier = 1
          # `currency` is assigned below but never read.
          currency = ""

          # Check and extract currency
          if value.startswith('$'):
              currency = 'USD'
              value = value.replace('$', '')
              # Hard-coded conversion factor applied to '$' amounts
              multiplier = 83 
          elif value.startswith('₹'):
              currency = 'INR'
              value = value.replace('₹', '')
              multiplier = 1
          elif value.startswith('€') or value.startswith('£'):
              return value 

          value = value.replace(',', '')

          # Take the first number (integer or decimal) in the remaining text
          try:
              num = float(re.findall(r'\d+(?:\.\d+)?', value)[0])
          except (IndexError, ValueError):
              return ""

          num_in_inr = int(num * multiplier)

          return f"₹{num_in_inr:,}"

        # Budget - from box office section
        try:
            budget_element = driver.find_element(By.CSS_SELECTOR, 'li[data-testid="title-boxoffice-budget"] .ipc-metadata-list-item__list-content-item')
            budget = budget_element.text.strip()
        except:
            budget = ""

        details["budget"] = clean_money_value(budget)


        # Worldwide gross (collection)
        try:
            collection_element = driver.find_element(By.CSS_SELECTOR, 'li[data-testid="title-boxoffice-cumulativeworldwidegross"] .ipc-metadata-list-item__list-content-item')
            collection = collection_element.text.strip()
        except:
            collection = ""

        details["collection"] = clean_money_value(collection)


        # Extract Director(s): name links inside list items labelled exactly
        # "Director", de-duplicated while keeping first-seen order
        try:
            elements = driver.find_elements(By.XPATH, '//li[.//span[text()="Director"]]//a[contains(@href, "/name/")]')
            seen = set()
            directors = []
            for elem in elements:
                name = elem.text.strip()
                if name and name not in seen:
                    seen.add(name)
                    directors.append(name)
        except:
            directors = []

        details["directors"] = ", ".join(directors)



        # Calculate popularity = vote_average * (vote_count / 1000), rounded
        # to 2 decimals; left empty when either input is missing or non-numeric
        try:
            vote_avg = float(details["vote_average"])
            vote_count = int(details["vote_count"].replace(",", ""))
            details["popularity"] = str(round(vote_avg * (vote_count / 1000), 2))
        except:
            details["popularity"] = ""

    except Exception as e:
        print(f"Error extracting details for {imdb_id}: {e}")


    # Sleep to avoid overloading the server
    time.sleep(random.uniform(2, 4))

    return details

def get_completed_ids(output_file):
    """Get the set of imdb_ids that have already been processed.

    Args:
        output_file: Path of the results CSV written by earlier runs.

    Returns:
        set: values of the 'imdb_id' column; empty when the file does not
        exist or cannot be read as a CSV with that column.
    """
    if not os.path.exists(output_file):
        return set()

    try:
        df = pd.read_csv(output_file)
        return set(df['imdb_id'].tolist())
    except Exception as e:
        # Still best effort, but say why resume data was ignored, and do not
        # swallow KeyboardInterrupt/SystemExit as a bare except would.
        print(f"Could not read completed IDs from {output_file}: {e}")
        return set()

def main(input_file, output_file):
    """Main function to run the scraper"""

    !apt-get update
    !apt install -y chromium-browser
    !apt-get install -y chromium-driver


    try:
        df_input = pd.read_csv(input_file)
        print(f"Loaded input file with {len(df_input)} entries")
    except Exception as e:
        print(f"Error reading input file: {e}")
        raise

    # Check if output file exists, and get already completed IDs
    completed_ids = get_completed_ids(output_file)
    print(f"Found {len(completed_ids)} already processed entries")

    if os.path.exists(output_file):
        df_output = pd.read_csv(output_file)
    else:
        df_output = pd.DataFrame(columns=COLUMNS)

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
    chrome_options.add_argument('--disable-extensions')

    # Create a new Chrome driver
    driver = webdriver.Chrome(options=chrome_options)
    driver.set_page_load_timeout(30)

    try:
        for index, row in df_input.iterrows():
            imdb_id = row['imdb_id']

            if imdb_id in completed_ids:
                print(f"Skipping {imdb_id} - already processed")
                continue

            print(f"Processing {index+1}/{len(df_input)}: {imdb_id}")

            details = get_title_details(imdb_id, driver)
            df_output = pd.concat([df_output, pd.DataFrame([details])], ignore_index=True)
            df_output.to_csv(output_file, index=False)
            print(f"Saved progress for {imdb_id}")

            # Add random pause between requests
            pause_time = random.uniform(*DELAY_BETWEEN_REQUESTS)
            print(f"Pausing for {pause_time:.2f} seconds...")
            time.sleep(pause_time)

    except KeyboardInterrupt:
        print("Scraping interrupted by user.")
    except Exception as e:
        print(f"Error during scraping: {e}")
    finally:

        df_output.to_csv(output_file, index=False)
        print(f"Saved results to {output_file}")

        driver.quit()
        print("Driver closed.")

# Entry point for the detail scraper.
if __name__ == "__main__":

    # Input: CSV of IMDb IDs to scrape; output: CSV of scraped details.
    # NOTE(review): both paths sit under /content/drive/MyDrive, which
    # presumably requires Google Drive to be mounted first - confirm.
    input_file = "/content/drive/MyDrive/indian_tv_shows.csv"
    output_file = "/content/drive/MyDrive/tv_imdb_details.csv"

    main(input_file, output_file)

In [None]:
import requests
from bs4 import BeautifulSoup

def get_plot_synopsis(imdb_id):
    """Fetch the synopsis text from a title's IMDb plot-summary page.

    Args:
        imdb_id: IMDb title identifier, interpolated into the page URL.

    Returns:
        str | None: the synopsis text with whitespace-normalised separators,
        or None when the response status is not 200 or the synopsis section
        is not present in the page.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
        the server does not respond within the timeout.
    """
    url = f"https://www.imdb.com/title/{imdb_id}/plotsummary/?ref_=tt_stry_pl#synopsis"
    headers = {
        "User-Agent": "Mozilla/5.0"
    }

    # requests has no default timeout; without one a stalled connection
    # blocks forever.
    response = requests.get(url, headers=headers, timeout=20)
    if response.status_code != 200:
        # Do not parse an error page as if it were the plot summary.
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Look for the synopsis section
    synopsis_container = soup.find('div', {'data-testid': 'sub-section-synopsis'})
    if synopsis_container:
        synopsis_div = synopsis_container.find('div', class_='ipc-html-content-inner-div')
        if synopsis_div:
            return synopsis_div.get_text(separator=' ', strip=True)

    return None

# Example usage
imdb_id = "tt15239678"
synopsis = get_plot_synopsis(imdb_id)
# get_plot_synopsis returns None when no synopsis is found; slicing None
# would raise TypeError, so only print a preview when there is text.
if synopsis is not None:
    print(synopsis[:500])
else:
    print(f"No synopsis found for {imdb_id}")


In [None]:
import time
import random
import re
import os
import pandas as pd
import requests
import concurrent.futures
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Constants
# Site root; title pages are built as f"{BASE_URL}/title/{imdb_id}/".
BASE_URL = "https://www.imdb.com"
# Keys of the per-title details dict.
COLUMNS = [
    "imdb_id", "title", "vote_average", "vote_count", "release_date",
    "original_language", "overview", "popularity", "genres",
    "production_companies", "release_year", "cast",
    "budget", "collection", "directors"
]
# NOTE(review): presumably the thread-pool size and the number of titles
# handled per batch; their use is not visible in this part of the file.
NUM_WORKERS = 8  
BATCH_SIZE = 20
# Total retry budget handed to urllib3's Retry in create_session.
MAX_RETRIES = 5
# NOTE(review): looks like a (min, max) pause in seconds between requests -
# confirm against the code that reads it.
DELAY_BETWEEN_REQUESTS = (0.2, 1.0)  

# Setup session with retries
def create_session():
    """Create a requests Session with retries, a default timeout and headers.

    The session retries GET requests up to MAX_RETRIES times (exponential
    back-off) on 429/500/502/503/504 responses, applies a 20-second timeout
    to every request unless the caller passes its own ``timeout``, and sends
    browser-like headers with a User-Agent picked at random per session.

    Returns:
        requests.Session: the configured session.
    """
    # Local import: functools is not among this file's module-level imports.
    import functools

    session = requests.Session()

    retries = Retry(
        total=MAX_RETRIES,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"]
    )

    adapter = HTTPAdapter(max_retries=retries)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    # requests ignores a `timeout` attribute on Session, so assigning it alone
    # leaves every request without a timeout. The attribute is kept for any
    # code that reads it; the default is enforced by pre-binding it on
    # session.request, which get()/post()/... all delegate to. An explicit
    # timeout= passed by the caller still overrides it.
    session.timeout = 20
    session.request = functools.partial(session.request, timeout=session.timeout)

    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
        'Mozilla/5.0 (iPad; CPU OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1'
    ]
    session.headers.update({
        'User-Agent': random.choice(user_agents),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    })

    return session

def _select_first(soup, *selectors):
    """Return the first element matched by *selectors* (tried in order), or None."""
    for selector in selectors:
        element = soup.select_one(selector)
        if element is not None:
            return element
    return None


def _extract_numeric_value(text):
    """Turn a count such as "12,345", "1.2K" or "3M" into an integer string.

    Returns "" when *text* is empty or contains no parsable number.
    """
    if not text:
        return ""
    match = re.search(r'([\d,.]+)\s*([KM]?)', text, re.IGNORECASE)
    if not match:
        return ""

    number, suffix = match.groups()
    multiplier = {"": 1, "K": 1_000, "M": 1_000_000}.get(suffix.upper(), 1)
    try:
        return str(int(float(number.replace(',', '').strip()) * multiplier))
    except (ValueError, OverflowError):
        # e.g. "1.2.3" or a lone "," / "."
        return ""


def _clean_money_value(value, usd_to_inr=83):
    """Normalise a box-office figure to a rupee string such as "₹8,300".

    Dollar amounts are converted with the fixed *usd_to_inr* rate; rupee
    amounts and bare numbers are reformatted as-is. Figures in any other
    currency are returned unchanged (minus parenthesised qualifiers) rather
    than being relabelled as rupees. Returns "" when there is nothing to parse.
    """
    if not value:
        return ""
    # Drop qualifiers such as "(estimated)".
    value = re.sub(r'\(.*?\)', '', value.strip()).strip()

    if value.startswith('$'):
        multiplier = usd_to_inr
    elif value.startswith('₹') or value[:1].isdigit():
        multiplier = 1
    else:
        # €, £, CA$, SEK ...: no conversion rate available here.
        return value

    number = re.search(r'\d+(?:\.\d+)?', value.replace(',', ''))
    if not number:
        return ""
    return f"₹{int(float(number.group()) * multiplier):,}"


def get_title_details(imdb_id, session):
    """Scrape the metadata of one IMDb title page using requests.

    Args:
        imdb_id: IMDb title identifier; the page fetched is
            ``{BASE_URL}/title/{imdb_id}/``.
        session: ``requests.Session``-like object; only ``get()`` is called.

    Returns:
        dict: one entry per name in ``COLUMNS`` ("" where nothing could be
        scraped). When the page is parsed, ``runtime`` and
        ``country_of_origin`` are added as well (``None`` when absent).
        Errors are printed, never raised; whatever was collected so far is
        returned.
    """
    details = {col: "" for col in COLUMNS}
    details["imdb_id"] = imdb_id

    title_url = f"{BASE_URL}/title/{imdb_id}/"

    try:
        # requests never times out unless told to; honour the session's
        # configured value when it has one.
        timeout = getattr(session, "timeout", None) or 20
        response = session.get(title_url, timeout=timeout)
        if response.status_code != 200:
            print(f"Error fetching {imdb_id}: HTTP status {response.status_code}")
            return details

        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
        page_title = soup.title.text if soup.title else ""

        # Title: hero heading first, then the "<Title> (YYYY) - IMDb" page title
        title_elem = _select_first(
            soup,
            'h1[data-testid="hero-title-block__title"]',
            'h1.hero__title',
        )
        title = title_elem.text.strip() if title_elem is not None else ""
        if not title:
            title_match = re.match(r'(.+?)(?:\s*\([0-9]{4}\))?\s*-\s*IMDb', page_title)
            if title_match:
                title = title_match.group(1).strip()
        details["title"] = title

        # Rating (vote_average). The fallback uses a substring class match
        # because IMDb appends a generated suffix to its class names.
        rating_elem = _select_first(
            soup,
            'div[data-testid="hero-rating-bar__aggregate-rating__score"] span:first-child',
            'span[class*="AggregateRatingButton__RatingScore"]',
        )
        if rating_elem is not None:
            rating = rating_elem.text.strip().replace('/10', '')
            try:
                float(rating)
            except ValueError:
                rating = ""
            details["vote_average"] = rating

        # Vote count: first selector that yields non-empty text wins
        vote_count_selectors = [
            'div[class*="AggregateRatingButton__TotalRatingAmount"]',
            'div.sc-d541859f-3',
            'div.imdbRating strong',
            'span.ipc-rating-star--imdb + div',
            'a[href*="ratings"] span',
        ]
        raw_vote_count = ""
        for selector in vote_count_selectors:
            el = soup.select_one(selector)
            if el is not None and el.get_text(strip=True):
                raw_vote_count = el.get_text(strip=True)
                break
        details["vote_count"] = _extract_numeric_value(raw_vote_count)

        # Fallback: check raw text for "<number> votes"
        if not details["vote_count"]:
            for txt in soup.find_all(string=re.compile(r'\d[\d,]*\s+votes')):
                votes_match = re.search(r'([\d,]+)\s*votes', txt)
                if votes_match:
                    details["vote_count"] = votes_match.group(1).replace(',', '')
                    break

        # Release date (and the year embedded in it)
        release_date_elem = _select_first(
            soup,
            'li[data-testid="title-details-releasedate"] .ipc-metadata-list-item__list-content-item',
            'a[href*="releaseinfo"]',
        )
        if release_date_elem is not None:
            release_date = release_date_elem.text.strip()
            details["release_date"] = release_date
            year_match = re.search(r'(\d{4})', release_date)
            if year_match:
                details["release_year"] = year_match.group(1)

        # If year not found in release date, try the title block, then the page title
        if not details["release_year"]:
            year_elem = soup.select_one('span[class*="TitleBlockMetaData__ReleaseYear"]')
            if year_elem is not None:
                year_match = re.search(r'(\d{4})', year_elem.text)
                if year_match:
                    details["release_year"] = year_match.group(1)
        if not details["release_year"]:
            year_match = re.search(r'\((\d{4})\)', page_title)
            if year_match:
                details["release_year"] = year_match.group(1)

        # Overview/Plot
        plot_elem = _select_first(
            soup,
            'span[class*="GenresAndPlot__TextContainerBreakpointXL"]',
            'p[data-testid="plot"]',
        )
        if plot_elem is not None:
            details["overview"] = plot_elem.text.strip()

        # Genres (skip the "N more" expander chip)
        genre_elems = soup.select('div[data-testid="genres"] span.ipc-chip__text')
        if not genre_elems:
            genre_elems = soup.select('.ipc-chip-list--baseAlt span.ipc-chip__text')
        genres = [
            text
            for text in (elem.text.strip() for elem in genre_elems)
            if text and "more" not in text.lower()
        ]
        details["genres"] = ", ".join(genres)

        # Language
        lang_elem = soup.select_one('li[data-testid="title-details-languages"] .ipc-metadata-list-item__list-content-item')
        if lang_elem is not None:
            details["original_language"] = lang_elem.text.strip()

        # Production companies
        company_elems = soup.select('li[data-testid="title-details-companies"] .ipc-metadata-list-item__list-content-item')
        companies = [elem.text.strip() for elem in company_elems if elem.text.strip()]
        details["production_companies"] = ", ".join(companies)

        # Budget - from box office section
        budget_elem = soup.select_one('li[data-testid="title-boxoffice-budget"] .ipc-metadata-list-item__list-content-item')
        budget = budget_elem.text.strip() if budget_elem is not None else ""
        details["budget"] = _clean_money_value(budget)

        # Worldwide gross (collection)
        collection_elem = soup.select_one('li[data-testid="title-boxoffice-cumulativeworldwidegross"] .ipc-metadata-list-item__list-content-item')
        collection = collection_elem.text.strip() if collection_elem is not None else ""
        details["collection"] = _clean_money_value(collection)

        # Director(s): person links inside the first <li> whose text mentions "Director"
        directors = []
        director_section = soup.find(string=re.compile("Director"))
        if director_section and director_section.parent:
            director_li = director_section.find_parent('li')
            if director_li:
                names = (link.text.strip() for link in director_li.select('a[href*="/name/"]'))
                # dict.fromkeys de-duplicates while keeping page order
                directors = list(dict.fromkeys(name for name in names if name))
        details["directors"] = ", ".join(directors)

        # Runtime
        runtime_elem = soup.select_one('li[data-testid="title-techspec_runtime"]')
        if runtime_elem is not None:
            runtime_text = runtime_elem.get_text(separator=" ", strip=True)
            details["runtime"] = runtime_text.replace("Runtime", "").strip()
        else:
            details["runtime"] = None

        # Country of origin
        origin_elem = soup.select_one('li[data-testid="title-details-origin"]')
        if origin_elem is not None:
            country_links = origin_elem.select('a.ipc-metadata-list-item__list-content-item')
            countries = [a.get_text(strip=True) for a in country_links if a.get_text(strip=True)]
            details["country_of_origin"] = ", ".join(countries)
        else:
            details["country_of_origin"] = None

        # Cast as "Actor: Character [extra]" for the first 10 cast blocks
        cast_items = []
        for block in soup.select('div[data-testid="title-cast-item"]')[:10]:
            actor_elem = block.select_one('a[data-testid="title-cast-item__actor"]')
            if actor_elem is None:
                continue
            actor_name = actor_elem.get_text(strip=True)

            char_elem = (
                block.select_one('a[data-testid="cast-item-characters-link"] span.sc-cd7dc4b7-4')
                or block.select_one('span.sc-cd7dc4b7-4')
            )
            extra_info = block.select_one('span.sc-cd7dc4b7-9')

            if char_elem is None:
                # Add just the actor name if no character info is found
                cast_items.append(actor_name)
            elif extra_info is not None:
                cast_items.append(
                    f"{actor_name}: {char_elem.get_text(strip=True)} {extra_info.get_text(strip=True)}"
                )
            else:
                cast_items.append(f"{actor_name}: {char_elem.get_text(strip=True)}")

        # Fallback: bare actor names (new layout first, then the legacy table)
        if not cast_items:
            cast_elems = soup.select('a[data-testid="title-cast-item__actor"]')
            if not cast_elems:
                cast_elems = soup.select('td.primary_photo + td a')
            cast_items = [elem.get_text(strip=True) for elem in cast_elems[:10] if elem.get_text(strip=True)]

        details["cast"] = ", ".join(cast_items)

        # Popularity = rating * (votes / 1000); blank when either part is missing
        try:
            vote_avg = float(details["vote_average"])
            votes = int(details["vote_count"].replace(",", ""))
        except ValueError:
            details["popularity"] = ""
        else:
            details["popularity"] = str(round(vote_avg * (votes / 1000), 2))

    except Exception as e:
        # Best-effort scraper: one bad page must not abort the whole batch.
        print(f"Error processing {imdb_id}: {str(e)[:100]}...")
    finally:
        # Throttle on every path, including non-200 responses and errors.
        time.sleep(random.uniform(*DELAY_BETWEEN_REQUESTS))

    return details

def process_batch(imdb_ids_batch):
    """Scrape a batch of IMDb IDs sequentially using a dedicated session.

    Args:
        imdb_ids_batch: iterable of IMDb title IDs.

    Returns:
        list[dict]: one details dict per ID, in input order. An ID whose
        scrape raised gets a row with every ``COLUMNS`` field blank except
        ``imdb_id``, so all rows share the same keys.
    """
    results = []
    session = create_session()
    try:
        for imdb_id in imdb_ids_batch:
            try:
                details = get_title_details(imdb_id, session)
            except Exception as e:
                print(f"Error processing {imdb_id}: {str(e)[:100]}...")
                details = {col: "" for col in COLUMNS}
                details["imdb_id"] = imdb_id
            else:
                print(f"Processed: {imdb_id}")
            results.append(details)
    finally:
        # Release pooled connections; each worker creates its own session.
        session.close()

    return results

def get_completed_ids(output_file):
    """Return the set of imdb_ids already present in *output_file*.

    Args:
        output_file: path of the CSV produced by earlier runs.

    Returns:
        set: the non-null ``imdb_id`` values found. Empty when the file does
        not exist, is empty, cannot be parsed, or has no ``imdb_id`` column.
    """
    if not os.path.exists(output_file):
        return set()

    try:
        # Only the ID column is needed; a missing column raises ValueError.
        ids = pd.read_csv(output_file, usecols=['imdb_id'])['imdb_id']
    except (ValueError, KeyError, OSError) as e:
        # EmptyDataError and ParserError are ValueError subclasses.
        print(f"Could not read completed IDs from {output_file}: {e}")
        return set()

    return set(ids.dropna().tolist())

def main(input_file, output_file):
    """Run the parallel requests-based scraper, resuming from earlier output.

    Args:
        input_file: CSV with an ``imdb_id`` column listing the titles to scrape.
        output_file: CSV the results are appended to. IDs already present in
            it are skipped, and it is rewritten after every batch.

    Raises:
        Exception: re-raised from ``pd.read_csv`` when *input_file* cannot be
        read. Errors during scraping are printed and progress is still saved.
    """
    try:
        df_input = pd.read_csv(input_file)
        print(f"Loaded input file with {len(df_input)} entries")
    except Exception as e:
        print(f"Error reading input file: {e}")
        raise

    completed_ids = get_completed_ids(output_file)
    print(f"Found {len(completed_ids)} already processed entries")

    df_output = pd.DataFrame(columns=COLUMNS)
    if os.path.exists(output_file):
        try:
            df_output = pd.read_csv(output_file)
        except pd.errors.EmptyDataError:
            # Zero-byte file left behind by an aborted run: start fresh.
            pass

    # Column access instead of iterrows; skip blanks and repeated IDs so no
    # title is fetched twice.
    ids_to_process = [
        imdb_id
        for imdb_id in df_input['imdb_id'].dropna().drop_duplicates()
        if imdb_id not in completed_ids
    ]
    print(f"Processing {len(ids_to_process)} remaining entries")

    batches = [
        ids_to_process[i:i + BATCH_SIZE]
        for i in range(0, len(ids_to_process), BATCH_SIZE)
    ]

    finished = False
    try:
        for batch_idx, batch in enumerate(batches):
            print(f"Processing batch {batch_idx+1}/{len(batches)}")

            # Ceiling division: at most NUM_WORKERS chunks, so every chunk
            # gets its own worker.
            chunk_size = -(-len(batch) // NUM_WORKERS)
            sub_batches = [
                batch[i:i + chunk_size] for i in range(0, len(batch), chunk_size)
            ]

            # Process sub-batches in parallel
            batch_results = []
            with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
                futures = [executor.submit(process_batch, sub_batch) for sub_batch in sub_batches]
                for future in concurrent.futures.as_completed(futures):
                    batch_results.extend(future.result())

            new_data = pd.DataFrame(batch_results)
            df_output = pd.concat([df_output, new_data], ignore_index=True)

            # Save progress after each batch
            df_output.to_csv(output_file, index=False)
            print(f"Saved progress after batch {batch_idx+1}")
            if batch_idx < len(batches) - 1:
                time.sleep(random.uniform(1, 2))
        finished = True

    except KeyboardInterrupt:
        print("Scraping interrupted by user.")
    except Exception as e:
        print(f"Error during scraping: {e}")
    finally:
        df_output.to_csv(output_file, index=False)
        print(f"Saved results to {output_file}")
        if finished:
            print("Scraping complete.")
        else:
            print("Scraping stopped early; rerun to resume.")

# Entry point when the cell/script is run directly: scrape the IDs listed in
# the input CSV and write (or resume) the output CSV.
if __name__ == "__main__":
    input_file = "/content/indian_movies_ids.csv"
    output_file = "/content/om.csv"

    main(input_file, output_file)

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re
import os
import json
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import random
import logging
from datetime import datetime

# Timestamped INFO-level logging for this cell.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Pool of browser User-Agent strings; one is picked at random per search / page fetch.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'
]

# Constants
INPUT_CSV = '/content/plo.csv'  # CSV read by main(); its 'title' column drives the search
OUTPUT_CSV = '/content/infoo.csv'  # copy of the input plus an 'additional_info' column
CHECKPOINT_FILE = '/content/progress.json'  # JSON resume state; removed after a full run
SAVE_INTERVAL = 100  # upper bound on rows processed between checkpoint/CSV saves

def clean_title(title):
    """Strip a "(YYYY)" year marker and all punctuation from *title*.

    Word characters and whitespace are kept; surrounding whitespace is trimmed.
    """
    without_year = re.sub(r'\s*\(\d{4}\)\s*', '', title)
    return re.sub(r'[^\w\s]', '', without_year).strip()

def search_wikipedia(title, try_variations=True):
    """
    Search Wikipedia for a movie/TV show and return the URL of the best matching result.

    The title is queried through the MediaWiki ``opensearch`` API, first as
    given and then (when *try_variations* is true) with "film", "movie" and
    "TV series" qualifiers, with and without the year taken from a trailing
    "(YYYY)" / "(YYYY-YYYY)", and with the part before a colon. All hits are
    pooled and ranked by ``score_search_results``.

    Args:
        title: title to look up, optionally followed by a year in parentheses.
        try_variations: when False, only the exact title is searched.

    Returns:
        str | None: URL of the highest scoring page, or None when no search
        returned a result.
    """
    user_agent = random.choice(USER_AGENTS)

    # Extract the base title and year if present in format "Title (year)"
    year = None
    base_title = title
    year_match = re.search(r'(.+?)\s*\((\d{4}(?:[–\-]\d{4}| )?)\)', title)
    if year_match:
        base_title = year_match.group(1).strip()
        year = year_match.group(2).strip()

    # Search variations in priority order; the exact title always comes first.
    search_variations = [title]
    if try_variations:
        if year:
            search_variations += [
                f"{base_title} {year} film",
                f"{base_title} {year} movie",
            ]
        search_variations += [
            f"{base_title} film",
            f"{base_title} movie",
            f"{base_title} TV series",
            f"{base_title} television series",
            base_title,
        ]
        # Titles with a subtitle: also try the part before the colon
        if ':' in base_title:
            main_title = base_title.split(':', 1)[0].strip()
            search_variations.append(main_title)
            if year:
                search_variations.append(f"{main_title} {year} film")
            search_variations += [
                f"{main_title} film",
                f"{main_title} movie",
                f"{main_title} TV series",
            ]

    # Remove duplicates while preserving order
    search_variations = list(dict.fromkeys(search_variations))

    logger.info(f"Searching for '{title}' with variations: {search_variations}")

    all_results = []

    for search_term in search_variations:
        try:
            response = requests.get(
                'https://en.wikipedia.org/w/api.php',
                params={
                    'action': 'opensearch',
                    'search': search_term,
                    'limit': '5',
                    'namespace': '0',
                    'format': 'json'
                },
                headers={'User-Agent': user_agent},
                # Without a timeout a stalled connection blocks the worker forever.
                timeout=15,
            )

            if response.status_code == 200:
                # opensearch answers [query, [titles], [descriptions], [urls]]
                data = response.json()
                if isinstance(data, list) and len(data) >= 4 and data[3]:
                    for result_title, result_url in zip(data[1], data[3]):
                        all_results.append({
                            'title': result_title,
                            'url': result_url,
                            'search_term': search_term
                        })

        except (requests.RequestException, ValueError, TypeError) as e:
            # ValueError covers a body that is not valid JSON.
            logger.error(f"Error searching for '{search_term}': {str(e)}")
        finally:
            # Throttle after every attempt, including failed ones.
            time.sleep(0.5)

    if not all_results:
        return None

    # Score and rank results
    scored_results = score_search_results(all_results, base_title, year)

    # Return the URL of the highest scoring result
    if scored_results:
        best_match = scored_results[0]
        logger.info(f"Best match for '{title}': '{best_match['title']}' (Score: {best_match['score']})")
        return best_match['url']

    return None

def score_search_results(results, base_title, year=None):
    """
    Score search results based on how well they match the original title and year.

    Scoring (cumulative):
        * title match: exact +100, prefix +80, substring +60
        * "film"/"movie" in the title +30, else "tv series"/"television series" +25
        * the requested year in the title +40, else any other 4-digit number +10
        * "(YYYY ... film|movie)" or "(TV|television series)" qualifier +15
        * "(disambiguation)" page -100
        * the word "actor", "actress" or "director" -50 (person pages)

    Args:
        results: List of dict with 'title' and 'url' keys
        base_title: Original title without year
        year: Year of release if available

    Returns:
        List of copies of the results, each with an added 'score' key, sorted
        by score (highest first). Ties keep their input order.
    """
    scored_results = []
    base_title_lower = base_title.lower()

    for result in results:
        result_title = result['title']
        result_lower = result_title.lower()
        score = 0

        # Exact match is best
        if result_lower == base_title_lower:
            score += 100
        elif result_lower.startswith(base_title_lower):
            score += 80
        elif base_title_lower in result_lower:
            score += 60

        if 'film' in result_lower or 'movie' in result_lower:
            score += 30
        elif 'tv series' in result_lower or 'television series' in result_lower:
            score += 25

        # Check for year match if we have a year
        if year:
            if year in result_title:
                score += 40
            elif re.search(r'\b\d{4}\b', result_title):
                score += 10

        # Is it specifically marked as a film or TV show?
        if re.search(r'\(\d{4}.*(?:film|movie)\)', result_title):
            score += 15
        elif re.search(r'\((?:TV|television) series\)', result_title):
            score += 15

        # Avoid disambiguation pages
        if '(disambiguation)' in result_lower:
            score -= 100

        # Penalise person pages. Whole words only: a plain substring test
        # would also hit titles such as "The X Factor" or "Tractor".
        if re.search(r'\b(?:actor|actress|director)\b', result_lower):
            score -= 50

        scored_result = result.copy()
        scored_result['score'] = score
        scored_results.append(scored_result)

    return sorted(scored_results, key=lambda x: x['score'], reverse=True)

def extract_movie_info(url):
    """Extract franchise, sequel and universe info from a Wikipedia page.

    Args:
        url: address of the Wikipedia article; falsy values are accepted.

    Returns:
        str: always a single string, one of
        * the findings joined with "; ",
        * " " when *url* is empty, the page has no infobox, or nothing was found,
        * "Failed to access Wikipedia page: <status>" for a non-200 response,
        * "Error: <message>" when fetching or parsing raised.
    """
    if not url:
        return " "

    user_agent = random.choice(USER_AGENTS)

    try:
        # Without a timeout a stalled connection blocks the worker forever.
        response = requests.get(url, headers={'User-Agent': user_agent}, timeout=15)
        if response.status_code != 200:
            # A plain string, like every other return path: the value ends up
            # in a single DataFrame cell.
            return f"Failed to access Wikipedia page: {response.status_code}"

        soup = BeautifulSoup(response.content, 'html.parser')

        infobox = soup.find('table', {'class': 'infobox'})
        if infobox is None:
            return " "

        findings = (
            extract_franchise_info(soup, infobox),
            extract_sequel_info(soup),
            extract_universe_info(soup, infobox),
        )
        additional_info = [info for info in findings if info]

        if not additional_info:
            return " "

        return "; ".join(additional_info)

    except Exception as e:
        logger.error(f"Error extracting info from {url}: {str(e)}")
        return f"Error: {str(e)}"


def extract_franchise_info(soup, infobox):
    """Find the franchise/series a film belongs to.

    Three sources are tried in order and the first hit wins: an infobox row
    whose header mentions "series" or "franchise", a category link mentioning
    "film series" or "franchise", and finally "part of ..." style phrases in
    the first five paragraphs. Returns None when nothing matches.
    """
    # Source 1: infobox rows labelled series/franchise
    for row in infobox.find_all('tr'):
        header = row.find('th')
        if not (header and header.text):
            continue
        label = header.text.lower()
        if 'series' not in label and 'franchise' not in label:
            continue
        value = row.find('td')
        if value:
            return value.text.strip()

    # Source 2: page categories
    categories = soup.find('div', {'id': 'mw-normal-catlinks'})
    if categories:
        for link in categories.find_all('a'):
            lowered = link.text.lower()
            if 'film series' in lowered or 'franchise' in lowered:
                return link.text

    # Source 3: phrasing in the lead paragraphs
    content = soup.find('div', {'id': 'mw-content-text'})
    if content:
        franchise_patterns = (
            r'part of (?:the )?(.*?(?:franchise|series|trilogy|universe))',
            r'(?:is|was) (?:the|a) ((?:\w+\s)+(?:film series|franchise|trilogy))',
            r'(?:is|was) (?:the|a) ((?:\w+\s)+(?:installment|film)) in the (.*?(?:series|franchise|trilogy))',
        )
        for paragraph in content.find_all('p', limit=5):
            text = paragraph.text.lower()
            for pattern in franchise_patterns:
                match = re.search(pattern, text, re.IGNORECASE)
                if not match:
                    continue
                # Two-group patterns carry the series name in the second group.
                if match.re.groups > 1 and match.group(2):
                    return match.group(2).strip()
                return match.group(1).strip()

    return None


def extract_sequel_info(soup):
    """Collect sequel/prequel mentions from a Wikipedia article.

    Looks at the list following a "See also" / "Other films" heading and at
    the first ten paragraphs. Returns the mentions joined with "; ", or None
    when the page has no content block or nothing was found.
    """
    content = soup.find('div', {'id': 'mw-content-text'})
    if not content:
        return None

    mentions = []

    # List items under "See also" / "Other films" that talk about follow-ups
    heading = soup.find('span', {'id': re.compile('See_also|Other_films')})
    listing = heading.parent.find_next('ul') if heading else None
    if listing:
        for item in listing.find_all('li'):
            lowered = item.text.lower()
            if any(word in lowered for word in ('sequel', 'prequel', 'follow')):
                mentions.append(item.text.strip())

    # Phrases in the leading paragraphs
    sequel_patterns = (
        r'(?:sequel to|follow-up)[^\.\,]*((?:titled |called |named )?[\w\s:]+)',
        r'(?:prequel|preceded by)[^\.\,]*((?:titled |called |named )?[\w\s:]+)',
    )
    for paragraph in content.find_all('p', limit=10):
        text = paragraph.text.lower()
        if not ('sequel' in text or 'prequel' in text or 'preceded by' in text):
            continue
        for pattern in sequel_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                # Keep the whole matched phrase, minus citation markers like [3]
                mentions.append(re.sub(r'\[\d+\]', '', match.group(0).strip()))

    return '; '.join(mentions) if mentions else None

def extract_universe_info(soup, infobox):
    """Extract universe information (MCU, DCEU, etc.).

    Sources, in order: an infobox row whose header mentions "universe", a
    known or generic "<word> universe" phrase in the first five paragraphs
    (returned lower-cased), then a category link mentioning "universe".

    Returns:
        str | None: the first hit, or None when no source mentions a universe.
    """
    # Search for universe mentions in the infobox
    for row in infobox.find_all('tr'):
        header = row.find('th')
        if header and header.text and 'universe' in header.text.lower():
            value = row.find('td')
            if value:
                return f"{value.text.strip()}"

    # Named universes first; the generic "<word> universe" pattern goes last,
    # otherwise it shadows e.g. "star wars universe" by matching "wars universe".
    universe_patterns = (
        r'(marvel cinematic universe)',
        r'(dc extended universe)',
        r'(dc universe)',
        r'(star wars universe)',
        r'(wizarding world)',
        r'(\w+ universe)',
    )

    # Check the first few paragraphs for universe mentions
    content = soup.find('div', {'id': 'mw-content-text'})
    if content:
        for p in content.find_all('p', limit=5):
            text = p.text.lower()
            for pattern in universe_patterns:
                match = re.search(pattern, text, re.IGNORECASE)
                if match:
                    return f"{match.group(1)}"

    # Check categories
    categories = soup.find('div', {'id': 'mw-normal-catlinks'})
    if categories:
        for link in categories.find_all('a'):
            if 'universe' in link.text.lower():
                return f"{link.text}"
    return None

def save_checkpoint(processed_indices, results):
    """Save checkpoint data to ``CHECKPOINT_FILE`` as JSON.

    Args:
        processed_indices: row indices (as strings) handled so far.
        results: mapping of row index (string) to the info found for that row.

    The data is written to a sibling ".tmp" file and then moved over the real
    checkpoint, so an interruption mid-write cannot leave a truncated file
    that would silently reset progress on the next run.
    """
    checkpoint_data = {
        'processed_indices': processed_indices,
        'results': results,
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }

    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump(checkpoint_data, f)
    # os.replace overwrites the destination in a single rename.
    os.replace(tmp_path, CHECKPOINT_FILE)

    logger.info(f"Checkpoint saved: {len(processed_indices)} movies processed")

def load_checkpoint():
    """Read resume state from ``CHECKPOINT_FILE``.

    Returns:
        tuple: ``(processed_indices, results)`` as stored by the last save, or
        ``([], {})`` when there is no checkpoint or it cannot be read.
    """
    if os.path.exists(CHECKPOINT_FILE):
        try:
            with open(CHECKPOINT_FILE, 'r') as handle:
                saved = json.load(handle)
            logger.info(f"Checkpoint loaded from {saved['timestamp']}")
            logger.info(f"Resuming from {len(saved['processed_indices'])} processed movies")
            return saved['processed_indices'], saved['results']
        except Exception as e:
            # Unreadable or incomplete checkpoint: log it and start from scratch.
            logger.error(f"Error loading checkpoint: {str(e)}")

    return [], {}

def save_results_to_csv(df, results):
    """Write *df* plus the collected info to ``OUTPUT_CSV``.

    Args:
        df: input DataFrame; it is copied, never modified.
        results: mapping of row label (int or numeric string) to the text for
            that row's ``additional_info`` cell.
    """
    df_copy = df.copy()

    # The column may be absent, or float64 when it was read back as all-NaN;
    # an object column accepts strings without a dtype upcast.
    if 'additional_info' not in df_copy.columns:
        df_copy['additional_info'] = None
    df_copy['additional_info'] = df_copy['additional_info'].astype(object)

    for idx, info in results.items():
        # A sequence (e.g. a tuple that became a list in the JSON checkpoint)
        # cannot be assigned to a single cell: keep its first non-empty part.
        if isinstance(info, (list, tuple)):
            info = next((str(part) for part in info if part), " ")
        df_copy.loc[int(idx), 'additional_info'] = info

    df_copy.to_csv(OUTPUT_CSV, index=False)
    logger.info(f"Updated results saved to {OUTPUT_CSV}")

def process_movie(args):
    """Process a single movie row.

    Args:
        args: ``(idx, movie_row)`` where *movie_row* is a mapping with a
            'title' entry.

    Returns:
        tuple: always the pair ``(idx, info)``. *info* is the extracted text,
        " " when the title is missing/blank or no Wikipedia page was found,
        or "Error: <message>" when the lookup raised.

    Raises:
        KeyError: if *movie_row* has no 'title' key.
    """
    idx, movie_row = args
    title = movie_row['title']

    # Missing titles arrive as None or NaN (NaN is the only value != itself).
    if title is None or title != title:
        return idx, " "
    title = str(title).strip()
    if not title:
        return idx, " "

    try:
        wiki_url = search_wikipedia(title, try_variations=True)
        if not wiki_url:
            return idx, " "

        time.sleep(random.uniform(0.5, 2))
        additional_info = extract_movie_info(wiki_url)
        # Callers expect one string per row; normalise defensively.
        if isinstance(additional_info, (list, tuple)):
            additional_info = next((str(part) for part in additional_info if part), " ")
        return idx, additional_info

    except Exception as e:
        logger.error(f"Error processing {title}: {str(e)}")
        # Two items, like every other path, so callers can unpack (idx, info).
        return idx, f"Error: {str(e)}"

def main():
    """Add Wikipedia franchise/sequel/universe info to every row of ``INPUT_CSV``.

    Rows are processed in parallel in chunks of at most ``SAVE_INTERVAL``.
    After each chunk the checkpoint and ``OUTPUT_CSV`` are rewritten, so an
    interrupted run resumes where it stopped. The checkpoint file is removed
    once every row has been handled.
    """
    logger.info(f"Loading CSV file: {INPUT_CSV}...")
    try:
        df = pd.read_csv(INPUT_CSV)
    except Exception as e:
        logger.error(f"Error loading CSV: {str(e)}")
        return

    if 'title' not in df.columns:
        logger.error(f"Error loading CSV: no 'title' column in {INPUT_CSV}")
        return

    if 'additional_info' not in df.columns:
        df['additional_info'] = None

    processed_indices, results = load_checkpoint()
    # Set for O(1) membership tests; the list itself is kept because it is
    # what gets serialised into the checkpoint.
    already_done = set(processed_indices)

    movie_rows = df.to_dict('records')
    to_process = [
        (idx, row) for idx, row in enumerate(movie_rows)
        if str(idx) not in already_done
    ]

    if not to_process:
        logger.info("All movies have been processed already!")
        return

    logger.info(f"Starting to process {len(to_process)} remaining movies...")

    batch_size = min(SAVE_INTERVAL, len(to_process))

    # Both context managers clean up (worker threads, progress bar) on error.
    with ThreadPoolExecutor(max_workers=8) as executor, \
            tqdm(total=len(df), desc="Scraping Wikipedia") as pbar:
        pbar.update(len(processed_indices))

        for i in range(0, len(to_process), batch_size):
            batch = to_process[i:i + batch_size]
            batch_results = list(executor.map(process_movie, batch))

            # Update results and processed indices. "*_" tolerates a worker
            # that returns extra trailing items.
            for idx, info, *_ in batch_results:
                results[str(idx)] = info
                processed_indices.append(str(idx))

            pbar.update(len(batch))
            save_checkpoint(processed_indices, results)
            save_results_to_csv(df, results)

    save_results_to_csv(df, results)
    if os.path.exists(CHECKPOINT_FILE):
        os.remove(CHECKPOINT_FILE)

    logger.info("Processing complete!")

# Run the Wikipedia enrichment when this cell/script is executed directly.
if __name__ == "__main__":
    main()