# Webcrawling process (cont')

In [15]:
import pandas as pd
import requests
import warnings
warnings.filterwarnings("ignore")

## Get Movie Details of each film
This is the hardest part, not only does it takes time but there is also a risk of being temporarily/permanently blocked by the site (Error 403 Forbidden)...

### List of browers to rotate

In [16]:
HEADERS_LIST = [
  {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "X-Requested-With": "XMLHttpRequest"
  },
  {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:110.0) Gecko/20100101 Firefox/110.0",
    "X-Requested-With": "XMLHttpRequest"
  },
  {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
  },
  {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50",
    "X-Requested-With": "XMLHttpRequest"
  },
]

### Old: Parallel Webscraping - 20s/movie

In [17]:
from bs4 import BeautifulSoup
import re
from datetime import datetime
import random

def parse_movie_details(soup, url):
  """
  Shared parsing logic for movie details from BeautifulSoup object.
  Returns a dictionary with the structured data.
  """
  # Find the Movie Details section
  movie_details = {'link': url}
  
  # Get table #1 (index 1) - the metrics table
  production_budget = ''
  all_tables = soup.find_all('table', limit=4)
  metrics_table = all_tables[1]
  
  # Search all rows in table #1 for "Production Budget"
  rows = metrics_table.find_all('tr')
  for row in rows:
    cells = row.find_all(['td', 'th'])
    for cell in (cells):
      text = cell.get_text(strip=True)
      # Check if this cell contains "Production Budget"
      if 'production budget' in text.lower():
        production_budget = re.search(r'\$?([\d,]+)(?![\d,])', text).group(0)
        movie_details['Production Budget'] = production_budget
  
  # Look for the table with Movie Details
  details_table = all_tables[3]
  
  if details_table:
    rows = details_table.find_all('tr')
    for row in rows:
      cells = row.find_all(['td', 'th'])
      if len(cells) >= 2:
          key = cells[0].get_text(strip=True).replace('\xa0', ' ')
          value = cells[1].get_text(strip=True).replace('\xa0', ' ')
          
          # Clean up the key (remove colons and extra spaces)
          key = key.replace(':', '').strip()
          
          # Skip unwanted fields completely
          if key in ['Video Release', 'Comparisons', 'Keywords', 'Source', 'Languages']:
              continue
          
          # MPAA Rating:
          if key == 'MPAA Rating':
            allowed_ratings = ['PG-13', 'NC-17', 'PG', 'R', 'G']  # Order matters: check longer ones first
            upper_value = value.upper()
            found_rating = None
            for rating in allowed_ratings:
              if rating in upper_value:
                found_rating = rating
                break
            movie_details['MPAA Rating'] = found_rating

          # Handle Production Countries and Languages separately
          if key == 'Production Countries':
              # Check if Languages data is mixed in
              if 'Languages:' in value:
                  parts = value.split('Languages:')
                  movie_details['Production Countries'] = parts[0].strip()
                  if len(parts) > 1:
                      movie_details['Languages'] = parts[1].strip()
              else:
                  movie_details['Production Countries'] = value
          elif key == 'Languages':
              movie_details['Languages'] = value
          else:
              # Store all other fields
              movie_details[key] = value
  
  # Extract earliest release date from Domestic and International releases
  release_dates = []
  
  # Extract dates from Domestic Releases
  if 'Domestic Releases' in movie_details:
      domestic_text = movie_details['Domestic Releases']
      # Look for date patterns like "February 14th, 2025"
      domestic_dates = re.findall(r'([A-Za-z]+ \d{1,2}(?:st|nd|rd|th)?, \d{4})', domestic_text)
      release_dates.extend(domestic_dates)
  
  # Extract dates from International Releases
  if 'International Releases' in movie_details:
      intl_text = movie_details['International Releases']
      # Look for date patterns like "January 29th, 2025"
      intl_dates = re.findall(r'([A-Za-z]+ \d{1,2}(?:st|nd|rd|th)?, \d{4})', intl_text)
      release_dates.extend(intl_dates)
  
  # Find the earliest date
  if release_dates:
      try:
          # Convert dates to datetime objects for comparison
          parsed_dates = []
          for date_str in release_dates:
              try:
                  # Handle ordinal suffixes (st, nd, rd, th)
                  clean_date = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
                  parsed_date = datetime.strptime(clean_date, '%B %d, %Y')
                  parsed_dates.append(parsed_date)
              except:
                  continue
          
          if parsed_dates:
              earliest_date = min(parsed_dates)
              movie_details['Release Date'] = earliest_date.strftime('%B %d, %Y')
      except:
          pass
  
  # Remove the original release fields since we now have Release Date
  movie_details.pop('Domestic Releases', None)
  movie_details.pop('International Releases', None)
  
  return movie_details

In [18]:
def scrape_movie_details(url):
  """
  Scrape the Movie Details section from the-numbers.com
  Returns a dictionary with the structured data
  """
  header = random.choice(HEADERS_LIST)  # Use choice() not choices()
  
  s = requests.Session()
  r = s.get(url, headers=header, timeout=15)
  soup = BeautifulSoup(r.text, 'html.parser')
  
  # Use the shared parsing logic
  return parse_movie_details(soup, url)

### New: Async webscraping - 1s/movie

In [19]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from tqdm.asyncio import tqdm

max_concurrency = 10
sem = asyncio.Semaphore(max_concurrency)
timeout_urls = []

async def scrape_movie_details_async(session, header, url):
  """
  Async scraping of movie details.
  """
  async with sem:
    try:
      async with session.get(url, headers=header, timeout=25) as r:
        status = r.status
        if status != 200:
          print(f"HTTP {status} for {url}")
          return None
        html = await r.text()
        soup = BeautifulSoup(html, 'html.parser')
        
        # Use the shared parsing logic
        data = parse_movie_details(soup, url)
        # Treat dicts with only 'link' as invalid
        if isinstance(data, dict) and len(data) <= 1:
          print(f"Parsed no data for {url}")
          return None
        return data
    
    except asyncio.TimeoutError:
        print(f"Timeout error for {url}")
        return None
    except Exception as e:
        print(f"Error for {url}: {e}")
        return None

async def scrape_batch(session, header, urls):
  """Async scraping, continued - callable function."""
  if session is None:
    async with aiohttp.ClientSession() as session:
      tasks = [scrape_movie_details_async(session, header, url) for url in urls]
      return await tqdm.gather(*tasks)
  else:
    tasks = [scrape_movie_details_async(session, header, url) for url in urls]
    return await tqdm.gather(*tasks)

In [20]:
async def process_batch_with_error_handling(session, header, urls, batch_num):
  """Process one batch with error handling"""
  try:
    print(f"========== PROCESSING BATCH {batch_num:02d} ({len(urls)} URLs)... ==========")

    results = await scrape_batch(session, header, urls)
    
    # Keep only non-empty dicts with more than just the link
    valid_results = []
    for r in results:
      if isinstance(r, dict) and len(r) > 1:
        valid_results.append(r)
    
    if valid_results:
      # Convert to DataFrame
      df = pd.DataFrame(valid_results)
      
      # Save to CSV
      filename = f'Movie Details/movie_details_{batch_num:02d}.csv'
      df.to_csv(filename, index=False)
      print(f"‚úÖ Batch {batch_num:02d} completed: {len(valid_results)} movies saved to {filename}")
      return len(valid_results)
    else:
      print(f"‚ö†Ô∏è Batch {batch_num:02d} completed but no valid data")
      return None
            
  except Exception as e:
    print(f"‚ùå Error in batch {batch_num:02d}: {e}")
    return None

async def process_all_batches(urls, start_at=0):
  """Process all batches with error handling.
  Accepts:
    - string URL ‚Üí one batch with one URL
    - list[str]  ‚Üí one batch with many URLs
    - list[list[str]] ‚Üí multiple batches (original behavior)
  """
  # Normalize input into list of batches (list[list[str]])
  if isinstance(urls, str):
    batches = [[urls]]
  elif isinstance(urls, list):
    if len(urls) == 0:
      batches = []
    elif all(isinstance(u, str) for u in urls):
      batches = [urls]
    else:
      batches = urls
  else:
    batches = []
  
  # Clamp start_at
  if start_at is None or not isinstance(start_at, int):
    start_at = 0
  if start_at < 0:
    start_at = 0
  if 90 > start_at >= len(batches) > 0:
    start_at = len(batches) - 1
  if start_at == 90:
    start_at = 90
  
  total_processed = 0

  async with aiohttp.ClientSession() as session:
    batch_num = start_at
    for batch_urls in batches:
      batch_num += 1
      i = (batch_num - 1) // 2 % len(HEADERS_LIST)
      header = HEADERS_LIST[i]
      processed_count = await process_batch_with_error_handling(session, header, batch_urls, batch_num)
      total_processed += processed_count
      
      # Small delay between batches to be nice to the server
      await asyncio.sleep(1)
  
  print(f"\nüéâ All batches completed! Total movies processed: {total_processed}")

In [21]:
import pandas as pd
df_WW_all = pd.read_csv("WW_all.csv")
all_movie_details = []
links = [df_WW_all['link'].tolist()[x:x+200] for x in range(0, len(df_WW_all), 200)]

In [22]:
links_to_scrape = links[24]
isinstance(links_to_scrape, list)

True

In [23]:
# scrape_movie_details(links_to_scrape)

In [24]:
await process_all_batches(urls=links_to_scrape, start_at=98)



 40%|‚ñà‚ñà‚ñà‚ñà      | 16/40 [00:29<00:48,  2.00s/it]

Timeout error for https://www.the-numbers.com/movie/Grand-Prix-of-Europe-(2025-Germany)#tab=summary


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 19/40 [00:34<00:34,  1.63s/it]

Timeout error for https://www.the-numbers.com/movie/Chosen-The-Last-Supper-Part-2-(2025)#tab=summary


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 21/40 [00:36<00:22,  1.20s/it]

Timeout error for https://www.the-numbers.com/movie/Hi-Five-(2025-South-Korea)#tab=summary


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 23/40 [00:45<00:45,  2.70s/it]

Timeout error for https://www.the-numbers.com/movie/Death-of-a-Unicorn-(2025)#tab=summary


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 24/40 [00:46<00:35,  2.19s/it]

Timeout error for https://www.the-numbers.com/movie/Presence-(2025)#tab=summary


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 25/40 [00:51<00:45,  3.03s/it]

Timeout error for https://www.the-numbers.com/movie/Padre-No-Hay-Mas-Que-Uno-5-(2025-Spain)#tab=summary


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 27/40 [00:57<00:38,  2.99s/it]

Timeout error for https://www.the-numbers.com/movie/You-Are-The-Best-(2025-China)#tab=summary


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 28/40 [00:59<00:32,  2.69s/it]

Timeout error for https://www.the-numbers.com/movie/Na-derevnyu-dedushke-(2025-Russia)#tab=summary


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 29/40 [01:00<00:24,  2.18s/it]

Timeout error for https://www.the-numbers.com/movie/Colorful-Stage-The-Movie-A-Miku-Who-Cant-Sing-(2025-Japan)#tab=summary


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 30/40 [01:01<00:18,  1.83s/it]

Timeout error for https://www.the-numbers.com/movie/Chosen-The-Last-Supper-Part-2-(2025)#tab=summary


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 31/40 [01:02<00:14,  1.58s/it]

Timeout error for https://www.the-numbers.com/movie/Three-Kingdoms-Starlit-Heroes-(2025-China)#tab=summary


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 32/40 [01:08<00:23,  2.90s/it]

Timeout error for https://www.the-numbers.com/movie/Cang-Mang-De-Tian-Ya-Shi-Wo-De-Ai-(2025-China)#tab=summary


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 33/40 [01:11<00:20,  2.93s/it]

Timeout error for https://www.the-numbers.com/movie/Io-Sono-La-Fine-Del-Mondo-(2025-Italy)#tab=summary


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 34/40 [01:12<00:14,  2.35s/it]

Timeout error for https://www.the-numbers.com/movie/Friendship-(2025)#tab=summary


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 35/40 [01:17<00:15,  3.15s/it]

Timeout error for https://www.the-numbers.com/movie/Chosen-The-Last-Supper-Part-3-(2025)#tab=summary


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 36/40 [01:20<00:12,  3.10s/it]

Timeout error for https://www.the-numbers.com/movie/Last-Rodeo-The-(2025)#tab=summary


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 37/40 [01:23<00:09,  3.07s/it]

Timeout error for https://www.the-numbers.com/movie/Hotline-Beijing-(2025-China)#tab=summary


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 38/40 [01:25<00:05,  2.75s/it]

Timeout error for https://www.the-numbers.com/movie/Red-Silk-(2025-Russia)#tab=summary


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 39/40 [01:26<00:02,  2.23s/it]

Timeout error for https://www.the-numbers.com/movie/Dracula-A-Love-Tale-(2025-France)#tab=summary


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40/40 [01:27<00:00,  2.18s/it]

Timeout error for https://www.the-numbers.com/movie/Eddington-(2025)#tab=summary
‚úÖ Batch 99 completed: 20 movies saved to Movie Details/movie_details_99.csv






üéâ All batches completed! Total movies processed: 20


## Concatenate to one dataframe & Export to csv 

In [25]:
import glob

def export_all_movie_details():
  all_movie_details = [pd.read_csv(file) for file in glob.glob('Movie Details/*.csv')]
  global movie_details_df
  # Concatenate all series to a dataframe
  if all_movie_details:
    movie_details_df = pd.concat(all_movie_details, axis=0, ignore_index=True)
    move_col = movie_details_df.pop('Release Date')
    movie_details_df.insert(1,'Release Date', move_col)
    
    print("DataFrame shape:", movie_details_df.shape)
    print("\nDataFrame columns:", movie_details_df.columns.tolist())
    print("\nDataFrame content:")
  else:
    movie_details_df = pd.DataFrame()

  movie_details_df.to_csv('movie_details.csv',index=False)

In [26]:
export_all_movie_details()

DataFrame shape: (5180, 12)

DataFrame columns: ['link', 'Release Date', 'Production Budget', 'MPAA Rating', 'Running Time', 'Franchise', 'Genre', 'Production Method', 'Creative Type', 'Production/Financing Companies', 'Production Countries', 'Languages']

DataFrame content:


### Re-run timeouts & append to final dataframe:

In [None]:
import pandas as pd
df_WW_all = pd.read_csv("WW_all.csv")
movie_details_df = pd.read_csv("movie_details.csv").dropna(subset=['link'])
# Find differences

async def retry_timeouts(retry_start=91):
  timeouts_df = df_WW_all[~df_WW_all['link'].isin(movie_details_df['link'])]
  timeout_urls = [timeouts_df['link'].tolist()[x:x+400] for x in range(0, len(timeouts_df), 400)]
  if not timeout_urls:
    print("No more timeouts :D")
  else:
    print(f"Timeouts: {timeouts_df.shape[0]} missing movies.")
    print(f"Retrying timed out URLs...")
    await process_all_batches(urls=timeout_urls, start_at=retry_start)
    export_all_movie_details()
    await retry_timeouts(retry_start + 1)

await retry_timeouts()

## Merge to final `df_WW_all`

In [None]:
import pandas as pd
import numpy as np
import re

df_WW_all = pd.read_csv("WW_all.csv")
movie_details_df = pd.read_csv("movie_details.csv").dropna(subset=['link'])
# MERGE
df = pd.merge(df_WW_all, movie_details_df, on='link', how='right').sort_values(by=['Year Recorded','Rank'])

# CLEAN MPAA RATING - extract only the rating (PG, PG-13, R, G, NC-17)
if 'MPAA Rating' in df.columns:
    allowed_ratings = ['PG-13', 'NC-17', 'PG', 'R', 'G']  # Order matters: check longer ones first
    def extract_rating(x):
        if pd.isna(x):
            return x
        x_str = str(x).upper()
        for rating in allowed_ratings:
            if rating in x_str:
                return rating
        return None
    df['MPAA Rating'] = df['MPAA Rating'].apply(extract_rating)

# COMBINE 2 COLUMNS
df["Distributor_y"] = np.where(
    df["Distributor"].isna() | (df["Distributor"] == ""),  # A2 = ""
    df["Production/Financing Companies"].apply(
        lambda x: x.split(",")[0].strip() if isinstance(x, str) and "," in x else np.nan
    ),
    np.nan
)
df["Distributor_y"] = df["Distributor_y"].fillna(df["Distributor"])
df = df.drop(columns=['Distributor'])

# RENAME
df = df.rename(columns={'Distributor_y': 'Distributor',
                      'Genre_y': 'Genre',
                      'Release Date_y': 'Release Date'})

# REORDER
df = df[['link', 'Year Recorded', 'Rank', 'Movie', 'Worldwide Box Office', 'Domestic Box Office', 'International Box Office', 'Domestic Share', 'Distributor', 'Production Budget', 'Running Time', 'Genre', 'Production Method', 'Creative Type', 'MPAA Rating', 'Franchise', 'Production Countries', 'Release Date']].drop_duplicates(subset=['Movie'])
df.to_csv('WW_all_new.csv', index=False)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4784 entries, 0 to 4946
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   link                      4784 non-null   object
 1   Year Recorded             4784 non-null   int64 
 2   Rank                      4784 non-null   int64 
 3   Movie                     4784 non-null   object
 4   Worldwide Box Office      4784 non-null   object
 5   Domestic Box Office       3462 non-null   object
 6   International Box Office  4744 non-null   object
 7   Domestic Share            3462 non-null   object
 8   Distributor               3300 non-null   object
 9   Production Budget         2713 non-null   object
 10  Running Time              4476 non-null   object
 11  Genre                     4701 non-null   object
 12  Production Method         4686 non-null   object
 13  Creative Type             4637 non-null   object
 14  MPAA Rating               357