# Webcrawling process (cont') - đang bỏ dở, có thể không làm

In [19]:
import pandas as pd
import requests
import warnings
# warnings.filterwarnings("ignore")

## Get Movie Details of each film
This is the hardest part, not only does it takes time but there is also a risk of being temporarily/permanently blocked by the site (Error 403 Forbidden)...

### List of browers to rotate

In [99]:
HEADERS_LIST = [
  {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "X-Requested-With": "XMLHttpRequest"
  },
  {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:110.0) Gecko/20100101 Firefox/110.0",
    "X-Requested-With": "XMLHttpRequest"
  },
  {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
  },
  {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50",
    "X-Requested-With": "XMLHttpRequest"
  },
]

### Old: Parallel Webscraping - 20s/movie

In [21]:
from bs4 import BeautifulSoup
import re
from datetime import datetime
import random

def parse_movie_details(soup, url):
  """
  Shared parsing logic for movie details from BeautifulSoup object.
  Returns a dictionary with the structured data.
  """
  # Find the Movie Details section
  movie_details = {'link': url}
  
  # Get table #1 (index 1) - the metrics table
  production_budget = ''
  all_tables = soup.find_all('table', limit=4)
  metrics_table = all_tables[1]
  
  # Search all rows in table #1 for "Production Budget"
  rows = metrics_table.find_all('tr')
  for row in rows:
    cells = row.find_all(['td', 'th'])
    for cell in (cells):
      text = cell.get_text(strip=True)
      # Check if this cell contains "Production Budget"
      if 'production budget' in text.lower():
        production_budget = re.search(r'\$?([\d,]+)(?![\d,])', text).group(0)
        movie_details['Production Budget'] = production_budget
  
  # Look for the table with Movie Details
  details_table = all_tables[3]
  
  if details_table:
    rows = details_table.find_all('tr')
    for row in rows:
      cells = row.find_all(['td', 'th'])
      if len(cells) >= 2:
          key = cells[0].get_text(strip=True).replace('\xa0', ' ')
          value = cells[1].get_text(strip=True).replace('\xa0', ' ')
          
          # Clean up the key (remove colons and extra spaces)
          key = key.replace(':', '').strip()
          
          # Skip unwanted fields completely
          if key in ['Video Release', 'MPAA Rating', 'Franchise', 'Comparisons', 'Keywords', 'Source', 'Creative Type', 'Languages']:
              continue
          
          # Handle Production Countries and Languages separately
          if key == 'Production Countries':
              # Check if Languages data is mixed in
              if 'Languages:' in value:
                  parts = value.split('Languages:')
                  movie_details['Production Countries'] = parts[0].strip()
                  if len(parts) > 1:
                      movie_details['Languages'] = parts[1].strip()
              else:
                  movie_details['Production Countries'] = value
          elif key == 'Languages':
              movie_details['Languages'] = value
          else:
              # Store all other fields
              movie_details[key] = value
  
  # Extract earliest release date from Domestic and International releases
  release_dates = []
  
  # Extract dates from Domestic Releases
  if 'Domestic Releases' in movie_details:
      domestic_text = movie_details['Domestic Releases']
      # Look for date patterns like "February 14th, 2025"
      domestic_dates = re.findall(r'([A-Za-z]+ \d{1,2}(?:st|nd|rd|th)?, \d{4})', domestic_text)
      release_dates.extend(domestic_dates)
  
  # Extract dates from International Releases
  if 'International Releases' in movie_details:
      intl_text = movie_details['International Releases']
      # Look for date patterns like "January 29th, 2025"
      intl_dates = re.findall(r'([A-Za-z]+ \d{1,2}(?:st|nd|rd|th)?, \d{4})', intl_text)
      release_dates.extend(intl_dates)
  
  # Find the earliest date
  if release_dates:
      try:
          # Convert dates to datetime objects for comparison
          parsed_dates = []
          for date_str in release_dates:
              try:
                  # Handle ordinal suffixes (st, nd, rd, th)
                  clean_date = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
                  parsed_date = datetime.strptime(clean_date, '%B %d, %Y')
                  parsed_dates.append(parsed_date)
              except:
                  continue
          
          if parsed_dates:
              earliest_date = min(parsed_dates)
              movie_details['Release Date'] = earliest_date.strftime('%B %d, %Y')
      except:
          pass
  
  # Remove the original release fields since we now have Release Date
  movie_details.pop('Domestic Releases', None)
  movie_details.pop('International Releases', None)
  
  return movie_details

In [22]:
def scrape_movie_details(url):
  """
  Scrape the Movie Details section from the-numbers.com
  Returns a dictionary with the structured data
  """
  header = random.choice(HEADERS_LIST)  # Use choice() not choices()
  
  s = requests.Session()
  r = s.get(url, headers=header, timeout=15)
  soup = BeautifulSoup(r.text, 'html.parser')
  
  # Use the shared parsing logic
  return parse_movie_details(soup, url)

### New: Async webscraping - 1s/movie

In [23]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from tqdm.asyncio import tqdm

max_concurrency = 10
sem = asyncio.Semaphore(max_concurrency)
timeout_urls = []

async def scrape_movie_details_async(session, header, url):
  """
  Async scraping of movie details.
  """
  async with sem:
    try:
      async with session.get(url, headers=header, timeout=25) as r:
        status = r.status
        if status != 200:
          print(f"HTTP {status} for {url}")
          return None
        html = await r.text()
        soup = BeautifulSoup(html, 'html.parser')
        
        # Use the shared parsing logic
        data = parse_movie_details(soup, url)
        # Treat dicts with only 'link' as invalid
        if isinstance(data, dict) and len(data) <= 1:
          print(f"Parsed no data for {url}")
          return None
        return data
    
    except asyncio.TimeoutError:
        print(f"Timeout error for {url}")
        return None
    except Exception as e:
        print(f"Error for {url}: {e}")
        return None

async def scrape_batch(session, header, urls):
  """Async scraping, continued - callable function."""
  if session is None:
    async with aiohttp.ClientSession() as session:
      tasks = [scrape_movie_details_async(session, header, url) for url in urls]
      return await tqdm.gather(*tasks)
  else:
    tasks = [scrape_movie_details_async(session, header, url) for url in urls]
    return await tqdm.gather(*tasks)

In [94]:
async def process_batch_with_error_handling(session, header, urls, batch_num):
  """Process one batch with error handling"""
  try:
    print(f"========== PROCESSING BATCH {batch_num:02d} ({len(urls)} URLs)... ==========")

    results = await scrape_batch(session, header, urls)
    
    # Keep only non-empty dicts with more than just the link
    valid_results = []
    for r in results:
      if isinstance(r, dict) and len(r) > 1:
        valid_results.append(r)
    
    if valid_results:
      # Convert to DataFrame
      df = pd.DataFrame(valid_results)
      
      # Save to CSV
      filename = f'Movie Details/movie_details_{batch_num:02d}.csv'
      df.to_csv(filename, index=False)
      print(f"✅ Batch {batch_num:02d} completed: {len(valid_results)} movies saved to {filename}")
      return len(valid_results)
    else:
      print(f"⚠️ Batch {batch_num:02d} completed but no valid data")
      return None
            
  except Exception as e:
    print(f"❌ Error in batch {batch_num:02d}: {e}")
    return None

async def process_all_batches(urls, start_at=0):
  """Process all batches with error handling.
  Accepts:
    - string URL → one batch with one URL
    - list[str]  → one batch with many URLs
    - list[list[str]] → multiple batches (original behavior)
  """
  # Normalize input into list of batches (list[list[str]])
  if isinstance(urls, str):
    batches = [[urls]]
  elif isinstance(urls, list):
    if len(urls) == 0:
      batches = []
    elif all(isinstance(u, str) for u in urls):
      batches = [urls]
    else:
      batches = urls
  else:
    batches = []
  
  # Clamp start_at
  if start_at is None or not isinstance(start_at, int):
    start_at = 0
  if start_at < 0:
    start_at = 0
  if 90 > start_at >= len(batches) > 0:
    start_at = len(batches) - 1
  if start_at == 90:
    start_at = 90
  
  total_processed = 0

  async with aiohttp.ClientSession() as session:
    batch_num = start_at
    for batch_urls in batches:
      batch_num += 1
      i = (batch_num - 1) // 2 % len(HEADERS_LIST)
      header = HEADERS_LIST[i]
      processed_count = await process_batch_with_error_handling(session, header, batch_urls, batch_num)
      total_processed += processed_count
      
      # Small delay between batches to be nice to the server
      await asyncio.sleep(1)
  
  print(f"\n🎉 All batches completed! Total movies processed: {total_processed}")

In [59]:
import pandas as pd
df_WW_all = pd.read_csv("WW_all.csv")
all_movie_details = []
links = [df_WW_all['link'].tolist()[x:x+200] for x in range(0, len(df_WW_all), 200)]

In [62]:
links_to_scrape = links
isinstance(links_to_scrape, list)

True

In [27]:
# scrape_movie_details(links_to_scrape)

In [63]:
await process_all_batches(urls=links_to_scrape)



100%|██████████| 200/200 [01:55<00:00,  1.73it/s]


✅ Batch 01 completed: 200 movies saved to Movie Details/movie_details_01.csv


100%|██████████| 200/200 [00:54<00:00,  3.68it/s]


✅ Batch 02 completed: 200 movies saved to Movie Details/movie_details_02.csv


100%|██████████| 200/200 [01:24<00:00,  2.37it/s]


✅ Batch 03 completed: 200 movies saved to Movie Details/movie_details_03.csv


100%|██████████| 200/200 [01:57<00:00,  1.70it/s]


✅ Batch 04 completed: 200 movies saved to Movie Details/movie_details_04.csv


 22%|██▎       | 45/200 [00:41<02:44,  1.06s/it]

Timeout error for https://www.the-numbers.com/movie/Dasavatharam#tab=summary
Timeout error for https://www.the-numbers.com/movie/Public-Enemies-(2009)#tab=summary


 24%|██▎       | 47/200 [00:42<02:02,  1.25it/s]

Timeout error for https://www.the-numbers.com/movie/Did-You-Hear-About-the-Morgans#tab=summary


 24%|██▍       | 49/200 [00:45<02:43,  1.08s/it]

Timeout error for https://www.the-numbers.com/movie/Transformers-Revenge-of-the-Fallen#tab=summary


 30%|██▉       | 59/200 [01:01<02:47,  1.18s/it]

Timeout error for https://www.the-numbers.com/movie/Crank-2-High-Voltage#tab=summary


 32%|███▏      | 64/200 [01:06<01:50,  1.23it/s]

Timeout error for https://www.the-numbers.com/movie/Its-Complicated#tab=summary


 36%|███▋      | 73/200 [01:24<04:20,  2.05s/it]

Timeout error for https://www.the-numbers.com/movie/Dolphins-and-Whales-Tribes-of-the-Ocean-3D#tab=summary


 42%|████▎     | 85/200 [01:36<02:02,  1.07s/it]

Timeout error for https://www.the-numbers.com/movie/In-the-Name-of-the-King-A-Dungeon-Siege-Tale#tab=summary


 48%|████▊     | 97/200 [01:49<01:17,  1.32it/s]

Timeout error for https://www.the-numbers.com/movie/Cash-(2008)#tab=summary


 50%|█████     | 100/200 [01:52<01:04,  1.55it/s]

Timeout error for https://www.the-numbers.com/movie/Bikur-Ha-Tizmoret-(2008)#tab=summary


 56%|█████▋    | 113/200 [02:08<01:24,  1.03it/s]

Timeout error for https://www.the-numbers.com/movie/Avatar#tab=summary


100%|██████████| 200/200 [02:27<00:00,  1.35it/s]


✅ Batch 05 completed: 189 movies saved to Movie Details/movie_details_05.csv


100%|██████████| 200/200 [01:20<00:00,  2.49it/s]


✅ Batch 06 completed: 200 movies saved to Movie Details/movie_details_06.csv


100%|██████████| 200/200 [01:42<00:00,  1.95it/s]


✅ Batch 07 completed: 200 movies saved to Movie Details/movie_details_07.csv


 57%|█████▊    | 115/200 [01:25<01:12,  1.17it/s]

Timeout error for https://www.the-numbers.com/movie/Prometheus#tab=summary


 72%|███████▏  | 143/200 [01:48<00:40,  1.40it/s]

Timeout error for https://www.the-numbers.com/movie/Straw-Dogs#tab=summary


100%|██████████| 200/200 [02:30<00:00,  1.33it/s]


✅ Batch 08 completed: 198 movies saved to Movie Details/movie_details_08.csv


100%|██████████| 200/200 [01:17<00:00,  2.59it/s]


✅ Batch 09 completed: 200 movies saved to Movie Details/movie_details_09.csv


100%|██████████| 200/200 [01:37<00:00,  2.05it/s]


✅ Batch 10 completed: 200 movies saved to Movie Details/movie_details_10.csv


 32%|███▏      | 63/200 [01:00<03:18,  1.45s/it]

Timeout error for https://www.the-numbers.com/movie/Million-Ways-to-Die-in-the-West-A#tab=summary


 32%|███▎      | 65/200 [01:02<02:35,  1.15s/it]

Timeout error for https://www.the-numbers.com/movie/Xi-you-ji-Da-nao-tian-gong#tab=summary


 36%|███▌      | 72/200 [01:11<02:36,  1.22s/it]

Timeout error for https://www.the-numbers.com/movie/John-Wick#tab=summary


 41%|████      | 82/200 [01:21<01:41,  1.17it/s]

Timeout error for https://www.the-numbers.com/movie/Asterix-Le-Domaine-des-dieux-(2014-France)#tab=summary


 56%|█████▌    | 112/200 [01:53<01:07,  1.30it/s]

Timeout error for https://www.the-numbers.com/movie/Other-Woman-The-(2014)#tab=summary


 57%|█████▋    | 114/200 [01:55<01:21,  1.05it/s]

Timeout error for https://www.the-numbers.com/movie/Seventh-Son#tab=summary


 62%|██████▏   | 124/200 [02:12<01:23,  1.09s/it]

Timeout error for https://www.the-numbers.com/movie/Fault-in-Our-Stars-The#tab=summary


 66%|██████▌   | 132/200 [02:16<00:35,  1.94it/s]

Timeout error for https://www.the-numbers.com/movie/Water-Diviner-The#tab=summary


100%|██████████| 200/200 [03:02<00:00,  1.10it/s]


✅ Batch 11 completed: 192 movies saved to Movie Details/movie_details_11.csv


 19%|█▉        | 38/200 [00:31<02:12,  1.22it/s]

Timeout error for https://www.the-numbers.com/movie/Himalaya-(2015-South-Korea)#tab=summary


 43%|████▎     | 86/200 [01:05<01:07,  1.68it/s]

Timeout error for https://www.the-numbers.com/movie/Island-of-Lemurs-Madagascar#tab=summary


100%|██████████| 200/200 [01:57<00:00,  1.71it/s]


✅ Batch 12 completed: 198 movies saved to Movie Details/movie_details_12.csv


100%|██████████| 200/200 [01:10<00:00,  2.85it/s]


✅ Batch 13 completed: 200 movies saved to Movie Details/movie_details_13.csv


100%|██████████| 200/200 [01:56<00:00,  1.72it/s]


✅ Batch 14 completed: 200 movies saved to Movie Details/movie_details_14.csv


 24%|██▎       | 47/200 [00:47<04:35,  1.80s/it]

Timeout error for https://www.the-numbers.com/movie/Ghost-in-the-Shell#tab=summary


 24%|██▍       | 48/200 [00:50<05:28,  2.16s/it]

Timeout error for https://www.the-numbers.com/movie/Beauty-and-the-Beast-(2017)#tab=summary


 24%|██▍       | 49/200 [00:58<09:08,  3.63s/it]

Timeout error for https://www.the-numbers.com/movie/Lady-Bird-(2017)#tab=summary


 26%|██▌       | 51/200 [00:58<05:23,  2.17s/it]

Timeout error for https://www.the-numbers.com/movie/Hello-My-Name-is-Doris#tab=summary


 26%|██▌       | 52/200 [00:59<04:38,  1.88s/it]

Timeout error for https://www.the-numbers.com/movie/Queen-of-Katwe#tab=summary


 26%|██▋       | 53/200 [01:03<05:57,  2.43s/it]

Timeout error for https://www.the-numbers.com/movie/Ozzy-(Spain)#tab=summary


 28%|██▊       | 55/200 [01:06<04:33,  1.88s/it]

Timeout error for https://www.the-numbers.com/movie/It-(2017)#tab=summary


 28%|██▊       | 57/200 [01:11<04:57,  2.08s/it]

Timeout error for https://www.the-numbers.com/movie/Power-Rangers-(2017)#tab=summary


 30%|██▉       | 59/200 [01:16<05:10,  2.20s/it]

Timeout error for https://www.the-numbers.com/movie/Ratchet-and-Clank#tab=summary


 30%|███       | 61/200 [01:23<06:41,  2.89s/it]

Timeout error for https://www.the-numbers.com/movie/Coco-(2017)#tab=summary


 31%|███       | 62/200 [01:25<05:27,  2.37s/it]

Timeout error for https://www.the-numbers.com/movie/Loving-(2016)#tab=summary


 32%|███▎      | 65/200 [01:28<03:56,  1.75s/it]

Timeout error for https://www.the-numbers.com/movie/Fate-of-the-Furious-The#tab=summary


 34%|███▍      | 68/200 [01:31<02:49,  1.29s/it]

Timeout error for https://www.the-numbers.com/movie/Shack-The#tab=summary
Timeout error for https://www.the-numbers.com/movie/Jigsaw-(2017)#tab=summary


 36%|███▌      | 71/200 [01:36<03:22,  1.57s/it]

Timeout error for https://www.the-numbers.com/movie/Founding-of-an-Army-The-(2017-China)#tab=summary


 37%|███▋      | 74/200 [01:42<03:24,  1.62s/it]

Timeout error for https://www.the-numbers.com/movie/Shut-In#tab=summary


 38%|███▊      | 76/200 [01:48<04:48,  2.33s/it]

Timeout error for https://www.the-numbers.com/movie/Smurfs-The-Lost-Village#tab=summary


 39%|███▉      | 78/200 [01:50<03:27,  1.70s/it]

Timeout error for https://www.the-numbers.com/movie/Fifty-Shades-Darker#tab=summary


 41%|████      | 82/200 [01:57<04:10,  2.13s/it]

Timeout error for https://www.the-numbers.com/movie/Other-Side-of-the-Door-The#tab=summary


 42%|████▏     | 84/200 [02:01<03:42,  1.92s/it]

Timeout error for https://www.the-numbers.com/movie/Going-in-Style-(2017)#tab=summary


 42%|████▎     | 85/200 [02:05<04:48,  2.51s/it]

Timeout error for https://www.the-numbers.com/movie/Suddenly-Seventeen-(China)#tab=summary


 43%|████▎     | 86/200 [02:07<04:28,  2.36s/it]

Timeout error for https://www.the-numbers.com/movie/My-Little-Pony-The-Movie-(2017)#tab=summary


 44%|████▍     | 88/200 [02:14<05:17,  2.83s/it]

Timeout error for https://www.the-numbers.com/movie/Tri-bogatyrya-i-Morskoy-tsar-(Russia)#tab=summary


 44%|████▍     | 89/200 [02:15<04:14,  2.29s/it]

Timeout error for https://www.the-numbers.com/movie/Mollys-Game#tab=summary


 46%|████▌     | 91/200 [02:16<02:31,  1.39s/it]

Timeout error for https://www.the-numbers.com/movie/Dads-Army#tab=summary


 46%|████▌     | 92/200 [02:17<02:17,  1.27s/it]

Timeout error for https://www.the-numbers.com/movie/Pirates-of-the-Caribbean-Dead-Men-Tell-No-Tales#tab=summary


 47%|████▋     | 94/200 [02:22<03:35,  2.03s/it]

Timeout error for https://www.the-numbers.com/movie/Hologram-for-the-King-A#tab=summary


 48%|████▊     | 97/200 [02:27<02:57,  1.73s/it]

Timeout error for https://www.the-numbers.com/movie/Rudorufu-to-Ippaiattena-(Japan)#tab=summary


 49%|████▉     | 98/200 [02:31<04:04,  2.40s/it]

Timeout error for https://www.the-numbers.com/movie/Hillarys-America-The-Secret-History-of-the-Democratic-Party#tab=summary


 50%|████▉     | 99/200 [02:32<03:20,  1.98s/it]

Timeout error for https://www.the-numbers.com/movie/Geostorm#tab=summary


 50%|█████     | 100/200 [02:37<04:48,  2.89s/it]

Timeout error for https://www.the-numbers.com/movie/Sha-Po-Lang-Tan-Lang-(2017-Hong-Kong)#tab=summary


 50%|█████     | 101/200 [02:41<05:19,  3.22s/it]

Timeout error for https://www.the-numbers.com/movie/Kong-Skull-Island#tab=summary


 52%|█████▏    | 103/200 [02:42<02:54,  1.80s/it]

Timeout error for https://www.the-numbers.com/movie/Happy-Death-Day#tab=summary


 52%|█████▏    | 104/200 [02:44<02:58,  1.86s/it]

Timeout error for https://www.the-numbers.com/movie/Bong-i-Kim-seon-dal-(korea-2016)#tab=summary


 53%|█████▎    | 106/200 [02:48<02:47,  1.78s/it]

Timeout error for https://www.the-numbers.com/movie/Ferdinand#tab=summary


 54%|█████▎    | 107/200 [02:49<02:23,  1.54s/it]

Timeout error for https://www.the-numbers.com/movie/American-Made#tab=summary


 55%|█████▌    | 110/200 [02:58<03:11,  2.13s/it]

Timeout error for https://www.the-numbers.com/movie/Boku-wa-Ashita-Kinou-no-Kimi-to-Deto-Suru-(Japan)#tab=summary


 56%|█████▌    | 112/200 [03:02<03:04,  2.10s/it]

Timeout error for https://www.the-numbers.com/movie/Gong-fu-yu-jia-(China)#tab=summary


 57%|█████▊    | 115/200 [03:07<02:08,  1.51s/it]

Timeout error for https://www.the-numbers.com/movie/Chocolat-(2015)#tab=summary


 60%|█████▉    | 119/200 [03:13<02:08,  1.59s/it]

Timeout error for https://www.the-numbers.com/movie/Friend-Request-(Germany)#tab=summary


 60%|██████    | 121/200 [03:14<01:21,  1.03s/it]

Timeout error for https://www.the-numbers.com/movie/Annabelle-Creation#tab=summary


 66%|██████▌   | 131/200 [03:27<01:26,  1.26s/it]

Timeout error for https://www.the-numbers.com/movie/Neerja#tab=summary


 68%|██████▊   | 135/200 [03:32<01:13,  1.13s/it]

Timeout error for https://www.the-numbers.com/movie/Eiga-Doraemon-Nobita-no-nankyoku-kachikochi-daibouken-(2017-Japan)#tab=summary


 72%|███████▏  | 144/200 [03:41<00:55,  1.01it/s]

Timeout error for https://www.the-numbers.com/movie/Xi-You-Fu-Yao-Pian-(China)#tab=summary


 77%|███████▋  | 154/200 [03:52<00:52,  1.14s/it]

Timeout error for https://www.the-numbers.com/movie/Que-Culpa-Tiene-el-Nino-(Mexico)#tab=summary


 85%|████████▌ | 170/200 [04:05<00:25,  1.17it/s]

Timeout error for https://www.the-numbers.com/movie/Kurosaki-kun-no-iinari-ni-nante-naranai-(Japan)-(2016)#tab=summary


100%|██████████| 200/200 [04:25<00:00,  1.33s/it]


✅ Batch 15 completed: 154 movies saved to Movie Details/movie_details_15.csv


100%|██████████| 200/200 [00:25<00:00,  7.72it/s]


✅ Batch 16 completed: 200 movies saved to Movie Details/movie_details_16.csv


 64%|██████▎   | 127/200 [01:10<00:43,  1.68it/s]

Timeout error for https://www.the-numbers.com/movie/Second-Act-(2018)#tab=summary


100%|██████████| 200/200 [01:58<00:00,  1.68it/s]


✅ Batch 17 completed: 199 movies saved to Movie Details/movie_details_17.csv


 94%|█████████▍| 189/200 [02:06<00:07,  1.44it/s]

Timeout error for https://www.the-numbers.com/movie/Dumbo-(2019)#tab=summary


100%|██████████| 200/200 [02:12<00:00,  1.51it/s]


✅ Batch 18 completed: 199 movies saved to Movie Details/movie_details_18.csv


 22%|██▎       | 45/200 [00:54<11:06,  4.30s/it]

Timeout error for https://www.the-numbers.com/movie/Grudge-The-(2020)#tab=summary


 23%|██▎       | 46/200 [00:58<10:48,  4.21s/it]

Timeout error for https://www.the-numbers.com/movie/Svaha-The-Sixth-Finger-(2019-South-Korea)#tab=summary


 24%|██▎       | 47/200 [00:59<08:16,  3.25s/it]

Timeout error for https://www.the-numbers.com/movie/Cheonmun-Modneunda-Doors-to-Heaven-(S-Korea)#tab=summary


 24%|██▍       | 48/200 [01:01<07:17,  2.88s/it]

Timeout error for https://www.the-numbers.com/movie/Juninin-no-shinitai-kodomotachi-(Japan)-(2019)#tab=summary


 24%|██▍       | 49/200 [01:02<05:49,  2.31s/it]

Timeout error for https://www.the-numbers.com/movie/Nada-a-Perder-2-(Brazil)#tab=summary


 25%|██▌       | 50/200 [01:03<04:47,  1.92s/it]

Timeout error for https://www.the-numbers.com/movie/Countdown-(2019)#tab=summary


 26%|██▌       | 51/200 [01:04<04:04,  1.64s/it]

Timeout error for https://www.the-numbers.com/movie/Dolor-y-gloria-(Spain)-(2019)#tab=summary


 26%|██▌       | 52/200 [01:05<03:35,  1.45s/it]

Timeout error for https://www.the-numbers.com/movie/Kioku-ni-Gozaimasen-(2019-Japan)#tab=summary


 26%|██▋       | 53/200 [01:06<03:13,  1.32s/it]

Timeout error for https://www.the-numbers.com/movie/Shaft-(2019)#tab=summary


 27%|██▋       | 54/200 [01:08<03:41,  1.52s/it]

Timeout error for https://www.the-numbers.com/movie/Fable-The-(Japan)#tab=summary


 40%|███▉      | 79/200 [01:38<02:50,  1.41s/it]

Timeout error for https://www.the-numbers.com/movie/Shou-yi-ren-(2019-China)#tab=summary


 42%|████▏     | 83/200 [01:40<01:27,  1.34it/s]

Timeout error for https://www.the-numbers.com/movie/Shang-Hai-Bao-Lei-(China)#tab=summary


 51%|█████     | 102/200 [01:57<01:12,  1.36it/s]

Timeout error for https://www.the-numbers.com/movie/Charlies-Angels-(2019)#tab=summary


 66%|██████▋   | 133/200 [02:20<01:03,  1.06it/s]

Timeout error for https://www.the-numbers.com/movie/Nan-Fang-Che-Zhan-De-Ju-Hui-(2019-China)#tab=summary


100%|██████████| 200/200 [02:37<00:00,  1.27it/s]


✅ Batch 19 completed: 186 movies saved to Movie Details/movie_details_19.csv


100%|██████████| 200/200 [01:20<00:00,  2.48it/s]


✅ Batch 20 completed: 200 movies saved to Movie Details/movie_details_20.csv


 24%|██▍       | 49/200 [00:42<07:34,  3.01s/it]

Timeout error for https://www.the-numbers.com/movie/Sao-Hei-Xing-Dong-(2022-China)#tab=summary


 30%|███       | 61/200 [00:52<01:44,  1.33it/s]

Timeout error for https://www.the-numbers.com/movie/Beast-(2022)#tab=summary


 33%|███▎      | 66/200 [00:56<02:02,  1.10it/s]

Timeout error for https://www.the-numbers.com/movie/Lightyear-(2022)#tab=summary


 56%|█████▌    | 112/200 [01:32<00:33,  2.64it/s]

Timeout error for https://www.the-numbers.com/movie/Kaiserschmarrndrama-(2020-Germany)#tab=summary


 59%|█████▉    | 118/200 [01:36<00:41,  1.99it/s]

HTTP 403 for https://www.the-numbers.com/movie/Northman-The#tab=summary


100%|██████████| 200/200 [02:21<00:00,  1.42it/s]


✅ Batch 21 completed: 195 movies saved to Movie Details/movie_details_21.csv


 52%|█████▏    | 103/200 [01:02<01:21,  1.19it/s]

Timeout error for https://www.the-numbers.com/movie/Little-Mermaid-The-(2023)#tab=summary


100%|██████████| 200/200 [02:03<00:00,  1.63it/s]


✅ Batch 22 completed: 199 movies saved to Movie Details/movie_details_22.csv


 30%|██▉       | 59/200 [00:40<01:10,  1.99it/s]

Timeout error for https://www.the-numbers.com/movie/Alien-Romulus-(2024)#tab=summary


 82%|████████▎ | 165/200 [01:55<00:21,  1.64it/s]

Timeout error for https://www.the-numbers.com/movie/Inside-Out-2-(2024)#tab=summary


 96%|█████████▌| 192/200 [02:21<00:06,  1.18it/s]

Timeout error for https://www.the-numbers.com/movie/Pig-the-Snake-and-the-Pigeon-The-(2024-Taiwan)#tab=summary


100%|██████████| 200/200 [02:29<00:00,  1.34it/s]


✅ Batch 23 completed: 197 movies saved to Movie Details/movie_details_23.csv


  7%|▋         | 14/200 [00:25<07:59,  2.58s/it]

Timeout error for https://www.the-numbers.com/movie/Frozen-Rage-A-(2024-China)#tab=summary


  8%|▊         | 17/200 [00:30<05:55,  1.94s/it]

Timeout error for https://www.the-numbers.com/movie/All-About-Suomi-(2024-Japan)#tab=summary
Timeout error for https://www.the-numbers.com/movie/Strangers-The-Chapter-2-(2025)#tab=summary


 12%|█▎        | 25/200 [00:41<04:59,  1.71s/it]

Timeout error for https://www.the-numbers.com/movie/Love-Hurts-(2025)#tab=summary


 14%|█▍        | 29/200 [00:44<02:53,  1.01s/it]

Timeout error for https://www.the-numbers.com/movie/Legends-of-the-Condor-Heroes-The-Gallants-(2025-China)#tab=summary


 15%|█▌        | 30/200 [00:48<05:22,  1.90s/it]

Timeout error for https://www.the-numbers.com/movie/Chainsaw-Man-The-Movie-Reze-Arc-(2025-Japan)#tab=summary


 18%|█▊        | 36/200 [01:01<08:01,  2.93s/it]

Timeout error for https://www.the-numbers.com/movie/Ognivo-(2024-Russia)#tab=summary


 18%|█▊        | 37/200 [01:02<06:23,  2.35s/it]

Timeout error for https://www.the-numbers.com/movie/Chosen-The-Last-Supper-Part-1-(2025)#tab=summary


 19%|█▉        | 38/200 [01:07<08:29,  3.15s/it]

Timeout error for https://www.the-numbers.com/movie/Fantastic-Four-The-First-Steps-(2025)#tab=summary


 20%|█▉        | 39/200 [01:09<07:30,  2.80s/it]

Timeout error for https://www.the-numbers.com/movie/Row-to-Win-(2025-China)#tab=summary


 20%|██        | 40/200 [01:10<06:02,  2.26s/it]

Timeout error for https://www.the-numbers.com/movie/Mumu-(2025-China)#tab=summary


 20%|██        | 41/200 [01:13<06:35,  2.49s/it]

Timeout error for https://www.the-numbers.com/movie/Customs-Frontline-(2024-Hong-Kong)#tab=summary


 22%|██▏       | 43/200 [01:15<04:41,  1.79s/it]

Timeout error for https://www.the-numbers.com/movie/Lord-of-the-Rings-The-The-War-of-the-Rohirrim-(2024)#tab=summary


 22%|██▏       | 44/200 [01:18<05:34,  2.15s/it]

Timeout error for https://www.the-numbers.com/movie/HOVERING-BLADE-(2024-China)#tab=summary


 22%|██▏       | 44/200 [01:21<04:47,  1.84s/it]

Timeout error for https://www.the-numbers.com/movie/Conjuring-The-Last-Rites-(2025)#tab=summary
Error for https://www.the-numbers.com/movie/Creation-of-the-Gods-II-Demonic-Confrontation-(2025-China)#tab=summary: Session is closed
Error for https://www.the-numbers.com/movie/Drop-(2025)#tab=summary: Session is closed
Error for https://www.the-numbers.com/movie/Sto-let-tomu-vpered-(2024-Russia)#tab=summary: Session is closed
Error for https://www.the-numbers.com/movie/Monkey-The-(2025)#tab=summary: Session is closed
Error for https://www.the-numbers.com/movie/Suga-Agust-D-Tour-D-Day-The-Movie-(2024-South-Korea)#tab=summary: Session is closed





CancelledError: 

Error for https://www.the-numbers.com/movie/Working-Man-A-(2025)#tab=summary: Session is closed


Error for https://www.the-numbers.com/movie/Black-Bag-(2025)#tab=summary: Session is closed
Error for https://www.the-numbers.com/movie/Open-Door-The-(2025-China)#tab=summary: Session is closed
Error for https://www.the-numbers.com/movie/Day-the-Earth-Blew-Up-The-A-Looney-Tunes-Movie-(2024)#tab=summary: Session is closed
Error for https://www.the-numbers.com/movie/Phoenician-Scheme-The-(2025)#tab=summary: Session is closed
Error for https://www.the-numbers.com/movie/28-Years-Later-(2025)#tab=summary: Session is closed
Error for https://www.the-numbers.com/movie/One-of-Them-Days-(2025)#tab=summary: Session is closed
Error for https://www.the-numbers.com/movie/No-Other-Choice-(2025-South-Korea)#tab=summary: Session is closed
Error for https://www.the-numbers.com/movie/Chosen-The-Season-4-Episodes-1-through-3-(2024)#tab=summary: Session is closed
Error for https://www.the-numbers.com/movie/Mission-Impossible-The-Final-Reckoning-(2025)#tab=summary: Session is closed
Error for https://www.t

## Concatenate to one dataframe & Export to csv 

In [102]:
import glob

def export_all_movie_details():
  all_movie_details = [pd.read_csv(file) for file in glob.glob('Movie Details/*.csv')]
  global movie_details_df
  # Concatenate all series to a dataframe
  if all_movie_details:
    movie_details_df = pd.concat(all_movie_details, axis=0, ignore_index=True)
    move_col = movie_details_df.pop('Release Date')
    movie_details_df.insert(1,'Release Date', move_col)
    
    print("DataFrame shape:", movie_details_df.shape)
    print("\nDataFrame columns:", movie_details_df.columns.tolist())
    print("\nDataFrame content:")
  else:
    movie_details_df = pd.DataFrame()

  movie_details_df.to_csv('movie_details.csv',index=False)

In [103]:
export_all_movie_details()

DataFrame shape: (4827, 9)

DataFrame columns: ['link', 'Release Date', 'Production Budget', 'Running Time', 'Genre', 'Production Method', 'Production/Financing Companies', 'Production Countries', 'Languages']

DataFrame content:


### Re-run timeouts & append to final dataframe:

In [100]:
import pandas as pd
df_WW_all = pd.read_csv("WW_all.csv")
movie_details_df = pd.read_csv("movie_details.csv").dropna(subset=['link'])
# Find differences

async def retry_timeouts():
  timeouts_df = df_WW_all[~df_WW_all['link'].isin(movie_details_df['link'])]
  timeout_urls = [timeouts_df['link'].tolist()[x:x+400] for x in range(0, len(timeouts_df), 400)]
  if not timeout_urls:
    print("No more timeouts :D")
  else:
    print(f"Timeouts: {timeouts_df.shape[0]} missing movies.")
    print(f"Retrying timed out URLs...")
    await process_all_batches(urls=timeout_urls, start_at=91)
    export_all_movie_details()
    await retry_timeouts()

await retry_timeouts()

Timeouts: 1 missing movies.
Retrying timed out URLs...


100%|██████████| 1/1 [00:01<00:00,  1.08s/it]

HTTP 403 for https://www.the-numbers.com/movie/Northman-The#tab=summary
⚠️ Batch 92 completed but no valid data





TypeError: unsupported operand type(s) for +=: 'int' and 'NoneType'

## Merge to final `df_WW_all`

In [134]:
import pandas as pd
import numpy as np

df_WW_all = pd.read_csv("WW_all.csv")
movie_details_df = pd.read_csv("movie_details.csv").dropna(subset=['link'])
# MERGE
df = pd.merge(df_WW_all, movie_details_df, on='link', how='right').sort_values(by=['Year Recorded','Rank'])

# COMBINE 2 COLUMNS
df["Distributor_y"] = np.where(
    df["Distributor"].isna() | (df["Distributor"] == ""),  # A2 = ""
    df["Production/Financing Companies"].apply(
        lambda x: x.split(",")[0].strip() if isinstance(x, str) and "," in x else np.nan
    ),
    np.nan
)
df["Distributor_y"] = df["Distributor_y"].fillna(df["Distributor"])
df = df.drop(columns=['Distributor'])

# RENAME
df = df.rename(columns={'Distributor_y': 'Distributor',
                      'Genre_y': 'Genre',
                      'Release Date_y': 'Release Date'})

# REORDER
df = df[['Year Recorded', 'Rank', 'Movie', 'Worldwide Box Office', 'Domestic Box Office', 'International Box Office', 'Domestic Share', 'Distributor', 'Production Budget', 'Running Time', 'Genre', 'Production Method', 'Release Date', 'Production Countries']]
df.to_csv('WW_all_new.csv', index=False)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4936 entries, 0 to 4935
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Year Recorded             4936 non-null   int64 
 1   Rank                      4936 non-null   int64 
 2   Movie                     4936 non-null   object
 3   Worldwide Box Office      4936 non-null   object
 4   Domestic Box Office       3575 non-null   object
 5   International Box Office  4887 non-null   object
 6   Domestic Share            3575 non-null   object
 7   Distributor               3448 non-null   object
 8   Production Budget         2781 non-null   object
 9   Running Time              4571 non-null   object
 10  Genre                     4853 non-null   object
 11  Production Method         4839 non-null   object
 12  Release Date              4778 non-null   object
 13  Production Countries      4890 non-null   object
dtypes: int64(2), object(12)
memor