# Webcrawling process (cont') - ƒëang b·ªè d·ªü, c√≥ th·ªÉ kh√¥ng l√†m

In [1]:
import pandas as pd
import requests
import warnings
warnings.filterwarnings("ignore")

## Get Movie Details of each film
This is the hardest part, not only does it takes time but there is also a risk of being temporarily/permanently blocked by the site (Error 403 Forbidden)...

### List of browers to rotate

In [2]:
HEADERS_LIST = [
  {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "X-Requested-With": "XMLHttpRequest"
  },
  {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:110.0) Gecko/20100101 Firefox/110.0",
    "X-Requested-With": "XMLHttpRequest"
  },
  {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
  },
  {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50",
    "X-Requested-With": "XMLHttpRequest"
  },
]

### Old: Parallel Webscraping - 20s/movie

In [3]:
from bs4 import BeautifulSoup
import re
from datetime import datetime
import random

def parse_movie_details(soup, url):
  """
  Shared parsing logic for movie details from BeautifulSoup object.
  Returns a dictionary with the structured data.
  """
  # Find the Movie Details section
  movie_details = {'link': url}
  
  # Get table #1 (index 1) - the metrics table
  production_budget = ''
  all_tables = soup.find_all('table', limit=4)
  metrics_table = all_tables[1]
  
  # Search all rows in table #1 for "Production Budget"
  rows = metrics_table.find_all('tr')
  for row in rows:
    cells = row.find_all(['td', 'th'])
    for cell in (cells):
      text = cell.get_text(strip=True)
      # Check if this cell contains "Production Budget"
      if 'production budget' in text.lower():
        production_budget = re.search(r'\$?([\d,]+)(?![\d,])', text).group(0)
        movie_details['Production Budget'] = production_budget
  
  # Look for the table with Movie Details
  details_table = all_tables[3]
  
  if details_table:
    rows = details_table.find_all('tr')
    for row in rows:
      cells = row.find_all(['td', 'th'])
      if len(cells) >= 2:
          key = cells[0].get_text(strip=True).replace('\xa0', ' ')
          value = cells[1].get_text(strip=True).replace('\xa0', ' ')
          
          # Clean up the key (remove colons and extra spaces)
          key = key.replace(':', '').strip()
          
          # Skip unwanted fields completely
          if key in ['Video Release', 'Comparisons', 'Keywords', 'Source', 'Languages']:
              continue
          
          # MPAA Rating:
          if key == 'MPAA Rating':
            movie_details['MPAA Rating'] = re.findall(r'^[A-Z0-9-]+', value, flags=re.MULTILINE)

          # Handle Production Countries and Languages separately
          if key == 'Production Countries':
              # Check if Languages data is mixed in
              if 'Languages:' in value:
                  parts = value.split('Languages:')
                  movie_details['Production Countries'] = parts[0].strip()
                  if len(parts) > 1:
                      movie_details['Languages'] = parts[1].strip()
              else:
                  movie_details['Production Countries'] = value
          elif key == 'Languages':
              movie_details['Languages'] = value
          else:
              # Store all other fields
              movie_details[key] = value
  
  # Extract earliest release date from Domestic and International releases
  release_dates = []
  
  # Extract dates from Domestic Releases
  if 'Domestic Releases' in movie_details:
      domestic_text = movie_details['Domestic Releases']
      # Look for date patterns like "February 14th, 2025"
      domestic_dates = re.findall(r'([A-Za-z]+ \d{1,2}(?:st|nd|rd|th)?, \d{4})', domestic_text)
      release_dates.extend(domestic_dates)
  
  # Extract dates from International Releases
  if 'International Releases' in movie_details:
      intl_text = movie_details['International Releases']
      # Look for date patterns like "January 29th, 2025"
      intl_dates = re.findall(r'([A-Za-z]+ \d{1,2}(?:st|nd|rd|th)?, \d{4})', intl_text)
      release_dates.extend(intl_dates)
  
  # Find the earliest date
  if release_dates:
      try:
          # Convert dates to datetime objects for comparison
          parsed_dates = []
          for date_str in release_dates:
              try:
                  # Handle ordinal suffixes (st, nd, rd, th)
                  clean_date = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
                  parsed_date = datetime.strptime(clean_date, '%B %d, %Y')
                  parsed_dates.append(parsed_date)
              except:
                  continue
          
          if parsed_dates:
              earliest_date = min(parsed_dates)
              movie_details['Release Date'] = earliest_date.strftime('%B %d, %Y')
      except:
          pass
  
  # Remove the original release fields since we now have Release Date
  movie_details.pop('Domestic Releases', None)
  movie_details.pop('International Releases', None)
  
  return movie_details

In [4]:
def scrape_movie_details(url):
  """
  Scrape the Movie Details section from the-numbers.com
  Returns a dictionary with the structured data
  """
  header = random.choice(HEADERS_LIST)  # Use choice() not choices()
  
  s = requests.Session()
  r = s.get(url, headers=header, timeout=15)
  soup = BeautifulSoup(r.text, 'html.parser')
  
  # Use the shared parsing logic
  return parse_movie_details(soup, url)

### New: Async webscraping - 1s/movie

In [5]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from tqdm.asyncio import tqdm

max_concurrency = 10
sem = asyncio.Semaphore(max_concurrency)
timeout_urls = []

async def scrape_movie_details_async(session, header, url):
  """
  Async scraping of movie details.
  """
  async with sem:
    try:
      async with session.get(url, headers=header, timeout=25) as r:
        status = r.status
        if status != 200:
          print(f"HTTP {status} for {url}")
          return None
        html = await r.text()
        soup = BeautifulSoup(html, 'html.parser')
        
        # Use the shared parsing logic
        data = parse_movie_details(soup, url)
        # Treat dicts with only 'link' as invalid
        if isinstance(data, dict) and len(data) <= 1:
          print(f"Parsed no data for {url}")
          return None
        return data
    
    except asyncio.TimeoutError:
        print(f"Timeout error for {url}")
        return None
    except Exception as e:
        print(f"Error for {url}: {e}")
        return None

async def scrape_batch(session, header, urls):
  """Async scraping, continued - callable function."""
  if session is None:
    async with aiohttp.ClientSession() as session:
      tasks = [scrape_movie_details_async(session, header, url) for url in urls]
      return await tqdm.gather(*tasks)
  else:
    tasks = [scrape_movie_details_async(session, header, url) for url in urls]
    return await tqdm.gather(*tasks)

In [6]:
async def process_batch_with_error_handling(session, header, urls, batch_num):
  """Process one batch with error handling"""
  try:
    print(f"========== PROCESSING BATCH {batch_num:02d} ({len(urls)} URLs)... ==========")

    results = await scrape_batch(session, header, urls)
    
    # Keep only non-empty dicts with more than just the link
    valid_results = []
    for r in results:
      if isinstance(r, dict) and len(r) > 1:
        valid_results.append(r)
    
    if valid_results:
      # Convert to DataFrame
      df = pd.DataFrame(valid_results)
      
      # Save to CSV
      filename = f'Movie Details/movie_details_{batch_num:02d}.csv'
      df.to_csv(filename, index=False)
      print(f"‚úÖ Batch {batch_num:02d} completed: {len(valid_results)} movies saved to {filename}")
      return len(valid_results)
    else:
      print(f"‚ö†Ô∏è Batch {batch_num:02d} completed but no valid data")
      return None
            
  except Exception as e:
    print(f"‚ùå Error in batch {batch_num:02d}: {e}")
    return None

async def process_all_batches(urls, start_at=0):
  """Process all batches with error handling.
  Accepts:
    - string URL ‚Üí one batch with one URL
    - list[str]  ‚Üí one batch with many URLs
    - list[list[str]] ‚Üí multiple batches (original behavior)
  """
  # Normalize input into list of batches (list[list[str]])
  if isinstance(urls, str):
    batches = [[urls]]
  elif isinstance(urls, list):
    if len(urls) == 0:
      batches = []
    elif all(isinstance(u, str) for u in urls):
      batches = [urls]
    else:
      batches = urls
  else:
    batches = []
  
  # Clamp start_at
  if start_at is None or not isinstance(start_at, int):
    start_at = 0
  if start_at < 0:
    start_at = 0
  if 90 > start_at >= len(batches) > 0:
    start_at = len(batches) - 1
  if start_at == 90:
    start_at = 90
  
  total_processed = 0

  async with aiohttp.ClientSession() as session:
    batch_num = start_at
    for batch_urls in batches:
      batch_num += 1
      i = (batch_num - 1) // 2 % len(HEADERS_LIST)
      header = HEADERS_LIST[i]
      processed_count = await process_batch_with_error_handling(session, header, batch_urls, batch_num)
      total_processed += processed_count
      
      # Small delay between batches to be nice to the server
      await asyncio.sleep(1)
  
  print(f"\nüéâ All batches completed! Total movies processed: {total_processed}")

In [7]:
import pandas as pd
df_WW_all = pd.read_csv("WW_all.csv")
all_movie_details = []
links = [df_WW_all['link'].tolist()[x:x+200] for x in range(0, len(df_WW_all), 200)]

In [8]:
links_to_scrape = links
isinstance(links_to_scrape, list)

True

In [9]:
# scrape_movie_details(links_to_scrape)

In [10]:
await process_all_batches(urls=links_to_scrape)



  0%|          | 0/200 [00:00<?, ?it/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:51<00:00,  3.87it/s]


‚úÖ Batch 01 completed: 200 movies saved to Movie Details/movie_details_01.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:47<00:00,  4.20it/s]


‚úÖ Batch 02 completed: 200 movies saved to Movie Details/movie_details_02.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:44<00:00,  4.45it/s]


‚úÖ Batch 03 completed: 200 movies saved to Movie Details/movie_details_03.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:51<00:00,  3.88it/s]


‚úÖ Batch 04 completed: 200 movies saved to Movie Details/movie_details_04.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:59<00:00,  3.36it/s]


‚úÖ Batch 05 completed: 200 movies saved to Movie Details/movie_details_05.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:49<00:00,  4.06it/s]


‚úÖ Batch 06 completed: 200 movies saved to Movie Details/movie_details_06.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:49<00:00,  4.05it/s]


‚úÖ Batch 07 completed: 200 movies saved to Movie Details/movie_details_07.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:44<00:00,  4.45it/s]


‚úÖ Batch 08 completed: 200 movies saved to Movie Details/movie_details_08.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:36<00:00,  5.41it/s]


‚úÖ Batch 09 completed: 200 movies saved to Movie Details/movie_details_09.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:50<00:00,  3.96it/s]


‚úÖ Batch 10 completed: 200 movies saved to Movie Details/movie_details_10.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:45<00:00,  4.43it/s]


‚úÖ Batch 11 completed: 200 movies saved to Movie Details/movie_details_11.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:49<00:00,  4.07it/s]


‚úÖ Batch 12 completed: 200 movies saved to Movie Details/movie_details_12.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:39<00:00,  5.04it/s]


‚úÖ Batch 13 completed: 200 movies saved to Movie Details/movie_details_13.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:43<00:00,  4.61it/s]


‚úÖ Batch 14 completed: 200 movies saved to Movie Details/movie_details_14.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:44<00:00,  4.54it/s]


‚úÖ Batch 15 completed: 200 movies saved to Movie Details/movie_details_15.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:30<00:00,  6.63it/s]


‚úÖ Batch 16 completed: 200 movies saved to Movie Details/movie_details_16.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:56<00:00,  3.52it/s]


‚úÖ Batch 17 completed: 200 movies saved to Movie Details/movie_details_17.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:41<00:00,  4.83it/s]


‚úÖ Batch 18 completed: 200 movies saved to Movie Details/movie_details_18.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:37<00:00,  5.39it/s]


‚úÖ Batch 19 completed: 200 movies saved to Movie Details/movie_details_19.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:37<00:00,  5.35it/s]


‚úÖ Batch 20 completed: 200 movies saved to Movie Details/movie_details_20.csv


 36%|‚ñà‚ñà‚ñà‚ñå      | 72/200 [00:12<00:14,  8.68it/s]

HTTP 403 for https://www.the-numbers.com/movie/Northman-The#tab=summary


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:33<00:00,  5.96it/s]


‚úÖ Batch 21 completed: 199 movies saved to Movie Details/movie_details_21.csv


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 171/200 [00:58<01:01,  2.12s/it]

Error for https://www.the-numbers.com/movie/Ce-ancora-domani-(2023-Italy)#tab=summary: list index out of range


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 175/200 [00:59<00:32,  1.30s/it]

Timeout error for https://www.the-numbers.com/movie/She-Said-(2022)#tab=summary


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [01:14<00:00,  2.68it/s]


‚úÖ Batch 22 completed: 198 movies saved to Movie Details/movie_details_22.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:37<00:00,  5.34it/s]


‚úÖ Batch 23 completed: 200 movies saved to Movie Details/movie_details_23.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [00:37<00:00,  5.32it/s]


‚úÖ Batch 24 completed: 200 movies saved to Movie Details/movie_details_24.csv


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40/40 [00:05<00:00,  7.73it/s]


‚úÖ Batch 25 completed: 40 movies saved to Movie Details/movie_details_25.csv

üéâ All batches completed! Total movies processed: 4837


## Concatenate to one dataframe & Export to csv 

In [11]:
import glob

def export_all_movie_details():
  all_movie_details = [pd.read_csv(file) for file in glob.glob('Movie Details/*.csv')]
  global movie_details_df
  # Concatenate all series to a dataframe
  if all_movie_details:
    movie_details_df = pd.concat(all_movie_details, axis=0, ignore_index=True)
    move_col = movie_details_df.pop('Release Date')
    movie_details_df.insert(1,'Release Date', move_col)
    
    print("DataFrame shape:", movie_details_df.shape)
    print("\nDataFrame columns:", movie_details_df.columns.tolist())
    print("\nDataFrame content:")
  else:
    movie_details_df = pd.DataFrame()

  movie_details_df.to_csv('movie_details.csv',index=False)

In [12]:
export_all_movie_details()

DataFrame shape: (5158, 12)

DataFrame columns: ['link', 'Release Date', 'Production Budget', 'MPAA Rating', 'Running Time', 'Franchise', 'Genre', 'Production Method', 'Creative Type', 'Production/Financing Companies', 'Production Countries', 'Languages']

DataFrame content:


### Re-run timeouts & append to final dataframe:

In [14]:
import pandas as pd
df_WW_all = pd.read_csv("WW_all.csv")
movie_details_df = pd.read_csv("movie_details.csv").dropna(subset=['link'])
# Find differences

async def retry_timeouts(retry_start=91):
  timeouts_df = df_WW_all[~df_WW_all['link'].isin(movie_details_df['link'])]
  timeout_urls = [timeouts_df['link'].tolist()[x:x+400] for x in range(0, len(timeouts_df), 400)]
  if not timeout_urls:
    print("No more timeouts :D")
  else:
    print(f"Timeouts: {timeouts_df.shape[0]} missing movies.")
    print(f"Retrying timed out URLs...")
    await process_all_batches(urls=timeout_urls, start_at=retry_start)
    export_all_movie_details()
    await retry_timeouts(retry_start + 1)

await retry_timeouts()

Timeouts: 1 missing movies.
Retrying timed out URLs...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.23it/s]

HTTP 403 for https://www.the-numbers.com/movie/Northman-The#tab=summary
‚ö†Ô∏è Batch 92 completed but no valid data





TypeError: unsupported operand type(s) for +=: 'int' and 'NoneType'

## Merge to final `df_WW_all`

In [15]:
import pandas as pd
import numpy as np
import re

df_WW_all = pd.read_csv("WW_all.csv")
movie_details_df = pd.read_csv("movie_details.csv").dropna(subset=['link'])
# MERGE
df = pd.merge(df_WW_all, movie_details_df, on='link', how='right').sort_values(by=['Year Recorded','Rank'])

# CLEAN MPAA RATING - extract only the rating (PG, PG-13, R, G, etc.)
if 'MPAA Rating' in df.columns:
    df['MPAA Rating'] = df['MPAA Rating'].apply(
        lambda x: re.match(r'^([A-Z]+(?:-[0-9]+)?)', str(x)).group(1) 
        if pd.notna(x) and isinstance(x, str) and re.match(r'^([A-Z]+(?:-[0-9]+)?)', str(x)) 
        else x
    )

# COMBINE 2 COLUMNS
df["Distributor_y"] = np.where(
    df["Distributor"].isna() | (df["Distributor"] == ""),  # A2 = ""
    df["Production/Financing Companies"].apply(
        lambda x: x.split(",")[0].strip() if isinstance(x, str) and "," in x else np.nan
    ),
    np.nan
)
df["Distributor_y"] = df["Distributor_y"].fillna(df["Distributor"])
df = df.drop(columns=['Distributor'])

# RENAME
df = df.rename(columns={'Distributor_y': 'Distributor',
                      'Genre_y': 'Genre',
                      'Release Date_y': 'Release Date'})

# REORDER
df = df[['Year Recorded', 'Rank', 'Movie', 'Worldwide Box Office', 'Domestic Box Office', 'International Box Office', 'Domestic Share', 'Distributor', 'Production Budget', 'Running Time', 'Genre', 'Production Method', 'Creative Type', 'MPAA Rating', 'Franchise', 'Production Countries', 'Release Date']]
df.to_csv('WW_all_new.csv', index=False)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5306 entries, 0 to 5303
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Year Recorded             5306 non-null   int64 
 1   Rank                      5306 non-null   int64 
 2   Movie                     5306 non-null   object
 3   Worldwide Box Office      5306 non-null   object
 4   Domestic Box Office       3826 non-null   object
 5   International Box Office  5246 non-null   object
 6   Domestic Share            3826 non-null   object
 7   Distributor               3686 non-null   object
 8   Production Budget         2930 non-null   object
 9   Running Time              4876 non-null   object
 10  Genre                     5218 non-null   object
 11  Production Method         5204 non-null   object
 12  Creative Type             4802 non-null   object
 13  MPAA Rating               3710 non-null   object
 14  Franchise                 129