# Webcrawling process (cont') - đang bỏ dở, có thể không làm

In [8]:
import pandas as pd
import requests
import warnings
# warnings.filterwarnings("ignore")

In [9]:
df_WW_all = pd.read_csv("WW_all.csv")
df_WW_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4829 entries, 0 to 4828
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Movie                       4829 non-null   object
 1   link                        4829 non-null   object
 2   Worldwide Box Office        4829 non-null   object
 3   Domestic Box Office         3496 non-null   object
 4   International Box Office    4786 non-null   object
 5   Domestic Share              3496 non-null   object
 6   Share Of Number One Market  2868 non-null   object
 7   Number One Market           2955 non-null   object
 8   Release Date                3003 non-null   object
 9   Distributor                 2980 non-null   object
 10  Genre                       3004 non-null   object
 11  Rank                        4829 non-null   int64 
 12  Year Recorded               4829 non-null   int64 
dtypes: int64(2), object(11)
memory usage: 490.6+ KB


## Get Movie Details of each film
This is the hardest part, not only does it takes time but there is also a risk of being temporarily/permanently blocked by the site (Error 403 Forbidden)...

### List of browers to rotate

In [10]:
HEADERS_LIST = [
  {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
  },
  {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50",
    "X-Requested-With": "XMLHttpRequest"
  },
  {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "X-Requested-With": "XMLHttpRequest"
  },
  {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:110.0) Gecko/20100101 Firefox/110.0",
    "X-Requested-With": "XMLHttpRequest"
  }
]

### Old: Parallel Webscraping - 20s/movie

In [11]:
from bs4 import BeautifulSoup
import re
from datetime import datetime
import random

def scrape_movie_details(url):
  """
  Scrape the Movie Details section from the-numbers.com
  Returns a dictionary with the structured data
  """
  header = random.choices(HEADERS_LIST)
  
  s = requests.Session()
  r = s.get(url, headers=header, timeout=15)
  soup = BeautifulSoup(r.text, 'html.parser')
  
  # Find the Movie Details section
  movie_details = {'link': url}
  
  # Look for the table with Movie Details
  details_table = soup.find('table', {'class': 'movie-details'})
  if not details_table:
    # Alternative: look for the section by text content
    details_section = soup.find('h2', string='Movie Details')
    if details_section:
      details_table = details_section.find_next('table')
  
  if details_table:
    rows = details_table.find_all('tr')
    for row in rows:
      cells = row.find_all(['td', 'th'])
      if len(cells) >= 2:
          key = cells[0].get_text(strip=True).replace('\xa0', ' ')
          value = cells[1].get_text(strip=True).replace('\xa0', ' ')
          
          # Clean up the key (remove colons and extra spaces)
          key = key.replace(':', '').strip()
          
          # Skip unwanted fields completely
          if key in ['Video Release', 'MPAA Rating', 'Franchise', 'Comparisons']:
              continue
          
          # Handle Production Countries and Languages separately
          if key == 'Production Countries':
              # Check if Languages data is mixed in
              if 'Languages:' in value:
                  parts = value.split('Languages:')
                  movie_details['Production Countries'] = parts[0].strip()
                  if len(parts) > 1:
                      movie_details['Languages'] = parts[1].strip()
              else:
                  movie_details['Production Countries'] = value
          elif key == 'Languages':
              movie_details['Languages'] = value
          else:
              # Store all other fields
              movie_details[key] = value
  
  # Extract earliest release date from Domestic and International releases
  release_dates = []
  
  # Extract dates from Domestic Releases
  if 'Domestic Releases' in movie_details:
      domestic_text = movie_details['Domestic Releases']
      # Look for date patterns like "February 14th, 2025"
      domestic_dates = re.findall(r'([A-Za-z]+ \d{1,2}(?:st|nd|rd|th)?, \d{4})', domestic_text)
      release_dates.extend(domestic_dates)
  
  # Extract dates from International Releases
  if 'International Releases' in movie_details:
      intl_text = movie_details['International Releases']
      # Look for date patterns like "January 29th, 2025"
      intl_dates = re.findall(r'([A-Za-z]+ \d{1,2}(?:st|nd|rd|th)?, \d{4})', intl_text)
      release_dates.extend(intl_dates)
  
  # Find the earliest date
  if release_dates:
      try:
          # Convert dates to datetime objects for comparison
          parsed_dates = []
          for date_str in release_dates:
              try:
                  # Handle ordinal suffixes (st, nd, rd, th)
                  clean_date = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
                  parsed_date = datetime.strptime(clean_date, '%B %d, %Y')
                  parsed_dates.append(parsed_date)
              except:
                  continue
          
          if parsed_dates:
              earliest_date = min(parsed_dates)
              movie_details['Release Date'] = earliest_date.strftime('%B %d, %Y')
      except:
          pass
  
  # Remove the original release fields since we now have Release Date
  movie_details.pop('Domestic Releases', None)
  movie_details.pop('International Releases', None)
  
  return movie_details

### New: Async webscraping - 1s/movie

In [12]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import re
from datetime import datetime
import random
from tqdm.asyncio import tqdm

max_concurrency = 10
sem = asyncio.Semaphore(max_concurrency)
timeout_urls = []

async def scrape_movie_details_async(session, header, url):
  """
  Async scraping of movie details.
  """
  async with sem:
    try:
      async with session.get(url, headers=header, timeout=25) as r:
        soup = BeautifulSoup(await r.text(), 'html.parser')
        # Find the Movie Details section
        movie_details = {'link': url}
        # Look for the table with Movie Details
        details_table = soup.find('table', {'class': 'movie-details'})
        if not details_table:
          # Alternative: look for the section by text content
          details_section = soup.find('h2', string='Movie Details')
          if details_section:
            details_table = details_section.find_next('table')
        
        if details_table:
          rows = details_table.find_all('tr')
          for row in rows:
            cells = row.find_all(['td', 'th'])
            if len(cells) >= 2:
                key = cells[0].get_text(strip=True).replace('\xa0', ' ')
                value = cells[1].get_text(strip=True).replace('\xa0', ' ')
                
                # Clean up the key (remove colons and extra spaces)
                key = key.replace(':', '').strip()
                
                # Skip unwanted fields completely
                if key in ['Video Release', 'MPAA Rating', 'Franchise', 'Comparisons']:
                    continue
                
                # Handle Production Countries and Languages separately
                if key == 'Production Countries':
                    # Check if Languages data is mixed in
                    if 'Languages:' in value:
                        parts = value.split('Languages:')
                        movie_details['Production Countries'] = parts[0].strip()
                        if len(parts) > 1:
                            movie_details['Languages'] = parts[1].strip()
                    else:
                        movie_details['Production Countries'] = value
                elif key == 'Languages':
                    movie_details['Languages'] = value
                else:
                    # Store all other fields
                    movie_details[key] = value
        
        # Extract earliest release date from Domestic and International releases
        release_dates = []
        
        # Extract dates from Domestic Releases
        if 'Domestic Releases' in movie_details:
            domestic_text = movie_details['Domestic Releases']
            # Look for date patterns like "February 14th, 2025"
            domestic_dates = re.findall(r'([A-Za-z]+ \d{1,2}(?:st|nd|rd|th)?, \d{4})', domestic_text)
            release_dates.extend(domestic_dates)
        
        # Extract dates from International Releases
        if 'International Releases' in movie_details:
            intl_text = movie_details['International Releases']
            # Look for date patterns like "January 29th, 2025"
            intl_dates = re.findall(r'([A-Za-z]+ \d{1,2}(?:st|nd|rd|th)?, \d{4})', intl_text)
            release_dates.extend(intl_dates)
        
        # Find the earliest date
        if release_dates:
            try:
                # Convert dates to datetime objects for comparison
                parsed_dates = []
                for date_str in release_dates:
                    try:
                        # Handle ordinal suffixes (st, nd, rd, th)
                        clean_date = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
                        parsed_date = datetime.strptime(clean_date, '%B %d, %Y')
                        parsed_dates.append(parsed_date)
                    except:
                        continue
                
                if parsed_dates:
                    earliest_date = min(parsed_dates)
                    movie_details['Release Date'] = earliest_date.strftime('%B %d, %Y')
            except:
                pass
        
        # Remove the original release fields since we now have Release Date
        movie_details.pop('Domestic Releases', None)
        movie_details.pop('International Releases', None)
        return movie_details
    
    except asyncio.TimeoutError:
        print(f"Timeout error for {url}")
        return {}
    except Exception as e:
        print(f"Error for {url}: {e}")
        return {}

async def scrape_batch(session, header, urls):
  """Async scraping, continued - callable function."""
  if session is None:
    async with aiohttp.ClientSession() as session:
      tasks = [scrape_movie_details_async(session, header, url) for url in urls]
      return await tqdm.gather(*tasks)
  else:
    tasks = [scrape_movie_details_async(session, header, url) for url in urls]
    return await tqdm.gather(*tasks)

In [20]:
all_movie_details = []
urls = [df_WW_all['link'][x:x+200] for x in range(0, len(df_WW_all), 200)]

async def process_batch_with_error_handling(session, header, urls, batch_num):
    """Process one batch with error handling"""
    try:
        print(f"========== PROCESSING BATCH {batch_num:02d} ({len(urls)} URLs)... ==========")
        results = await scrape_batch(session, header, urls)
        
        # Filter out None results (failed scrapes)
        valid_results = [r for r in results if r is not None]
        
        if valid_results:
            # Convert to DataFrame
            df = pd.DataFrame(valid_results)
            
            # Save to CSV
            filename = f'Movie Details/movie_details_{batch_num:02d}.csv'
            df.to_csv(filename, index=False)
            print(f"✅ Batch {batch_num:02d} completed: {len(valid_results)} movies saved to {filename}")
            return len(valid_results)
        else:
            print(f"⚠️ Batch {batch_num:02d} completed but no valid data")
            return 0
            
    except Exception as e:
        print(f"❌ Error in batch {batch_num:02d}: {e}")
        return 0

async def process_all_batches(start_at=int):
    """Process all batches with error handling"""
    total_processed = 0

    async with aiohttp.ClientSession() as session:
        batch_num = start_at - 1
        for batch_urls in urls[start_at:]:
            batch_num += 1
            i = (batch_num - 1) // 2 % len(HEADERS_LIST)
            header = HEADERS_LIST[i]
            processed_count = await process_batch_with_error_handling(session, header, batch_urls, batch_num)
            total_processed += processed_count
            
            # Small delay between batches to be nice to the server
            await asyncio.sleep(1)
    
    print(f"\n🎉 All batches completed! Total movies processed: {total_processed}")

In [14]:
# Run the batch processing
# await process_all_batches(start_at=1)

## Concatenate to one dataframe -> Export to csv 

In [15]:
import glob

def export_all_movie_details():
  all_movie_details = [pd.read_csv(file) for file in glob.glob('Movie Details/*.csv')]
  global movie_details_df
  # Concatenate all series to a dataframe
  if all_movie_details:
    movie_details_df = pd.concat(all_movie_details, axis=0, ignore_index=True)
    move_col = movie_details_df.pop('Release Date')
    movie_details_df.insert(1,'Release Date', move_col)
    
    print("DataFrame shape:", movie_details_df.shape)
    print("\nDataFrame columns:", movie_details_df.columns.tolist())
    print("\nDataFrame content:")
  else:
    movie_details_df = pd.DataFrame()

  movie_details_df.to_csv('movie_details.csv',index=False)
  return movie_details_df

### Re-run timeouts & append to final dataframe:

In [31]:
import pandas as pd
df_WW_all = pd.read_csv("WW_all.csv")
movie_details_df = pd.read_csv("movie_details.csv").dropna(subset=['link'])

timeouts_df = df_WW_all[~df_WW_all['link'].isin(movie_details_df['link'])]
timeout_urls = [timeouts_df['link'][x:x+200] for x in range(0, len(timeouts_df), 200)]

if timeout_urls:
  print("DataFrame shape:", timeouts_df.shape)
  print("\nDataFrame columns:", timeouts_df.columns.tolist())
  urls = urls + timeout_urls

urls[25]

DataFrame shape: (1, 13)

DataFrame columns: ['Movie', 'link', 'Worldwide Box Office', 'Domestic Box Office', 'International Box Office', 'Domestic Share', 'Share Of Number One Market', 'Number One Market', 'Release Date', 'Distributor', 'Genre', 'Rank', 'Year Recorded']


2936    https://www.the-numbers.com/movie/Shape-of-Wat...
Name: link, dtype: object

In [None]:
# print(f"Retrying timed out URLs...")
# await process_all_batches(start_at=25)

In [32]:
export_all_movie_details()

DataFrame shape: (4910, 11)

DataFrame columns: ['link', 'Release Date', 'Running Time', 'Keywords', 'Source', 'Genre', 'Production Method', 'Creative Type', 'Production/Financing Companies', 'Production Countries', 'Languages']

DataFrame content:


Unnamed: 0,link,Release Date,Running Time,Keywords,Source,Genre,Production Method,Creative Type,Production/Financing Companies,Production Countries,Languages
0,https://www.the-numbers.com/movie/Star-Wars-Ep...,"May 18, 2005",139 minutes,"Visual Effects,Good vs. Evil,Cyborg,Cloning,Wa...",Original Screenplay,Adventure,Animation/Live Action,Science Fiction,Lucasfilm,United States,English
1,https://www.the-numbers.com/movie/Harry-Potter...,"November 18, 2005",150 minutes,"Boarding School,Visual Effects,IMAX: DMR,Famil...",Based on Fiction Book/Short Story,Adventure,Animation/Live Action,Fantasy,"Warner Bros.,Heyday Films","United Kingdom,United States",English
2,https://www.the-numbers.com/movie/Chronicles-o...,"December 09, 2005",140 minutes,"Talking Animals,Visual Effects,Alternative Dim...",Based on Fiction Book/Short Story,Adventure,Animation/Live Action,Fantasy,"Walt Disney Pictures,Walden Media,Mark Johnson","United States,United Kingdom",English
3,https://www.the-numbers.com/movie/War-of-the-W...,"June 24, 2005",116 minutes,"Alien Invasion,Visual Effects,Voiceover/Narrat...",Based on Fiction Book/Short Story,Action,Animation/Live Action,Science Fiction,"Paramount Pictures,DreamWorks Pictures,Amblin ...",United States,English
4,https://www.the-numbers.com/movie/King-Kong-(2...,"December 14, 2005",189 minutes,"Animals Gone Bad,Creature Feature,Visual Effec...",Original Screenplay,Adventure,Animation/Live Action,Fantasy,Wingnut Films,"New Zealand,United States",English
...,...,...,...,...,...,...,...,...,...,...,...
4905,https://www.the-numbers.com/movie/Io-Sono-La-F...,09-Jan-25,,,Original Screenplay,Comedy,Live Action,Contemporary Fiction,,Italy,Italian
4906,https://www.the-numbers.com/movie/Three-Kingdo...,01-Oct-25,,,Original Screenplay,Drama,Digital Animation,Contemporary Fiction,,China,Mandarin
4907,https://www.the-numbers.com/movie/Na-derevnyu-...,12-Jun-25,,,Original Screenplay,Comedy,Live Action,Contemporary Fiction,,Russian Federation,Russian
4908,https://www.the-numbers.com/movie/Dracula-A-Lo...,30-Jul-25,129 minutes,"1400s,Romance",Based on Fiction Book/Short Story,Horror,Live Action,Historical Fiction,,"France,United Kingdom",English


## Merge to final `df_WW_all`

In [1]:
import pandas as pd

df_WW_all = pd.read_csv("WW_all.csv")
movie_details_df = pd.read_csv("movie_details.csv").dropna(subset=['link']).sort_values(by=['Year Recorded','Rank'])
df_WW_all_new = pd.merge(df_WW_all, movie_details_df, on='link', how='right')
df_WW_all_new.to_csv('WW_all_new.csv', index=False)
df_WW_all_new.info()

KeyError: 'Year Recorded'

In [9]:
df_WW_all_new.head()

Unnamed: 0,Movie,link,Worldwide Box Office,Domestic Box Office,International Box Office,Domestic Share,Share Of Number One Market,Number One Market,Release Date_x,Distributor,...,Release Date_y,Running Time,Keywords,Source,Genre_y,Production Method,Creative Type,Production/Financing Companies,Production Countries,Languages
0,Star Wars Ep. III: Revenge of the Sith,https://www.the-numbers.com/movie/Star-Wars-Ep...,"$902,891,983","$414,378,291","$488,513,692",45.89%,10.6%,United Kingdom,"May 19, 2005",20th Century Fox,...,"May 18, 2005",139 minutes,"Visual Effects,Good vs. Evil,Cyborg,Cloning,Wa...",Original Screenplay,Adventure,Animation/Live Action,Science Fiction,Lucasfilm,United States,English
1,Harry Potter and the Goblet of Fire,https://www.the-numbers.com/movie/Harry-Potter...,"$885,923,981","$291,147,424","$594,776,557",32.86%,15.7%,Japan,"Nov 18, 2005",Warner Bros.,...,"November 18, 2005",150 minutes,"Boarding School,Visual Effects,IMAX: DMR,Famil...",Based on Fiction Book/Short Story,Adventure,Animation/Live Action,Fantasy,"Warner Bros.,Heyday Films","United Kingdom,United States",English
2,"The Chronicles of Narnia: The Lion, the Witch a…",https://www.the-numbers.com/movie/Chronicles-o...,"$720,539,572","$291,710,957","$428,828,615",40.49%,0.0%,New Zealand,"Dec 9, 2005",Walt Disney,...,"December 09, 2005",140 minutes,"Talking Animals,Visual Effects,Alternative Dim...",Based on Fiction Book/Short Story,Adventure,Animation/Live Action,Fantasy,"Walt Disney Pictures,Walden Media,Mark Johnson","United States,United Kingdom",English
3,War of the Worlds,https://www.the-numbers.com/movie/War-of-the-W...,"$606,836,535","$234,280,354","$372,556,181",38.61%,3.5%,Australia,"Jun 29, 2005",Paramount Pictures,...,"June 24, 2005",116 minutes,"Alien Invasion,Visual Effects,Voiceover/Narrat...",Based on Fiction Book/Short Story,Action,Animation/Live Action,Science Fiction,"Paramount Pictures,DreamWorks Pictures,Amblin ...",United States,English
4,King Kong,https://www.the-numbers.com/movie/King-Kong-(2...,"$556,906,378","$218,080,025","$338,826,353",39.16%,0.0%,New Zealand,"Dec 14, 2005",Universal,...,"December 14, 2005",189 minutes,"Animals Gone Bad,Creature Feature,Visual Effec...",Original Screenplay,Adventure,Animation/Live Action,Fantasy,Wingnut Films,"New Zealand,United States",English
