# Webcrawling process (cont') - đang bỏ dở, có thể không làm

In [43]:
import pandas as pd
import requests
import warnings
# warnings.filterwarnings("ignore")

In [44]:
df_WW_all = pd.read_csv("WW_all.csv")
df_WW_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4829 entries, 0 to 4828
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Movie                       4829 non-null   object
 1   link                        4829 non-null   object
 2   Worldwide Box Office        4829 non-null   object
 3   Domestic Box Office         3496 non-null   object
 4   International Box Office    4786 non-null   object
 5   Domestic Share              3496 non-null   object
 6   Share Of Number One Market  2868 non-null   object
 7   Number One Market           2955 non-null   object
 8   Release Date                3003 non-null   object
 9   Distributor                 2980 non-null   object
 10  Genre                       3004 non-null   object
 11  Rank                        4829 non-null   int64 
 12  Year Recorded               4829 non-null   int64 
dtypes: int64(2), object(11)
memory usage: 490.6+ KB


## Get Movie Details of each film
This is the hardest part, not only does it takes time but there is also a risk of being temporarily/permanently blocked by the site (Error 403 Forbidden)...

### List of browers to rotate

In [None]:
HEADERS_LIST = [
  {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
  },
  {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50",
    "X-Requested-With": "XMLHttpRequest"
  },
  {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:110.0) Gecko/20100101 Firefox/110.0",
    "X-Requested-With": "XMLHttpRequest"
  },
  {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50",
    "X-Requested-With": "XMLHttpRequest"
  },
  {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "X-Requested-With": "XMLHttpRequest"
  }
]

### Old: Parallel Webscraping - 20s/movie

In [None]:
from bs4 import BeautifulSoup
import re
from datetime import datetime
import random

def scrape_movie_details(url):
  """
  Scrape the Movie Details section from the-numbers.com
  Returns a dictionary with the structured data
  """
  header = random.choices(HEADERS_LIST)
  
  s = requests.Session()
  r = s.get(url, headers=header, timeout=15)
  soup = BeautifulSoup(r.text, 'html.parser')
  
  # Find the Movie Details section
  movie_details = {'link': url}
  
  # Look for the table with Movie Details
  details_table = soup.find('table', {'class': 'movie-details'})
  if not details_table:
    # Alternative: look for the section by text content
    details_section = soup.find('h2', string='Movie Details')
    if details_section:
      details_table = details_section.find_next('table')
  
  if details_table:
    rows = details_table.find_all('tr')
    for row in rows:
      cells = row.find_all(['td', 'th'])
      if len(cells) >= 2:
          key = cells[0].get_text(strip=True).replace('\xa0', ' ')
          value = cells[1].get_text(strip=True).replace('\xa0', ' ')
          
          # Clean up the key (remove colons and extra spaces)
          key = key.replace(':', '').strip()
          
          # Skip unwanted fields completely
          if key in ['Video Release', 'MPAA Rating', 'Franchise', 'Comparisons']:
              continue
          
          # Handle Production Countries and Languages separately
          if key == 'Production Countries':
              # Check if Languages data is mixed in
              if 'Languages:' in value:
                  parts = value.split('Languages:')
                  movie_details['Production Countries'] = parts[0].strip()
                  if len(parts) > 1:
                      movie_details['Languages'] = parts[1].strip()
              else:
                  movie_details['Production Countries'] = value
          elif key == 'Languages':
              movie_details['Languages'] = value
          else:
              # Store all other fields
              movie_details[key] = value
  
  # Extract earliest release date from Domestic and International releases
  release_dates = []
  
  # Extract dates from Domestic Releases
  if 'Domestic Releases' in movie_details:
      domestic_text = movie_details['Domestic Releases']
      # Look for date patterns like "February 14th, 2025"
      domestic_dates = re.findall(r'([A-Za-z]+ \d{1,2}(?:st|nd|rd|th)?, \d{4})', domestic_text)
      release_dates.extend(domestic_dates)
  
  # Extract dates from International Releases
  if 'International Releases' in movie_details:
      intl_text = movie_details['International Releases']
      # Look for date patterns like "January 29th, 2025"
      intl_dates = re.findall(r'([A-Za-z]+ \d{1,2}(?:st|nd|rd|th)?, \d{4})', intl_text)
      release_dates.extend(intl_dates)
  
  # Find the earliest date
  if release_dates:
      try:
          # Convert dates to datetime objects for comparison
          parsed_dates = []
          for date_str in release_dates:
              try:
                  # Handle ordinal suffixes (st, nd, rd, th)
                  clean_date = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
                  parsed_date = datetime.strptime(clean_date, '%B %d, %Y')
                  parsed_dates.append(parsed_date)
              except:
                  continue
          
          if parsed_dates:
              earliest_date = min(parsed_dates)
              movie_details['Release Date'] = earliest_date.strftime('%B %d, %Y')
      except:
          pass
  
  # Remove the original release fields since we now have Release Date
  movie_details.pop('Domestic Releases', None)
  movie_details.pop('International Releases', None)
  
  return movie_details

### New: Async webscraping - 1s/movie

In [None]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import re
from datetime import datetime
import random
from tqdm.asyncio import tqdm

max_concurrency = 10
sem = asyncio.Semaphore(max_concurrency)
timeout_urls = []

async def scrape_movie_details_async(session, header, url):
  """
  Async scraping of movie details.
  """
  async with sem:
    try:
      async with session.get(url, header=header, timeout=25) as r:
        soup = BeautifulSoup(await r.text(), 'html.parser')
        # Find the Movie Details section
        movie_details = {'link': url}
        # Look for the table with Movie Details
        details_table = soup.find('table', {'class': 'movie-details'})
        if not details_table:
          # Alternative: look for the section by text content
          details_section = soup.find('h2', string='Movie Details')
          if details_section:
            details_table = details_section.find_next('table')
        
        if details_table:
          rows = details_table.find_all('tr')
          for row in rows:
            cells = row.find_all(['td', 'th'])
            if len(cells) >= 2:
                key = cells[0].get_text(strip=True).replace('\xa0', ' ')
                value = cells[1].get_text(strip=True).replace('\xa0', ' ')
                
                # Clean up the key (remove colons and extra spaces)
                key = key.replace(':', '').strip()
                
                # Skip unwanted fields completely
                if key in ['Video Release', 'MPAA Rating', 'Franchise', 'Comparisons']:
                    continue
                
                # Handle Production Countries and Languages separately
                if key == 'Production Countries':
                    # Check if Languages data is mixed in
                    if 'Languages:' in value:
                        parts = value.split('Languages:')
                        movie_details['Production Countries'] = parts[0].strip()
                        if len(parts) > 1:
                            movie_details['Languages'] = parts[1].strip()
                    else:
                        movie_details['Production Countries'] = value
                elif key == 'Languages':
                    movie_details['Languages'] = value
                else:
                    # Store all other fields
                    movie_details[key] = value
        
        # Extract earliest release date from Domestic and International releases
        release_dates = []
        
        # Extract dates from Domestic Releases
        if 'Domestic Releases' in movie_details:
            domestic_text = movie_details['Domestic Releases']
            # Look for date patterns like "February 14th, 2025"
            domestic_dates = re.findall(r'([A-Za-z]+ \d{1,2}(?:st|nd|rd|th)?, \d{4})', domestic_text)
            release_dates.extend(domestic_dates)
        
        # Extract dates from International Releases
        if 'International Releases' in movie_details:
            intl_text = movie_details['International Releases']
            # Look for date patterns like "January 29th, 2025"
            intl_dates = re.findall(r'([A-Za-z]+ \d{1,2}(?:st|nd|rd|th)?, \d{4})', intl_text)
            release_dates.extend(intl_dates)
        
        # Find the earliest date
        if release_dates:
            try:
                # Convert dates to datetime objects for comparison
                parsed_dates = []
                for date_str in release_dates:
                    try:
                        # Handle ordinal suffixes (st, nd, rd, th)
                        clean_date = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
                        parsed_date = datetime.strptime(clean_date, '%B %d, %Y')
                        parsed_dates.append(parsed_date)
                    except:
                        continue
                
                if parsed_dates:
                    earliest_date = min(parsed_dates)
                    movie_details['Release Date'] = earliest_date.strftime('%B %d, %Y')
            except:
                pass
        
        # Remove the original release fields since we now have Release Date
        movie_details.pop('Domestic Releases', None)
        movie_details.pop('International Releases', None)
        return movie_details
    
    except asyncio.TimeoutError:
        print(f"Timeout error for {url}")
        timeout_urls.append(url)
        return {}
    except Exception as e:
        print(f"Error for {url}: {e}")
        return {}

async def scrape_batch(session, header, urls):
  """Async scraping, continued - callable function."""
  if session is None:
    async with aiohttp.ClientSession() as session:
      tasks = [scrape_movie_details_async(session, header, url) for url in urls]
      return await tqdm.gather(*tasks)
  else:
    tasks = [scrape_movie_details_async(session, header, url) for url in urls]
    return await tqdm.gather(*tasks)

In [None]:
# Collect all movie details as Series 
all_movie_details = []
urls = [df_WW_all['link'][x:x+200] for x in range(0, len(df_WW_all), 200)]

async def process_batch_with_error_handling(session, header, urls, batch_num):
    """Process one batch with error handling"""
    try:
        print(f"========== PROCESSING BATCH {batch_num:02d} ({len(urls)} URLs)... ==========")
        results = await scrape_batch(urls, session, header)
        
        # Filter out None results (failed scrapes)
        valid_results = [r for r in results if r is not None]
        
        if valid_results:
            # Convert to DataFrame
            df = pd.DataFrame(valid_results)
            
            # Save to CSV
            filename = f'Movie Details/movie_details_{batch_num:02d}.csv'
            df.to_csv(filename, index=False)
            print(f"✅ Batch {batch_num:02d} completed: {len(valid_results)} movies saved to {filename}")
            return len(valid_results)
        else:
            print(f"⚠️ Batch {batch_num:02d} completed but no valid data")
            return 0
            
    except Exception as e:
        print(f"❌ Error in batch {batch_num:02d}: {e}")
        return 0

async def process_all_batches():
    """Process all batches with error handling"""
    total_processed = 0

    async with aiohttp.ClientSession() as session:
        s = 13 # + 1
        batch_num = s
        for batch_urls in urls[s:s]:
            batch_num += 1
            header = random.choices(HEADERS_LIST)
            processed_count = await process_batch_with_error_handling(session, header, batch_urls, batch_num)
            total_processed += processed_count
            
            # Small delay between batches to be nice to the server
            await asyncio.sleep(1)
    
    print(f"\n🎉 All batches completed! Total movies processed: {total_processed}")

# Run the batch processing
await process_all_batches()

2800    https://www.the-numbers.com/movie/Kono-sekai-n...
2801    https://www.the-numbers.com/movie/Edge-of-Seve...
2802    https://www.the-numbers.com/movie/Desu-noto-Li...
2803    https://www.the-numbers.com/movie/Asura-The-Ci...
2804    https://www.the-numbers.com/movie/Suddenly-Sev...
                              ...                        
2995    https://www.the-numbers.com/movie/Big-Sick-The...
2996    https://www.the-numbers.com/movie/Confidential...
2997    https://www.the-numbers.com/movie/Downsizing#t...
2998    https://www.the-numbers.com/movie/I-Tonya-(201...
2999    https://www.the-numbers.com/movie/Mollys-Game#...
Name: link, Length: 200, dtype: object

## Concatenate to one dataframe -> Export to csv 

In [49]:
import glob
all_movie_details = [pd.read_csv(file) for file in glob.glob('Movie Details/*.csv')]
# Concatenate all series to a dataframe
if all_movie_details:
  movie_details_df = pd.concat(all_movie_details, axis=0, ignore_index=True)
  move_col = movie_details_df.pop('Release Date')
  movie_details_df.insert(1,'Release Date', move_col)
  print("DataFrame shape:", movie_details_df.shape)
  print("\nDataFrame columns:", movie_details_df.columns.tolist())
  print("\n==========WEBSCRAPING COMPLETED==========\nDataFrame content:")
else:
  movie_details_df = pd.DataFrame()

movie_details_df.to_csv('Movie Details/movie_details_0000.csv')
movie_details_df

EmptyDataError: No columns to parse from file