# Web crawling process (cont'd) - đang bỏ dở, có thể không làm

In [17]:
import pandas as pd
import requests
import warnings
# warnings.filterwarnings("ignore")

In [18]:
# Load the table stored in WW_all.csv; its `link` column supplies the
# movie-page URLs that are scraped further down in this notebook.
df_WW_all = pd.read_csv("WW_all.csv")
# Print column names, non-null counts and dtypes as a quick sanity check.
df_WW_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4829 entries, 0 to 4828
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Movie                       4829 non-null   object
 1   link                        4829 non-null   object
 2   Worldwide Box Office        4829 non-null   object
 3   Domestic Box Office         3496 non-null   object
 4   International Box Office    4786 non-null   object
 5   Domestic Share              3496 non-null   object
 6   Share Of Number One Market  2868 non-null   object
 7   Number One Market           2955 non-null   object
 8   Release Date                3003 non-null   object
 9   Distributor                 2980 non-null   object
 10  Genre                       3004 non-null   object
 11  Rank                        4829 non-null   int64 
 12  Year Recorded               4829 non-null   int64 
dtypes: int64(2), object(11)
memory usage: 490.6+ KB


## Get Movie Details of each film
This is the hardest part: not only does it take time, but there is also a risk of being blocked by the site (Error 403 Forbidden)...

In [19]:
# Scrape Movie Details section (not in table format)
from bs4 import BeautifulSoup
import re
from datetime import datetime

def _collect_detail_fields(details_table):
    """Convert the rows of a Movie Details table into a ``{label: text}`` dict.

    Rows with fewer than two cells are ignored, as are the labels listed in
    ``skipped``. Insertion order follows the row order of the table.
    """
    skipped = {'Video Release', 'MPAA Rating', 'Franchise', 'Comparisons'}
    fields = {}
    for row in details_table.find_all('tr'):
        cells = row.find_all(['td', 'th'])
        if len(cells) < 2:
            continue

        # Normalise non-breaking spaces, then drop colons from the label.
        key = cells[0].get_text(strip=True).replace('\xa0', ' ')
        key = key.replace(':', '').strip()
        value = cells[1].get_text(strip=True).replace('\xa0', ' ')

        if key in skipped:
            continue

        if key == 'Production Countries' and 'Languages:' in value:
            # The languages text is sometimes fused into the countries cell;
            # split it back into two separate fields.
            parts = value.split('Languages:')
            fields['Production Countries'] = parts[0].strip()
            fields['Languages'] = parts[1].strip()
        else:
            fields[key] = value
    return fields


def _earliest_release_date(*release_texts):
    """Return the earliest date found in the given texts, or ``None``.

    Dates are recognised in the form "February 14th, 2025" (ordinal suffix
    optional) and returned formatted as ``'%B %d, %Y'``.
    """
    date_pattern = re.compile(r'([A-Za-z]+ \d{1,2}(?:st|nd|rd|th)?, \d{4})')
    parsed_dates = []
    for text in release_texts:
        for date_str in date_pattern.findall(text):
            # Strip ordinal suffixes (st, nd, rd, th) so strptime can parse it.
            clean_date = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
            try:
                parsed_dates.append(datetime.strptime(clean_date, '%B %d, %Y'))
            except ValueError:
                # The leading word was not a full month name; skip this match.
                continue
    if not parsed_dates:
        return None
    return min(parsed_dates).strftime('%B %d, %Y')


def scrape_movie_details(url):
    """
    Scrape the Movie Details section from a the-numbers.com movie page.

    Parameters
    ----------
    url : str
        Address of the movie page to download.

    Returns
    -------
    dict
        Always contains ``'link'`` (the given url). Every kept row of the
        Movie Details table is added under its label, and ``'Release Date'``
        holds the earliest date found in the 'Domestic Releases' and
        'International Releases' rows (those two rows themselves are removed).

    Notes
    -----
    Connection errors and timeouts raised by ``requests`` propagate to the
    caller. An HTTP error status (e.g. 403 Forbidden) does not raise: a
    warning is emitted and the page is still parsed, which normally yields a
    record holding only ``'link'``.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }

    # The context manager closes the session (and its pooled connections)
    # even if the request raises.
    with requests.Session() as s:
        r = s.get(url, headers=header, timeout=15)
        if not r.ok:
            # Surface blocked/failed pages instead of silently producing an
            # almost empty record.
            warnings.warn(f"HTTP {r.status_code} while fetching {url}")
        soup = BeautifulSoup(r.text, 'html.parser')

    movie_details = {'link': url}

    # Prefer the table marked with the 'movie-details' class; otherwise fall
    # back to the first table after the "Movie Details" heading.
    details_table = soup.find('table', {'class': 'movie-details'})
    if details_table is None:
        details_section = soup.find('h2', string='Movie Details')
        if details_section:
            details_table = details_section.find_next('table')

    if details_table is not None:
        movie_details.update(_collect_detail_fields(details_table))

    # The raw release rows are only needed to derive a single Release Date.
    earliest = _earliest_release_date(
        movie_details.pop('Domestic Releases', ''),
        movie_details.pop('International Releases', ''),
    )
    if earliest is not None:
        movie_details['Release Date'] = earliest

    return movie_details

## Async function to scrape movie details from 4824 URLs

In [None]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import re
from datetime import datetime
from tqdm import tqdm as TQ

# Request headers attached to every asynchronous page fetch.
header = dict([
    ("User-Agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36"),
    ("X-Requested-With", "XMLHttpRequest"),
])

# Number of scraping coroutines allowed past the semaphore at the same time.
max_concurrency = 10
sem = asyncio.Semaphore(value=max_concurrency)

def _parse_movie_details_page(html, url):
    """Extract the Movie Details record from the HTML of a movie page.

    Returns a dict that always contains ``'link'`` (the given url), followed
    by every kept row of the Movie Details table and, when a date could be
    parsed from the 'Domestic Releases' / 'International Releases' rows, a
    ``'Release Date'`` entry formatted as ``'%B %d, %Y'``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    movie_details = {'link': url}

    # Prefer the table marked with the 'movie-details' class; otherwise fall
    # back to the first table after the "Movie Details" heading.
    details_table = soup.find('table', {'class': 'movie-details'})
    if details_table is None:
        details_section = soup.find('h2', string='Movie Details')
        if details_section:
            details_table = details_section.find_next('table')

    skipped = {'Video Release', 'MPAA Rating', 'Franchise', 'Comparisons'}
    if details_table is not None:
        for row in details_table.find_all('tr'):
            cells = row.find_all(['td', 'th'])
            if len(cells) < 2:
                continue

            # Normalise non-breaking spaces, then drop colons from the label.
            key = cells[0].get_text(strip=True).replace('\xa0', ' ')
            key = key.replace(':', '').strip()
            value = cells[1].get_text(strip=True).replace('\xa0', ' ')

            if key in skipped:
                continue

            if key == 'Production Countries' and 'Languages:' in value:
                # The languages text is sometimes fused into the countries
                # cell; split it back into two separate fields.
                parts = value.split('Languages:')
                movie_details['Production Countries'] = parts[0].strip()
                movie_details['Languages'] = parts[1].strip()
            else:
                movie_details[key] = value

    # Collect every date such as "February 14th, 2025" from both release rows;
    # the rows themselves are dropped once the dates have been read.
    date_pattern = re.compile(r'([A-Za-z]+ \d{1,2}(?:st|nd|rd|th)?, \d{4})')
    parsed_dates = []
    for field in ('Domestic Releases', 'International Releases'):
        for date_str in date_pattern.findall(movie_details.pop(field, '')):
            # Strip ordinal suffixes (st, nd, rd, th) so strptime can parse it.
            clean_date = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
            try:
                parsed_dates.append(datetime.strptime(clean_date, '%B %d, %Y'))
            except ValueError:
                # The leading word was not a full month name; skip this match.
                continue

    if parsed_dates:
        movie_details['Release Date'] = min(parsed_dates).strftime('%B %d, %Y')

    return movie_details


async def scrape_movie_details_async(async_session, url):
    """
    Async version: download one movie page and return its details record.

    Parameters
    ----------
    async_session : aiohttp.ClientSession
        Open session used to issue the GET request.
    url : str
        Address of the movie page to download.

    Returns
    -------
    dict
        ``'link'`` plus the parsed Movie Details fields and, when available,
        ``'Release Date'``.

    Notes
    -----
    The download waits on the module-level semaphore ``sem``. Network errors
    and timeouts propagate to the caller. An HTTP error status (e.g. 403
    Forbidden) does not raise: a warning is emitted and the page is still
    parsed, which normally yields a record holding only ``'link'``.
    """
    # Only the download is throttled; parsing happens after the slot is freed.
    async with sem:
        async with async_session.get(
            url, headers=header, timeout=aiohttp.ClientTimeout(total=15)
        ) as r:
            status = r.status
            html = await r.text()

    if status >= 400:
        # Surface blocked/failed pages instead of silently producing an almost
        # empty record.
        warnings.warn(f"HTTP {status} while fetching {url}")

    movie_details = _parse_movie_details_page(html, url)

    # Output for fun hehe
    print("========================================")
    for key, value in movie_details.items():
        print(f"{key}: {value}")

    return movie_details

async def scrape_all(urls):
    """Scrape every URL in ``urls`` through one shared aiohttp session.

    Returns the per-URL results in the same order as ``urls``; the first
    exception raised by any scrape is propagated to the caller.
    """
    async with aiohttp.ClientSession() as client:
        pending = []
        for page_url in urls:
            pending.append(scrape_movie_details_async(client, page_url))
        results = await asyncio.gather(*pending, return_exceptions=False)
        return results

In [53]:
# Trial run: scrape the detail pages of the first five links in df_WW_all.
# scrape_all returns one dict per URL (they are converted to Series in a later cell).
# Placeholder value; it is overwritten by the scrape result below.
all_movie_details = []
urls = df_WW_all['link'][:5]

# Relies on the notebook's support for top-level `await`.
all_movie_details = await scrape_all(urls)

Movie Details extracted:
link: https://www.the-numbers.com/movie/War-of-the-Worlds#tab=summary
Running Time: 116 minutes
Keywords: Alien Invasion,Visual Effects,Voiceover/Narration,Set in Illinois,Set in England,Set in France,Set in Italy,Restaurants,War,2000s,End of the World,Car Accident,Action Adventure,Remake
Source: Based on Fiction Book/Short Story
Genre: Action
Production Method: Animation/Live Action
Creative Type: Science Fiction
Production/Financing Companies: Paramount Pictures,DreamWorks Pictures,Amblin Entertainment,Cruise-Wagner
Production Countries: United States
Languages: English
Release Date: June 24, 2005
Movie Details extracted:
link: https://www.the-numbers.com/movie/Chronicles-of-Narnia-The-Lion-the-Witch-and-the-Wardrobe-The#tab=summary
Running Time: 140 minutes
Keywords: Talking Animals,Visual Effects,Alternative Dimensions / Parallel universe,World War II,Family Adventure
Source: Based on Fiction Book/Short Story
Genre: Adventure
Production Method: Animation/Li

In [60]:
# Wrap every scraped record in a Series so the records can be aligned by field name.
all_movie_details = [pd.Series(dict_item) for dict_item in all_movie_details]
# Concatenate all series to a dataframe (one row per movie, one column per field)
if all_movie_details:
  movie_details_df = pd.concat(all_movie_details, axis=1, ignore_index=True).T
  # 'Release Date' only exists when at least one page yielded a parsable date
  # (it is absent when, e.g., every request was blocked), so guard the move
  # instead of letting pop() raise KeyError.
  if 'Release Date' in movie_details_df.columns:
    move_col = movie_details_df.pop('Release Date')
    # Place the release date right after the 'link' column.
    movie_details_df.insert(1,'Release Date', move_col)
  print("DataFrame shape:", movie_details_df.shape)
  print("\nDataFrame columns:", movie_details_df.columns.tolist())
  print("\n==========WEBSCRAPING COMPLETED==========\nDataFrame content:")
else:
  # Nothing was scraped: fall back to an empty frame so the cell still runs.
  movie_details_df = pd.DataFrame()

# Persist the result (the row index is written as the first CSV column).
movie_details_df.to_csv('movie_details.csv')
movie_details_df

DataFrame shape: (5, 11)

DataFrame columns: ['link', 'Release Date', 'Running Time', 'Keywords', 'Source', 'Genre', 'Production Method', 'Creative Type', 'Production/Financing Companies', 'Production Countries', 'Languages']

DataFrame content:


Unnamed: 0,link,Release Date,Running Time,Keywords,Source,Genre,Production Method,Creative Type,Production/Financing Companies,Production Countries,Languages
0,https://www.the-numbers.com/movie/Star-Wars-Ep...,"May 18, 2005",139 minutes,"Visual Effects,Good vs. Evil,Cyborg,Cloning,Wa...",Original Screenplay,Adventure,Animation/Live Action,Science Fiction,Lucasfilm,United States,English
1,https://www.the-numbers.com/movie/Harry-Potter...,"November 18, 2005",150 minutes,"Boarding School,Visual Effects,IMAX: DMR,Famil...",Based on Fiction Book/Short Story,Adventure,Animation/Live Action,Fantasy,"Warner Bros.,Heyday Films","United Kingdom,United States",English
2,https://www.the-numbers.com/movie/Chronicles-o...,"December 09, 2005",140 minutes,"Talking Animals,Visual Effects,Alternative Dim...",Based on Fiction Book/Short Story,Adventure,Animation/Live Action,Fantasy,"Walt Disney Pictures,Walden Media,Mark Johnson","United States,United Kingdom",English
3,https://www.the-numbers.com/movie/War-of-the-W...,"June 24, 2005",116 minutes,"Alien Invasion,Visual Effects,Voiceover/Narrat...",Based on Fiction Book/Short Story,Action,Animation/Live Action,Science Fiction,"Paramount Pictures,DreamWorks Pictures,Amblin ...",United States,English
4,https://www.the-numbers.com/movie/King-Kong-(2...,"December 14, 2005",189 minutes,"Animals Gone Bad,Creature Feature,Visual Effec...",Original Screenplay,Adventure,Animation/Live Action,Fantasy,Wingnut Films,"New Zealand,United States",English
