# Web crawling process (cont'd) - đang bỏ dở, có thể không làm

In [7]:
import pandas as pd
import requests
import warnings
# warnings.filterwarnings("ignore")

In [8]:
# Load the movie list from CSV; later cells read its 'link' column.
df_WW_all = pd.read_csv("WW2/WW_all.csv")
df_WW_all.head()  # preview the first rows

Unnamed: 0,Movie,link,Worldwide Box Office,Domestic Box Office,International Box Office,Domestic Share,Share Of Number One Market,Number One Market,Release Date,Distributor,Genre,Rank,Year
0,Star Wars Ep. III: Revenge of the Sith,https://www.the-numbers.com/movie/Star-Wars-Ep...,"$902,891,983","$414,378,291","$488,513,692",45.89%,10.6%,United Kingdom,"May 19, 2005",20th Century Fox,Adventure,1,2005
1,Harry Potter and the Goblet of Fire,https://www.the-numbers.com/movie/Harry-Potter...,"$885,923,981","$291,147,424","$594,776,557",32.86%,15.7%,Japan,"Nov 18, 2005",Warner Bros.,Adventure,2,2005
2,"The Chronicles of Narnia: The Lion, the Witch a…",https://www.the-numbers.com/movie/Chronicles-o...,"$720,539,572","$291,710,957","$428,828,615",40.49%,0.0%,New Zealand,"Dec 9, 2005",Walt Disney,Adventure,3,2005
3,War of the Worlds,https://www.the-numbers.com/movie/War-of-the-W...,"$606,836,535","$234,280,354","$372,556,181",38.61%,3.5%,Australia,"Jun 29, 2005",Paramount Pictures,Action,4,2005
4,King Kong,https://www.the-numbers.com/movie/King-Kong-(2...,"$556,906,378","$218,080,025","$338,826,353",39.16%,0.0%,New Zealand,"Dec 14, 2005",Universal,Adventure,5,2005


## Get Movie Details of each film
This is the hardest part: not only does it take time, but there is also a risk of being blocked by the site (HTTP 403 Forbidden).

In [None]:
# Scrape Movie Details section (not in table format)
from bs4 import BeautifulSoup
import re
from datetime import datetime

# Row labels of the details table that are dropped entirely.
_SKIPPED_FIELDS = frozenset(
    ['Video Release', 'MPAA Rating', 'Franchise', 'Comparisons']
)

# Matches dates such as "February 14th, 2025" or "May 19, 2005".
_RELEASE_DATE_RE = re.compile(r'([A-Za-z]+ \d{1,2}(?:st|nd|rd|th)?, \d{4})')

# Matches the ordinal suffix that follows a day number ("14th" -> "14").
_ORDINAL_SUFFIX_RE = re.compile(r'(\d+)(st|nd|rd|th)')

# Full month names first, then abbreviated ones ("Nov 18, 2005").
_DATE_FORMATS = ('%B %d, %Y', '%b %d, %Y')


def _parse_release_date(date_str):
    """Return a datetime for text like "May 19th, 2005", or None if unparseable."""
    clean_date = _ORDINAL_SUFFIX_RE.sub(r'\1', date_str)
    for date_format in _DATE_FORMATS:
        try:
            return datetime.strptime(clean_date, date_format)
        except ValueError:
            # Not this format (or not a real month name); try the next one.
            continue
    return None


def _earliest_release_date(release_texts):
    """Return the earliest date mentioned in any of the texts, or None."""
    parsed_dates = []
    for text in release_texts:
        for date_str in _RELEASE_DATE_RE.findall(text):
            parsed = _parse_release_date(date_str)
            if parsed is not None:
                parsed_dates.append(parsed)
    return min(parsed_dates) if parsed_dates else None


def _parse_details_table(details_table):
    """Turn the rows of the details table into a ``label -> text`` dict."""
    details = {}
    for row in details_table.find_all('tr'):
        cells = row.find_all(['td', 'th'])
        if len(cells) < 2:
            continue

        # Normalise non-breaking spaces; the label also loses its colon.
        key = cells[0].get_text(strip=True).replace('\xa0', ' ')
        key = key.replace(':', '').strip()
        value = cells[1].get_text(strip=True).replace('\xa0', ' ')

        if key in _SKIPPED_FIELDS:
            continue

        if key == 'Production Countries' and 'Languages:' in value:
            # The languages are sometimes glued onto the countries cell.
            parts = value.split('Languages:')
            details['Production Countries'] = parts[0].strip()
            details['Languages'] = parts[1].strip()
        else:
            details[key] = value
    return details


def scrape_movie_details(url):
    """
    Scrape the Movie Details section from a the-numbers.com movie page.

    Args:
        url: Address of the movie page to download.

    Returns:
        dict: Always holds ``'link'`` (the URL passed in). Each row of the
        details table adds one ``label -> text`` entry, except the labels
        in ``_SKIPPED_FIELDS``. ``'Release Date'`` (formatted like
        "May 18, 2005") is added when at least one date can be parsed from
        the 'Domestic Releases' / 'International Releases' rows; those two
        raw entries are always removed from the result.

    Raises:
        requests.RequestException: Propagated from the HTTP request
            (for example on a timeout). The HTTP status code itself is not
            checked, so a page without a details table just yields a dict
            holding only ``'link'``.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }

    # The context manager closes the session's connections once we are done.
    with requests.Session() as s:
        r = s.get(url, headers=header, timeout=15)
    soup = BeautifulSoup(r.text, 'html.parser')

    movie_details = {'link': url}

    # Preferred lookup: the table carrying the "movie-details" class.
    details_table = soup.find('table', {'class': 'movie-details'})
    if details_table is None:
        # Fallback: the first table following the "Movie Details" heading.
        details_section = soup.find('h2', string='Movie Details')
        if details_section is not None:
            details_table = details_section.find_next('table')

    if details_table is not None:
        movie_details.update(_parse_details_table(details_table))

    # The raw release texts are only needed to derive a single Release Date.
    release_texts = [
        movie_details.pop(field, None)
        for field in ('Domestic Releases', 'International Releases')
    ]
    earliest_date = _earliest_release_date(
        [text for text in release_texts if text]
    )
    if earliest_date is not None:
        movie_details['Release Date'] = earliest_date.strftime('%B %d, %Y')

    return movie_details

In [None]:
import asyncio
import aiohttp
import time

async def scrape_movie_details_async(session, url):
    """Async version of the scraper (the original note claims a 10-20x speed-up).

    Fetches ``url`` through ``session`` and parses the HTML with BeautifulSoup.

    NOTE(review): unfinished stub - ``movie_details`` is never assigned in this
    function, so unless a global of that name exists the ``return`` raises
    NameError, which the bare ``except`` below swallows, and the function
    returns None.
    """
    try:
        async with session.get(url, timeout=10) as response:
            html = await response.text()
            soup = BeautifulSoup(html, 'html.parser')
            # ... rest of your scraping code ...
            return movie_details
    # NOTE(review): a bare except hides every failure (network errors, bugs,
    # cancellation); the caller only ever sees None.
    except:
        return None

async def scrape_multiple_movies(urls, max_concurrency=None):
    """Scrape several movie pages concurrently over one shared aiohttp session.

    Args:
        urls: Iterable of page URLs.
        max_concurrency: Optional cap on the number of requests in flight at
            the same time. ``None`` (the default) or ``0`` starts every
            request at once, which is the previous behaviour; a small value
            is gentler on sites that block aggressive scrapers.

    Returns:
        list: One entry per URL, in input order: whatever
        ``scrape_movie_details_async`` returned or, because
        ``return_exceptions=True`` is used, the exception it raised.
    """
    # Created inside the coroutine so it belongs to the running event loop.
    limiter = asyncio.Semaphore(max_concurrency) if max_concurrency else None

    async def fetch(session, url):
        if limiter is None:
            return await scrape_movie_details_async(session, url)
        async with limiter:
            return await scrape_movie_details_async(session, url)

    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, url) for url in urls]
        return await asyncio.gather(*tasks, return_exceptions=True)

# Usage:
urls = ['url1', 'url2', 'url3', ...]  # list of URLs (placeholder values)
# NOTE(review): asyncio.run() raises RuntimeError when an event loop is already
# running (as in a Jupyter kernel); there, `await scrape_multiple_movies(urls)`
# would be needed instead.
results = asyncio.run(scrape_multiple_movies(urls))

In [133]:
# Convert scraped details to DataFrame
def details_to_series(details_dict):
    """
    Wrap one scraped movie-details dictionary in a single-row DataFrame.

    A falsy argument (``None`` or an empty dict) produces an empty
    DataFrame. Despite the function's name, the result is always a
    DataFrame, never a Series.
    """
    if details_dict:
        return pd.DataFrame([details_dict])
    return pd.DataFrame()

In [None]:
# Collect the details of each movie as one-row DataFrames
all_movie_details = []
urls = df_WW_all['link'][:2]  # only the first two links are scraped here
for url in urls:
    details = scrape_movie_details(url)
    if not details:
        # Nothing scraped for this URL: nothing to store or print.
        continue
    all_movie_details.append(details_to_series(details))
    print("Movie Details extracted:")
    for key, value in details.items():
        print(f"{key}: {value}")

# Concatenate the one-row frames into a single DataFrame
if all_movie_details:
    movie_details_df = pd.concat(all_movie_details, axis=0, ignore_index=True)
    # 'Release Date' is absent when no release date could be parsed for any
    # movie, so only move the column next to the front when it exists.
    if 'Release Date' in movie_details_df.columns:
        move_col = movie_details_df.pop('Release Date')
        movie_details_df.insert(1, 'Release Date', move_col)
    print("DataFrame shape:", movie_details_df.shape)
    print("\nDataFrame columns:", movie_details_df.columns.tolist())
    print("\n==========WEBSCRAPING COMPLETED==========\nDataFrame content:")
else:
    movie_details_df = pd.DataFrame()
movie_details_df.to_csv('WW2/movie_details.csv')
movie_details_df

Movie Details extracted:
link: https://www.the-numbers.com/movie/Star-Wars-Ep-III-Revenge-of-the-Sith#tab=summary
Running Time: 139 minutes
Keywords: Visual Effects,Good vs. Evil,Cyborg,Cloning,War,Betrayal,Death of a Spouse or Fiancée / Fiancé,Space Opera,Filmed in Shepperton Studios, Surrey, England,Filmed in Surrey, England,Filmed in England,Filmed in United Kingdom,Filmed in Elstree Studios, Hertfordshire, England,Filmed in Hertfordshire, England,Filmed in Fox Studios Australia, Sydney, Australia,Filmed in Sydney, Australia,Filmed in New South Wales, Australia,Filmed in Australia,Filmed in Phuket, Thailand,Filmed in Thailand,Filmed in Mount Etna, Italy,Filmed in Italy,Action Adventure
Source: Original Screenplay
Genre: Adventure
Production Method: Animation/Live Action
Creative Type: Science Fiction
Production/Financing Companies: Lucasfilm
Production Countries: United States
Languages: English
Release Date: May 18, 2005
Movie Details extracted:
link: https://www.the-numbers.com/mo

Unnamed: 0,Release Date,link,Running Time,Keywords,Source,Genre,Production Method,Creative Type,Production/Financing Companies,Production Countries,Languages
0,"May 18, 2005",https://www.the-numbers.com/movie/Star-Wars-Ep...,139 minutes,"Visual Effects,Good vs. Evil,Cyborg,Cloning,Wa...",Original Screenplay,Adventure,Animation/Live Action,Science Fiction,Lucasfilm,United States,English
0,"November 18, 2005",https://www.the-numbers.com/movie/Harry-Potter...,150 minutes,"Boarding School,Visual Effects,IMAX: DMR,Famil...",Based on Fiction Book/Short Story,Adventure,Animation/Live Action,Fantasy,"Warner Bros.,Heyday Films","United Kingdom,United States",English


In [132]:
# # Convert the scraped details to DataFrame
# # Test with Ne Zha 2
# url = 'https://www.the-numbers.com/movie/Ne-Zha-2-(2025-China)#tab=summary'
# details = scrape_movie_details(url)
# print("Movie Details extracted:")
# for key, value in details.items():
#   print(f"{key}: {value}")

# movie_details_df = details_to_series(details)
# move_col = movie_details_df.pop('Release Date')
# movie_details_df.insert(0,'Release Date', move_col)
# print("DataFrame shape:", movie_details_df.shape)
# print("\nDataFrame columns:", movie_details_df.columns.tolist())
# print("\nDataFrame content:")
# movie_details_df