# Webcrawling process

In [16]:
import warnings
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
# warnings.filterwarnings("ignore")

## Generate Links

In [2]:
# One listing URL per release year (2005 through 2025) for each market:
# domestic, international and worldwide.
_YEARS = range(2005, 2005 + 21)

url_DOM = [
  f'https://www.the-numbers.com/market/{year}/top-grossing-movies'
  for year in _YEARS
]
url_INTL = [
  f'https://www.the-numbers.com/box-office-records/international/all-movies/cumulative/released-in-{year}'
  for year in _YEARS
]
url_WW = [
  f'https://www.the-numbers.com/box-office-records/worldwide/all-movies/cumulative/released-in-{year}'
  for year in _YEARS
]

In [3]:
def expand_paginated_urls(base_urls, start=101, step=100, max_pages=3, start_year=2005):
  """Pair each base URL with its year and append its paginated variants.

  ``base_urls`` is expected to hold one URL per consecutive year, the first
  one belonging to ``start_year``. Years are derived from the position in
  ``base_urls``, so the input may contain any number of URLs.

  Args:
    base_urls: Iterable of first-page URLs, one per year in ascending order.
    start: Offset appended to the base URL for the first extra page.
    step: Increment between the offsets of consecutive extra pages.
    max_pages: Number of extra pages generated per base URL.
    start_year: Year assigned to the first entry of ``base_urls``.

  Returns:
    A list of ``(year, url)`` tuples. For every base URL it contains the base
    URL itself followed by ``f"{base}/{offset}"`` for each offset
    ``start, start + step, ...`` (``max_pages`` of them).
  """
  # Offsets are the same for every year: 101, 201, 301, ... by default.
  offsets = range(start, start + step * max_pages, step)

  expanded = []
  for year, base in enumerate(base_urls, start=start_year):
    expanded.append((year, base))
    expanded.extend((year, f"{base}/{offset}") for offset in offsets)
  return expanded

# Apply the same pagination settings to the URL list of every market.
_PAGING = {'start': 101, 'step': 100, 'max_pages': 3}

url_DOM_2, url_INTL_2, url_WW_2 = (
  expand_paginated_urls(base_list, **_PAGING)
  for base_list in (url_DOM, url_INTL, url_WW)
)

## Get data tables 2005-2025

In [67]:
from collections import defaultdict
import requests

def get_table_from(urls=(), index=0, mkt='', keep=None, links=False, to_csv=False):
  """Download one HTML table per URL and combine the pages of each year.

  Pages that cannot be fetched or parsed are skipped with a warning instead
  of aborting the whole run.

  Args:
    urls: Iterable of ``(year, url)`` pairs.
    index: Position of the wanted table in the list returned by
      ``pd.read_html`` for each page.
    mkt: Market label used in the result keys and in the CSV paths. When it
      is ``'DOM'`` the last two rows of every page are dropped (presumably
      non-movie summary rows; confirm against the site).
    keep: Optional list of column names to keep. ``None`` or an empty list
      keeps every column.
    links: When true, add a ``link`` column with the absolute URL behind each
      ``Movie`` cell (``None`` where the cell has no hyperlink).
    to_csv: When true, write every per-year frame to ``{mkt}/{name}.csv``.

  Returns:
    A dict mapping ``f"df_{mkt}_{year}"`` to the row-wise concatenation of
    all pages that were successfully read for that year.
  """
  header = {  # browser-like headers sent with every request
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
  }
  base_url = 'https://www.the-numbers.com'
  rows_removed = -2 if mkt == 'DOM' else None
  year_to_df = defaultdict(list)  # year -> list of page DataFrames

  # The context manager closes the session's connections when done.
  with requests.Session() as s:
    for yr, link in urls:
      try:
        r = s.get(link, headers=header, timeout=15)
        r.raise_for_status()
        html = r.text

        # Wrap the markup in StringIO: passing literal HTML strings to
        # read_html is deprecated in recent pandas releases.
        df = pd.read_html(StringIO(html))[index]
        if rows_removed is not None:
          df = df[:rows_removed]
        # Work on a copy so adding columns never writes into a slice.
        df = df.copy()
        df['trunc'] = df['Movie'].astype(str).str.slice(0, 20)

        if links:
          df_link = pd.read_html(StringIO(html), extract_links='body')[index]
          # Both frames come from the same table, so rows line up by index.
          # Assigning by index (instead of merging on the title) keeps the
          # row count unchanged even when a title occurs more than once.
          df['link'] = df_link['Movie'].apply(
            lambda v: f'{base_url}{v[1]}' if v[1] is not None else None
          )

        if keep:
          df = df[keep]
        year_to_df[yr].append(df)
      except Exception as exc:
        # Best effort: report the page that failed and carry on.
        warnings.warn(f"Skipping {link}: {exc!r}", stacklevel=2)
        continue

  # Concatenate all pages per year
  d = {
    f"df_{mkt}_{yr}": pd.concat(frames, axis=0, ignore_index=True)
    for yr, frames in year_to_df.items()
    if frames
  }

  if to_csv and d:
    out_dir = Path(mkt)
    # Create the target folder so the export works on a fresh checkout.
    out_dir.mkdir(parents=True, exist_ok=True)
    for name, df in d.items():
      df.to_csv(out_dir / f'{name}.csv', index=False)
  return d

### Domestic:

In [5]:
# df_DOM = get_table_from(url_DOM_2[:],0,mkt='DOM',
# keep=['Movie','trunc','Release Date','Distributor','Genre'])
# df_DOM

### International:

In [6]:
# df_INTL = get_table_from(url_INTL_2[:],1,mkt='INTL',
# keep=['Movie','trunc','Share Of Number One Market','Number One Market'])
# df_INTL

### Worldwide:

In [None]:
# df_WW = get_table_from(url_WW_2[:],1,mkt='WW',links=True,to_csv=True)

## Merge tables to `df_WW`
Phần này vẫn làm ở Python do phải điền thêm data thiếu..

In [7]:
import glob, re
import pandas as pd

# Load all year CSVs from folders and build per-year dataframes
def load_year_frames(folder, prefix):
  """Read every ``{prefix}_*.csv`` file in ``folder`` into a dict keyed by year.

  The year is taken from a trailing ``_<4 digits>.csv`` in the file path;
  files without such a suffix are ignored. Files that cannot be read are
  skipped with a warning. When a frame has a ``Movie`` column but no
  ``trunc`` column, ``trunc`` is added as the first 20 characters of the
  title.

  Args:
    folder: Directory that contains the CSV files.
    prefix: File-name prefix placed before ``_<year>.csv``.

  Returns:
    A dict mapping the integer year to its DataFrame, ordered by year.
  """
  out = {}
  # glob returns paths in arbitrary order; sort for a reproducible result.
  for path in sorted(glob.glob(f"{folder}/{prefix}_*.csv")):
    m = re.search(r"_(\d{4})\.csv$", path)
    if not m:
      continue
    try:
      df = pd.read_csv(path)
    except Exception as exc:
      # Best effort: report the unreadable file and keep loading the rest.
      warnings.warn(f"Could not read {path}: {exc!r}", stacklevel=2)
      continue
    if 'trunc' not in df.columns and 'Movie' in df.columns:
      df['trunc'] = df['Movie'].astype(str).str.slice(0, 20)
    out[int(m.group(1))] = df
  # Order by year so downstream loops do not depend on file-name sorting.
  return dict(sorted(out.items()))

ww_by_year   = load_year_frames('WW',   'df_WW')
intl_by_year = load_year_frames('INTL', 'df_INTL')
dom_by_year  = load_year_frames('DOM',  'df_DOM')

# Columns taken from the international / domestic tables. 'trunc' (the first
# 20 characters of the title) is the join key shared by all three sources.
INTL_COLS = ['trunc', 'Share Of Number One Market', 'Number One Market']
DOM_COLS = ['trunc', 'Release Date', 'Distributor', 'Genre']


def _lookup(frame, cols):
  """Return ``cols`` of ``frame`` with at most one row per 'trunc' key.

  A left merge against a lookup table with repeated keys would duplicate the
  matching worldwide rows, so only the first row per key is kept.
  """
  return frame[cols].drop_duplicates(subset='trunc')


def _move_column(frame, name, position):
  """Move column ``name`` of ``frame`` to ``position`` in place.

  Does nothing when the column is absent, and clamps ``position`` to the
  number of remaining columns so a narrower frame does not raise.
  """
  if name not in frame.columns:
    return
  column = frame.pop(name)
  frame.insert(min(position, len(frame.columns)), name, column)


merged_frames = []
# Iterate in year order so the final table does not depend on file order.
for yr in sorted(ww_by_year):
  ww = ww_by_year[yr].copy()
  if yr in intl_by_year:
    ww = pd.merge(ww, _lookup(intl_by_year[yr], INTL_COLS), on='trunc', how='left')
  if yr in dom_by_year:
    ww = pd.merge(ww, _lookup(dom_by_year[yr], DOM_COLS), on='trunc', how='left')
  ww['Year'] = yr
  # The join key is only needed for merging.
  ww = ww.drop(columns=['trunc'], errors='ignore')
  merged_frames.append(ww)

# Concatenate all years
if merged_frames:
  df_WW_all = pd.concat(merged_frames, ignore_index=True)
else:
  df_WW_all = pd.DataFrame()

# Reorder for readability: 'Rank' to position 11 and 'link' right after the
# title. Both moves are skipped when the column is missing (e.g. empty frame).
_move_column(df_WW_all, 'Rank', 11)
_move_column(df_WW_all, 'link', 1)

print(f"Years loaded - WW: {len(ww_by_year)}, INTL: {len(intl_by_year)}, DOM: {len(dom_by_year)}")
print(f"Final rows: {len(df_WW_all)}")

Years loaded - WW: 21, INTL: 21, DOM: 21
Final rows: 4829


In [8]:
# Persist the merged table. The output folder is created first so the export
# does not fail when 'WW2' does not exist yet.
ww_all_path = Path('WW2/WW_all.csv')
ww_all_path.parent.mkdir(parents=True, exist_ok=True)
df_WW_all.to_csv(ww_all_path, index=False)

In [9]:
# Preview the last rows of the merged table as a quick sanity check.
df_WW_all.tail()

Unnamed: 0,Movie,link,Worldwide Box Office,Domestic Box Office,International Box Office,Domestic Share,Share Of Number One Market,Number One Market,Release Date,Distributor,Genre,Rank,Year
4824,Io Sono La Fine Del Mondo,https://www.the-numbers.com/movie/Io-Sono-La-F...,"$10,164,861",,"$10,164,861",,100.0%,Italy,,,,137,2025
4825,Three Kingdoms: Starlit Heroes (三国的星空第一部)…,https://www.the-numbers.com/movie/Three-Kingdo...,"$10,138,632",,"$10,138,632",,,,,,,138,2025
4826,Na derevnyu dedushke (На деревню дедушке)…,https://www.the-numbers.com/movie/Na-derevnyu-...,"$10,124,608",,"$10,124,608",,100.0%,Russia (CIS),,,,139,2025
4827,Dracula: A Love Tale,https://www.the-numbers.com/movie/Dracula-A-Lo...,"$10,092,908",,"$10,092,908",,87.2%,Russia (CIS),,,,140,2025
4828,Red Silk (Красный шелк),https://www.the-numbers.com/movie/Red-Silk-(20...,"$10,016,598",,"$10,016,598",,82.4%,Russia (CIS),,,,141,2025


In [5]:
# Summarise the column dtypes and non-null counts of the merged table.
# NOTE(review): the saved output of this cell lists 'Rank' at position 9,
# while the tail() preview shows it just before 'Year'; the output looks
# stale, so re-run after Restart & Run All to confirm.
df_WW_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4829 entries, 0 to 4828
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Movie                       4829 non-null   object
 1   link                        4829 non-null   object
 2   Worldwide Box Office        4829 non-null   object
 3   Domestic Box Office         3496 non-null   object
 4   International Box Office    4786 non-null   object
 5   Domestic Share              3496 non-null   object
 6   Share Of Number One Market  2868 non-null   object
 7   Number One Market           2955 non-null   object
 8   Release Date                3003 non-null   object
 9   Rank                        4829 non-null   int64 
 10  Distributor                 2980 non-null   object
 11  Genre                       3004 non-null   object
 12  Year                        4829 non-null   int64 
dtypes: int64(2), object(11)
memory usage: 490.6+ KB
