<a href="https://colab.research.google.com/github/naimdsaiki/Machine-Learning/blob/main/Domestic_Opening_Week_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# 1. SETUP THE URL AND HEADERS
# We target the "All Time Domestic Opening Weekends" chart
url = "https://www.boxofficemojo.com/chart/top_opening_weekend/"

# HEADERS are crucial! They make our script look like a real browser (Chrome)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

print("Fetching data from Box Office Mojo...")

# 2. REQUEST THE PAGE
response = requests.get(url, headers=headers)

# Check if successful
if response.status_code != 200:
    print(f"Failed to retrieve page. Status code: {response.status_code}")
else:
    # 3. PARSE HTML
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the main table (usually has a specific class or is the first table)
    # Box Office Mojo tables are usually inside a div with class 'a-section'
    table = soup.find('table')

    movie_data = []

    # 4. EXTRACT DATA ROW BY ROW
    # We skip the first row usually because it's the header
    rows = table.find_all('tr')[1:]

    for row in rows:
        cols = row.find_all('td')

        # Ensure the row has data (some are empty spacers)
        if len(cols) > 2:
            title = cols[1].get_text(strip=True)
            opening_str = cols[2].get_text(strip=True)
            total_gross_str = cols[4].get_text(strip=True) # Lifetime domestic

            # Clean the currency strings immediately (remove '$' and ',')
            try:
                opening_val = float(opening_str.replace('$', '').replace(',', ''))
                total_val = float(total_gross_str.replace('$', '').replace(',', ''))
            except ValueError:
                continue # Skip if data is messy

            movie_data.append({
                'title': title,
                'domestic_opening': opening_val,
                'domestic_lifetime': total_val
            })

    # 5. CREATE DATAFRAME
    df_mojo = pd.DataFrame(movie_data)

    print(f"Successfully scraped {len(df_mojo)} movies.")
    print(df_mojo.head())

    # 6. SAVE TO CSV
    df_mojo.to_csv('domestic_openings.csv', index=False)
    print("Data saved to 'domestic_openings.csv'")

Fetching data from Box Office Mojo...
Successfully scraped 0 movies.
Empty DataFrame
Columns: []
Index: []
Data saved to 'domestic_openings.csv'


In [1]:
import requests
import pandas as pd

# 1. SETUP URL & HEADERS
url = "https://www.boxofficemojo.com/chart/top_opening_weekend/"
# We still need headers to look like a real browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

print("Fetching data...")
response = requests.get(url, headers=headers)

if response.status_code == 200:
    # 2. USE PANDAS TO FIND ALL TABLES
    # This returns a list of all tables found on the page
    tables = pd.read_html(response.text)

    print(f"Found {len(tables)} tables on the page.")

    # 3. IDENTIFY THE CORRECT TABLE
    # We loop through them to find the one with 'Opening' and 'Total Gross' columns
    df_mojo = None
    for t in tables:
        if 'Opening' in t.columns and 'Total Gross' in t.columns:
            df_mojo = t
            break

    if df_mojo is not None:
        print("Success! Data table found.")

        # 4. CLEAN THE DATA
        # Rename 'Release' to 'title' to match your other dataset
        df_mojo = df_mojo.rename(columns={'Release': 'title', 'Opening': 'domestic_opening'})

        # Remove '$', ',' and convert to numeric
        # We use regex=True to replace patterns
        cols_to_clean = ['domestic_opening', 'Total Gross']
        for col in cols_to_clean:
            df_mojo[col] = df_mojo[col].astype(str).str.replace(r'[$,]', '', regex=True)
            df_mojo[col] = pd.to_numeric(df_mojo[col], errors='coerce')

        # Select only what we need
        final_df = df_mojo[['title', 'domestic_opening']]

        # Show sample
        print(final_df.head())

        # Save
        final_df.to_csv('domestic_openings.csv', index=False)
        print("\nSaved to 'domestic_openings.csv'")

    else:
        print("Could not identify the correct table. The column names might have changed.")
else:
    print(f"Failed to connect. Status: {response.status_code}")

Fetching data...


  tables = pd.read_html(response.text)


Found 1 tables on the page.
Success! Data table found.
                                        title  domestic_opening
0                           Avengers: Endgame         357115007
1                     Spider-Man: No Way Home         260138569
2                      Avengers: Infinity War         257698183
3  Star Wars: Episode VII - The Force Awakens         247966675
4     Star Wars: Episode VIII - The Last Jedi         220009584

Saved to 'domestic_openings.csv'


In [3]:
import pandas as pd

# 1. LOAD BOTH DATASETS
# If your original file is named something else, change 'movies.csv' below
df_features = pd.read_csv('movies.csv')
df_target = pd.read_csv('domestic_openings.csv')

print(f"Original Features: {len(df_features)} movies")
print(f"Scraped Targets:   {len(df_target)} movies")

# 2. CLEAN TITLES FOR BETTER MATCHING
# Small differences like "Avatar " vs "avatar" can break the merge.
# We strip whitespace and make everything lowercase to maximize matches.
df_features['title_clean'] = df_features['title'].str.strip().str.lower()
df_target['title_clean'] = df_target['title'].str.strip().str.lower()

# 3. MERGE THE DATA
# We use an 'inner' join. This means we only keep movies that exist in BOTH files.
# We need both: Budget (from file 1) AND Opening Revenue (from file 2) to train.
df_merged = pd.merge(df_features, df_target, on='title_clean', how='inner')

# 4. CLEANUP
# The merge might create duplicate columns (like title_x and title_y). Let's tidy up.
# We keep the original 'title_x' as the main title.
df_merged = df_merged.rename(columns={'title_x': 'title'})

# specific columns we want to keep for training
cols_to_keep = ['title', 'budget', 'genres', 'domestic_opening', 'director_name']

# Check if these columns exist (to avoid errors if your CSV is slightly different)
available_cols = [c for c in cols_to_keep if c in df_merged.columns]
final_df = df_merged[available_cols]

print(f"\n--- Merge Complete ---")
print(f"Successfully matched: {len(final_df)} movies.")
print(f"Data Preview:")
print(final_df.head())

# 5. SAVE FINAL DATASET
final_df.to_csv('training_data_final.csv', index=False)
print("\nSaved as 'training_data_final.csv'. You are ready to train!")

Original Features: 4598 movies
Scraped Targets:   200 movies

--- Merge Complete ---
Successfully matched: 188 movies.
Data Preview:
                      title     budget  \
0                    Avatar  237000000   
1         Avengers: Endgame  356000000   
2  Avatar: The Way of Water  350000000   
3    Avengers: Infinity War  300000000   
4   Spider-Man: No Way Home  200000000   

                                              genres  domestic_opening  \
0  ['Action', 'Adventure', 'Fantasy', 'Science Fi...          77025481   
1         ['Adventure', 'Science Fiction', 'Action']         357115007   
2         ['Action', 'Adventure', 'Science Fiction']         134100226   
3         ['Adventure', 'Action', 'Science Fiction']         257698183   
4         ['Action', 'Adventure', 'Science Fiction']         260138569   

   director_name  
0  James Cameron  
1  Anthony Russo  
2  James Cameron  
3      Joe Russo  
4      Jon Watts  

Saved as 'training_data_final.csv'. You are ready to t