In [None]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# VADER analyzer instance used by get_sentiment_score
analyzer = SentimentIntensityAnalyzer()

# Cleaned IMDb reviews; the "Review Content" column is what gets scored
df = pd.read_csv('data/IMDb_Cleaned_Reviews.csv')

# Define a function to compute the compound sentiment score for a given text
def get_sentiment_score(text):
    """Return VADER's compound sentiment score for a piece of text.

    Args:
        text: The text to score. Missing values (``None`` or NaN) are scored
            as neutral; any other non-string value is converted with ``str``
            before scoring.

    Returns:
        The ``'compound'`` entry of ``analyzer.polarity_scores(text)``, or
        ``0.0`` when ``text`` is missing.
    """
    if not isinstance(text, str):
        # pandas reads an empty CSV cell as NaN (a float), which is not text
        # that can be scored; report it as neutral rather than crashing the
        # whole column-wide apply.
        if pd.isna(text):
            return 0.0
        text = str(text)

    scores = analyzer.polarity_scores(text)
    return scores['compound']

# Score each review's text and store the result in a new "Sentiment Score" column
df['Sentiment Score'] = df['Review Content'].map(get_sentiment_score)

# Write the scored reviews to a separate CSV, leaving out the row index
df.to_csv('data/IMDb_Cleaned_Reviews_With_Sentiment.csv', index=False)

# Show the first five rows as a quick sanity check
print(df.head(5))

  Movie Name                             Review Title  \
0      Anora            pretty woman meets uncut gems   
1      Anora                         cinderella lives   
2      Anora                         expect like much   
3      Anora                       get hype around it   
4      Anora  chaotic  heartbreaking glimpse survival   

                                      Review Content   Review Date  \
0  anora early career magnum opus sean baker cont...   Nov 3, 2024   
1  watched one best films year  athens festival n...   Oct 4, 2024   
2  movie traumatized abused child grows perpetuat...  Dec 20, 2024   
3  first 40 minutes quite misleading  likely catc...  Dec 19, 2024   
4  sean baker turns anora one movies seem glide e...  Jan 29, 2025   

   Sentiment Score  
0           0.9944  
1           0.9889  
2          -0.4976  
3           0.9848  
4           0.9812  


In [5]:
import requests
from bs4 import BeautifulSoup

def scrape_worldwide_box_office(movie_id, timeout=10):
    """Scrape the worldwide box office figure from a title's Box Office Mojo credits page.

    Args:
        movie_id: Title identifier inserted into the page URL
            (e.g. "tt28607951").
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Defaults to 10.

    Returns:
        The stripped text of the first element with class "money" found in
        the parent ``div`` of a span mentioning "Worldwide"
        (e.g. "$47,680,966").

    Raises:
        RuntimeError: If the response status code is not 200.
        LookupError: If no "Worldwide" span leads to a "money" element.

    Connection errors and timeouts raised by ``requests.get`` are not caught
    here and propagate to the caller.
    """
    url = f"https://www.boxofficemojo.com/title/{movie_id}/credits/"

    # Always pass a timeout: without one, requests waits indefinitely on an
    # unresponsive server and the notebook cell never finishes.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(f"Failed to retrieve page for movie ID {movie_id} (status code: {response.status_code})")

    soup = BeautifulSoup(response.text, 'html.parser')

    # Spans whose own text mentions "Worldwide" serve as labels for the figure.
    worldwide_spans = soup.find_all('span', string=lambda text: text and "Worldwide" in text)

    # For each label, look inside its nearest enclosing div for the amount.
    for span in worldwide_spans:
        parent_div = span.find_parent('div')
        if parent_div:
            money_elem = parent_div.find(class_="money")
            if money_elem:
                return money_elem.get_text(strip=True)

    # No label/amount pair matched anywhere on the page.
    raise LookupError("Worldwide box office figure not found.")

# Example: look up one title's worldwide gross.
# Swap in the ID of the movie you want to query.
movie_id = "tt28607951"
print(scrape_worldwide_box_office(movie_id=movie_id))

$47,680,966
