# Scraping IMDB User Reviews
- To answer how Americans felt about Barbie vs. Oppenheimer, we decided the first step was to scrape and collect user reviews from the popular movie rating website IMDB.

### In this build:
- Requirements:
    - Selenium `pip install selenium`
    - Pandas `pip install pandas`
- Links to User Reviews
    - [Oppenheimer](https://www.imdb.com/title/tt15398776/reviews)
    - [Barbie](https://www.imdb.com/title/tt1517268/reviews)
    

In [89]:
from selenium import webdriver
import selenium
import pandas as pd
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By  # Import By

# Location of the WebDriver executable; scrape_reviews() hands this value to
# selenium's Service(). "PATH_TO_DRIVER" is a placeholder that must be replaced.
PATH = r"PATH_TO_DRIVER" # Update to the path of your WebDriver
def scrape_reviews(url, max_pages=50):
    """Collect the visible text of every review shown on a reviews page.

    Opens ``url`` in a Chrome session, repeatedly clicks the element with id
    ``load-more-trigger`` to reveal further reviews, and then reads the text
    of every element with class ``review-container``.

    Args:
        url: Address of the reviews page to scrape.
        max_pages: Upper bound on the number of pages to load, counting the
            initially displayed one, so at most ``max_pages - 1`` clicks are
            made. The default of 50 was picked by the original author as a
            safe number for gathering at least 1000 reviews.

    Returns:
        A list of strings, one per review container whose text could be read.
    """
    # Local import: selenium's common base class for driver/element failures.
    from selenium.common.exceptions import WebDriverException

    service = Service(PATH)  # PATH must point at the WebDriver executable
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=service, options=options)

    # try/finally guarantees the browser is closed even if scraping fails.
    try:
        driver.get(url)

        # Element lookups will poll for up to 5 seconds before giving up.
        driver.implicitly_wait(5)

        page = 1
        while page < max_pages:
            try:
                driver.find_element(By.ID, 'load-more-trigger').click()
            except WebDriverException:
                # Button missing or not clickable: nothing more to load.
                break
            page += 1

        review_texts = []
        # Iterate over every container (the last one used to be skipped).
        for container in driver.find_elements(By.CLASS_NAME, 'review-container'):
            try:
                review_texts.append(container.text)
            except WebDriverException:
                # Best effort: skip a container whose text cannot be read.
                continue
        return review_texts
    finally:
        driver.quit()

# URLs for the "Oppenheimer" and "Barbie" user reviews.
# Title IDs follow the links in the notebook header:
# tt15398776 is Oppenheimer, tt1517268 is Barbie.
oppenheimer_reviews_url = 'https://www.imdb.com/title/tt15398776/reviews'
barbie_reviews_url = 'https://www.imdb.com/title/tt1517268/reviews'

# Scrape the raw review text for each movie
oppenheimer_reviews = scrape_reviews(oppenheimer_reviews_url)
barbie_reviews = scrape_reviews(barbie_reviews_url)


In [90]:
# Extract date, and review text from oppenheimer_reviews and barbie_reviews
def create_df(reviews):
    """Turn raw review strings into a table of score, title, date and text.

    Each review string is split on newlines and the fields are taken by
    position:

    * ``score`` - line 0
    * ``title`` - line 1
    * ``date``  - the last two space-separated tokens of line 2
    * ``text``  - line 3 (empty string when the review has no line 3)

    Args:
        reviews: Iterable of raw review strings.

    Returns:
        A DataFrame with the columns ``score``, ``title``, ``date`` and
        ``text``; the raw review string itself is not kept.
    """
    frame = pd.DataFrame(reviews, columns=['review'])
    raw = frame['review']

    # Column name -> how to pull that field out of a review's list of lines.
    extractors = {
        'score': lambda rows: rows[0],
        'title': lambda rows: rows[1],
        'date': lambda rows: ' '.join(rows[2].split(' ')[-2:]),
        'text': lambda rows: ' '.join(rows[3:4]),
    }
    for column, extract in extractors.items():
        # Bind `extract` as a default so each lambda keeps its own extractor.
        frame[column] = raw.apply(
            lambda blob, extract=extract: extract(blob.split('\n'))
        )

    return frame.drop(columns=['review'])


In [91]:
def filter_date(reviews_df):
    """Keep only reviews dated from July 2023 through September 2023.

    The ``date`` column is parsed with ``pd.to_datetime``; values that cannot
    be parsed become ``NaT`` and are dropped. Rows are kept when the parsed
    date falls in the half-open window [2023-07-01, 2023-10-01). The
    comparison is done on real timestamps, because comparing ``'%m %Y'``
    strings is lexicographic and would let e.g. ``'09 2024'`` through.

    Args:
        reviews_df: DataFrame with a ``date`` column. It is not modified.

    Returns:
        A new DataFrame containing only the rows inside the window, with
        ``date`` rewritten as a ``'%m %Y'`` string (e.g. ``'07 2023'``).
    """
    parsed = pd.to_datetime(reviews_df['date'], errors='coerce')

    window_start = pd.Timestamp(2023, 7, 1)
    window_end = pd.Timestamp(2023, 10, 1)
    # NaT compares False on both sides, so unparseable dates are excluded.
    in_window = (parsed >= window_start) & (parsed < window_end)

    # Work on a copy so the caller's frame keeps its original date values.
    filtered = reviews_df.loc[in_window].copy()
    filtered['date'] = parsed[in_window].dt.strftime('%m %Y')
    return filtered

In [92]:
# Parse the scraped review strings into tables, then keep only the rows
# that pass the date filter.
oppenheimer_reviews_df = filter_date(create_df(oppenheimer_reviews))
barbie_reviews_df = filter_date(create_df(barbie_reviews))

In [82]:
# Write each filtered review table to its own CSV file, without the index.
for frame, csv_path in (
    (oppenheimer_reviews_df, '../DATA/oppenheimer_reviews.csv'),
    (barbie_reviews_df, '../DATA/barbie_reviews.csv'),
):
    frame.to_csv(csv_path, index=False)