In [57]:
import pandas as pd
from selenium import webdriver
import time
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException

In [58]:

# Build Chrome options carrying a notifications preference, start the browser,
# and open the Rotten Tomatoes "movies at home" listing sorted by popularity.
chrome_options = webdriver.ChromeOptions()
# NOTE(review): content-setting value 2 presumably means "block" (suppresses
# notification prompts) — confirm against Chrome preference documentation.
prefs = {"profile.default_content_setting_values.notifications": 2}
chrome_options.add_experimental_option("prefs", prefs)

# `driver` is the module-level browser session used by the cells below.
driver = webdriver.Chrome(options=chrome_options)
driver.get('https://www.rottentomatoes.com/browse/movies_at_home/sort:popular')


In [59]:
# Filter helper
def filter_array(array_elements):
    """Return the cleaned text of every element in ``array_elements``.

    Each item must expose a ``.text`` attribute. Newline characters are
    removed from that text and leading/trailing whitespace is stripped.
    The result is a new list in the same order as the input.
    """
    return [item.text.replace('\n', '').strip() for item in array_elements]
# Expand the listing by clicking the "Load more" button repeatedly before any
# titles/links are collected.
# NOTE(review): the previous comment targeted 35 clicks (35*30=1050 movies),
# but the loop limit was 18; the limit is kept at 18 — confirm which is intended.
MAX_LOAD_MORE_CLICKS = 18
LOAD_MORE_LOCATOR = (By.CSS_SELECTOR, "button[data-qa='dlp-load-more-button']")

wait = WebDriverWait(driver, 5)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

click_count = 0
try:
    # Wait for the "Load more" button to become visible.
    load_more_button = wait.until(EC.visibility_of_element_located(LOAD_MORE_LOCATOR))
except TimeoutException:
    # No button at all: keep going with whatever the page already lists
    # instead of aborting the whole cell.
    load_more_button = None
    print('"Load more" button not found; continuing with the movies already listed.')

while load_more_button is not None and click_count < MAX_LOAD_MORE_CLICKS:
    try:
        ActionChains(driver).move_to_element(load_more_button).perform()
        # Click through JavaScript so overlays cannot intercept the click.
        driver.execute_script("arguments[0].click();", load_more_button)
        click_count += 1
        load_more_button = wait.until(EC.visibility_of_element_located(LOAD_MORE_LOCATOR))
    except TimeoutException:
        # The button did not come back within the wait: nothing more to load.
        break
    except Exception as exc:
        # Still best-effort (stop loading, keep what we have), but report the
        # reason instead of swallowing it silently.
        print(f'Stopped loading more movies after {click_count} clicks: {exc!r}')
        break

In [60]:
# Click the "Show More" button(s)
def click_show_more_button(driver):
    """Click every "Show More" link matched on the page currently loaded in ``driver``.

    Each matching ``rt-link`` element is clicked through JavaScript, followed
    by a fixed two-second pause. ``NoSuchElementException`` is swallowed.
    Returns ``None``.
    """
    selector = 'rt-link[data-modulecastcrewmanager="showMoreBtn:click"][slot="ctaOpen"]'
    try:
        show_more_links = driver.find_elements(By.CSS_SELECTOR, selector)
        for show_more_link in show_more_links:
            driver.execute_script("arguments[0].click();", show_more_link)
            time.sleep(2)  # Consider using dynamic waiting instead of fixed sleep times
    except NoSuchElementException:
        return

# Extract actor data
def extract_actors(driver):
    """Collect names from the cast/crew ``p.role`` / ``p.name`` elements.

    Elements are walked in the order the driver returns them. Name elements
    are collected only after a role element reading 'Director' has been seen,
    and the walk stops at the first role element reading 'Screenwriter'.
    Returns the list of collected names (possibly empty).

    NOTE(review): the sample run output lists a director ('Brian Netto')
    among the actors — verify the name/role ordering in the page markup.
    """
    collected_names = []
    try:
        cast_nodes = driver.find_elements(By.CSS_SELECTOR, 'div[slot="insetText"] p.role, div[slot="insetText"] p.name')
        director_seen = False
        for node in cast_nodes:
            if node.get_attribute("class") != "role":
                # A name element: keep it only once a Director role was seen.
                if director_seen:
                    collected_names.append(node.text)
                continue
            role_label = node.text
            if role_label == 'Director':
                director_seen = True
            elif role_label == 'Screenwriter':
                break
    except NoSuchElementException:
        pass
    return collected_names

# Extract the description
def extract_description(driver):
    """Return the text of every synopsis ``rt-text`` element that lacks the ``key`` class.

    The result is a list of strings; it is empty when nothing matches or when
    ``NoSuchElementException`` is raised during the lookup.
    """
    try:
        synopsis_nodes = driver.find_elements(By.CSS_SELECTOR, 'div.synopsis-wrap rt-text:not(.key)')
        synopsis_texts = []
        for synopsis_node in synopsis_nodes:
            synopsis_texts.append(synopsis_node.text)
        return synopsis_texts
    except NoSuchElementException:
        return []

# Extract data from a movie page
def _index_info_rows(driver):
    """Map each movie-info label text to its label element.

    Scans the ``dt.key rt-text`` elements once. When a label text occurs more
    than once, the first occurrence is kept.
    """
    info_rows = {}
    for label_element in driver.find_elements(By.CSS_SELECTOR, 'dt.key rt-text'):
        info_rows.setdefault(label_element.text, label_element)
    return info_rows


def _info_values(info_rows, label, value_tag):
    """Return the ``value_tag`` elements inside the ``dd`` paired with ``label``.

    Returns ``None`` when the label is not on the page or has no following
    ``dd``; otherwise a (possibly empty) list of elements.
    """
    label_element = info_rows.get(label)
    if label_element is None:
        return None
    try:
        dd_element = label_element.find_element(By.XPATH, './../following-sibling::dd')
    except NoSuchElementException:
        return None
    return dd_element.find_elements(By.TAG_NAME, value_tag)


def _first_text(elements):
    """Return the text of the first element, or ``None`` for ``None``/empty input."""
    return elements[0].text if elements else None


def _element_text(driver, by, selector):
    """Return the text of the first element matching ``selector``, or ``None`` if absent."""
    try:
        return driver.find_element(by, selector).text
    except NoSuchElementException:
        return None


def extract_data_from_movie_page(driver, movie_link):
    """Scrape one movie detail page and append its fields to the module-level lists.

    Loads ``movie_link`` in ``driver``, waits up to 10 seconds for a cast name
    to become visible, reads each field (``None`` when it is missing), prints
    it, and appends exactly one entry to each of the module-level lists:
    ``release_date``, ``film_actors``, ``duration``, ``country``,
    ``companies``, ``movie_category``, ``ratings``, ``directors``,
    ``certificate``, ``budget``, ``boxoffice``, ``original_language``,
    ``taglines``, ``writing_credits`` and ``descriptions``.
    ``country``, ``budget`` and ``taglines`` always receive ``None``.
    ``film_actors`` and ``descriptions`` receive lists of strings.

    Returns ``None``.
    """
    print("Movie Link:", movie_link)
    driver.get(movie_link)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    try:
        # Wait for actor names to appear.
        WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[slot="insetText"] p.name')))
    except TimeoutException:
        # A page without a visible cast section must not abort the whole
        # scrape; the remaining fields are still read below.
        print("Cast names did not appear within 10 seconds; extracting what is available.")

    # Local names deliberately differ from the module-level `film_actors` /
    # `descriptions` lists: shadowing them made the final appends add each
    # local list to itself, so the module-level lists never received data.
    actor_names = extract_actors(driver)
    print("Actors:", actor_names)

    description_texts = extract_description(driver)
    print("Descriptions:", description_texts)

    duration_text = _element_text(driver, By.XPATH, '//dt[@class="key"]/rt-text[text()="Runtime"]/ancestor::dt/following-sibling::dd/rt-text')
    print("Duration:", duration_text)

    rating = _element_text(driver, By.CSS_SELECTOR, 'rt-button[slot="criticsScore"] rt-text[theme="medium"]')
    print("Rating:", rating)

    release_date_text = _element_text(driver, By.XPATH, '//dt[@class="key"]/rt-text[text()="Release Date (Streaming)"]/ancestor::dt/following-sibling::dd/rt-text')
    print("Release Date:", release_date_text)

    # One pass over the info labels serves every label-based lookup below.
    info_rows = _index_info_rows(driver)

    # Box office
    boxoffice_text = _first_text(_info_values(info_rows, 'Box Office (Gross USA)', 'rt-text'))
    print("Box Office:", boxoffice_text)

    # Certificate
    certificate_text = _first_text(_info_values(info_rows, 'Rating', 'rt-text'))
    print("Certificate (MPAA):", certificate_text)

    # Original language
    original_language_text = _first_text(_info_values(info_rows, 'Original Language', 'rt-text'))
    print("Original Language:", original_language_text)

    # Production companies: the pieces are concatenated without a separator.
    companies_elements = _info_values(info_rows, 'Production Co', 'rt-text')
    companies_text = None
    if companies_elements is not None:
        companies_text = ''.join([element.text for element in companies_elements])
    print("Companies:", companies_text)

    # Movie category (genre)
    movie_category_elements = None
    try:
        genre_label = driver.find_element(By.XPATH, '//dt[@class="key"]/rt-text[text()="Genre"]')
        genre_dd = genre_label.find_element(By.XPATH, './../following-sibling::dd')
        movie_category_elements = genre_dd.find_elements(By.TAG_NAME, 'rt-link')
    except NoSuchElementException:
        pass
    # dict.fromkeys removes duplicates while keeping page order; a set would
    # make the joined order vary between runs.
    movie_category_text = ', '.join(dict.fromkeys(element.text for element in movie_category_elements)) if movie_category_elements else None
    print("Movie Category:", movie_category_text)

    # Directors
    director_elements = _info_values(info_rows, 'Director', 'rt-link')
    director_text = ', '.join([element.text for element in director_elements]) if director_elements else None
    print("Directors:", director_text)

    # Writing credits (links with empty text are skipped)
    writing_credits_elements = _info_values(info_rows, 'Screenwriter', 'rt-link')
    writing_credits_text = None
    if writing_credits_elements is not None:
        writer_names = [element.text for element in writing_credits_elements]
        writing_credits_text = ', '.join([name for name in writer_names if name])
    print("Writing Credits:", writing_credits_text)

    # Append the extracted values to the module-level accumulator lists.
    release_date.append(release_date_text)
    film_actors.append(actor_names)
    duration.append(duration_text)
    country.append(None)
    companies.append(companies_text)
    movie_category.append(movie_category_text)
    ratings.append(rating)
    directors.append(director_text)
    certificate.append(certificate_text)
    budget.append(None)
    boxoffice.append(boxoffice_text)
    original_language.append(original_language_text)
    taglines.append(None)
    writing_credits.append(writing_credits_text)
    descriptions.append(description_texts)

# Collect the movie titles shown on the listing page.
movies = []
try:
    movie_elements = driver.find_elements(By.CSS_SELECTOR, 'div[data-qa="discovery-media-list-item"] span[data-qa="discovery-media-list-item-title"]')
    movies = [movie.text for movie in movie_elements]
except Exception as exc:
    # Report the failure instead of hiding it behind a bare `except:`.
    print("Could not read movie titles:", repr(exc))
if not movies:
    # Keep the single-placeholder behaviour when no title could be read.
    movies.append(None)

# Collect the movie links.
# NOTE(review): titles and links come from separate selectors and are paired
# by position later — confirm both selectors always match the same cards.
movie_link_elements = driver.find_elements(By.CSS_SELECTOR,'a[data-qa="discovery-media-list-item-caption"]')
movie_links = [element.get_attribute('href') for element in movie_link_elements]

# Empty accumulator lists; extract_data_from_movie_page appends to each of them.
release_date = []
film_actors = []
duration = []
country = []
companies = []
movie_category = []
ratings = []
directors = []
certificate = []
budget= []
boxoffice= []
original_language = []
taglines = []
writing_credits = []
descriptions = []

# Visit each movie link (at most MAX_MOVIES of them) and extract its data.
MAX_MOVIES = 1000
count = 0
for count, movie_link in enumerate(movie_links[:MAX_MOVIES], start=1):
    extract_data_from_movie_page(driver, movie_link)
# Filter helper (identical redefinition of the filter_array defined in an earlier cell)
def filter_array(array_elements):
    """Return the cleaned ``.text`` of each element, in input order.

    Newline characters are removed from every element's text and the result
    is stripped of leading/trailing whitespace.
    """
    def _clean(element):
        return element.text.replace('\n', '').strip()

    return list(map(_clean, array_elements))

# Missing-value helper
def fill_missing_values(array, length):
    """Pad ``array`` in place with ``None`` until it holds at least ``length`` items.

    A list that is already long enough is left untouched. The same list
    object is returned.
    """
    shortfall = length - len(array)
    if shortfall > 0:
        array.extend([None] * shortfall)
    return array

# Pad every attribute list with None so each has at least one entry per
# movie title (fill_missing_values extends the lists in place).
row_count = len(movies)
ratings = fill_missing_values(ratings, row_count)
movie_category = fill_missing_values(movie_category, row_count)
film_actors = fill_missing_values(film_actors, row_count)
release_date = fill_missing_values(release_date, row_count)
duration = fill_missing_values(duration, row_count)
original_language = fill_missing_values(original_language, row_count)
companies = fill_missing_values(companies, row_count)
descriptions = fill_missing_values(descriptions, row_count)
boxoffice = fill_missing_values(boxoffice, row_count)
certificate = fill_missing_values(certificate, row_count)
directors = fill_missing_values(directors, row_count)
writing_credits = fill_missing_values(writing_credits, row_count)

# The 'Link' column must hold one URL per row. The leftover loop variable
# `movie_link` is a single string (the last URL visited) that pandas would
# broadcast to every row. Work on a copy of `movie_links`, padded/trimmed to
# the row count so this column cannot cause a length mismatch.
link_column = fill_missing_values(list(movie_links), row_count)[:row_count]

# Build the DataFrame from the collected columns.
data = pd.DataFrame({
    'Link': link_column,
    'Movie_Name': movies,
    'Ratings': ratings,
    'Movie_category': movie_category,
    'Film_Actor': film_actors,
    'Release_Date': release_date,
    'Duration': duration,
    'Original_Language': original_language,
    'Companies': companies,
    'Description': descriptions,
    'Box Office': boxoffice,
    'Certificate (MPAA)': certificate,
    'Directors': directors,
    'Writing_Credits': writing_credits
})

# Save the DataFrame to a CSV file.
data.to_csv('Tomatoes_Top_Movies.csv', index=False)


Movie Link: https://www.rottentomatoes.com/m/dont_move_2024
Actors: ['Brian Netto', 'Kelsey Asbille', 'Finn Wittrock', 'Daniel Francis', 'Dylan Beam']
Descriptions: ['When a killer injects her with a paralytic agent, a woman must run, fight and hide before her body completely shuts down.']
Duration: 1h 32m
Rating: None
Release Date: Oct 25, 2024
Box Office: None
Certificate (MPAA): R (Language|Some Strong Violence)
Original Language: English
Companies: Raimi Productions,Capstone Studios,Hammerstone Studios
Movie Category: Mystery & Thriller, Horror
Directors: Adam Schindler, Brian Netto
Writing Credits: T.J. Cimfel, David White
Movie Link: https://www.rottentomatoes.com/m/woman_of_the_hour
Actors: ['Anna Kendrick', 'Tony Hale', 'Kelley Jakle', 'Max Lloyd-Jones', 'Jedidiah Goodacre']
Descriptions: ['An aspiring actress crosses paths with a prolific serial killer in \'70s LA when they\'re cast on an episode of "The Dating Game." Based on a true story.']
Duration: 1h 29m
Rating: None
Rele