In [178]:
import pandas as pd
from selenium import webdriver
import time
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException

In [179]:

chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications": 2}
chrome_options.add_experimental_option("prefs", prefs)

driver = webdriver.Chrome(options=chrome_options)
driver.get('https://www.rottentomatoes.com/browse/movies_at_home/sort:popular')


In [180]:
# Hàm lọc
def filter_array(array_elements):
    array = []
    for element in array_elements:
        array_processed = element.text.replace('\n', '').strip()
        array.append(array_processed)
    return array
# Đợi cho phần tử "Load more" hiển thị
wait = WebDriverWait(driver, 5)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
load_more_button = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "button[data-qa='dlp-load-more-button']")))

# Bấm nút "Load more" 35 lần (35*30=1050 phim)
click_count = 0
while click_count < 18:
    actions = ActionChains(driver)
    actions.move_to_element(load_more_button).perform()
    try:
        driver.execute_script("arguments[0].click();", load_more_button)
        click_count += 1
        load_more_button = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "button[data-qa='dlp-load-more-button']")))
    except:
        break

In [181]:
# Hàm nhấn nút "Show More"
def click_show_more_button(driver):
    try:
        link_elements = driver.find_elements(By.CSS_SELECTOR, 'rt-link[data-modulecastcrewmanager="showMoreBtn:click"][slot="ctaOpen"]') 
        for link_element in link_elements:
            driver.execute_script("arguments[0].click();", link_element)
            time.sleep(2)  # Consider using dynamic waiting instead of fixed sleep times
    except NoSuchElementException:
        pass

# Hàm trích xuất dữ liệu từ trang phim
def extract_data_from_movie_page(driver, movie_link):
    print("Movie Link:", movie_link)
    driver.get(movie_link)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[slot="insetText"] p.name')))  # Wait for actor names to appear
    
    movie_title = driver.find_element(By.CSS_SELECTOR, 'h1').text
    print('Movie:', movie_title)
    # Lọc actors
    actors = driver.find_elements(By.CSS_SELECTOR, 'div[slot="insetText"] p.name')
    actor_names = [actor.text for actor in actors]
    actor_names_str = ', '.join(actor_names)
    print("Actors:", actor_names_str)

    # Lọc descriptions
    description_elements = driver.find_elements(By.CSS_SELECTOR, 'div.synopsis-wrap rt-text:not(.key)')
    descriptions_full = [description.text for description in description_elements]
    descriptions_text = ', '.join(descriptions_full)  # Call the extract_description function
    print("Descriptions:", descriptions_text)

    # Lọc duration 
    try:
        duration_element = driver.find_element(By.XPATH, '//dt[@class="key"]/rt-text[text()="Runtime"]/ancestor::dt/following-sibling::dd/rt-text')
        duration_text = duration_element.text if duration_element else None
    except NoSuchElementException:
        duration_text = None
    print("Duration:", duration_text)

    # Lọc ratings
    try:
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'rt-button[slot="criticsScore"] rt-text[context="label"]')))
        rating_element = driver.find_element(By.CSS_SELECTOR, 'rt-button[slot="criticsScore"] rt-text[context="label"]')
        rating = rating_element.text if rating_element else "N/A"
    except (NoSuchElementException, TimeoutException):
        rating = "N/A"  
    print("Rating:", rating)

    # Lọc release date
    try:
        release_date_element = driver.find_element(By.XPATH, '//dt[@class="key"]/rt-text[text()="Release Date (Streaming)"]/ancestor::dt/following-sibling::dd/rt-text')
        release_date_text = release_date_element.text if release_date_element else None
    except NoSuchElementException:
        release_date_text = None
    print("Release Date:", release_date_text)

    # Lọc box office
    boxoffice_element = None
    try:
        dt_elements = driver.find_elements(By.CSS_SELECTOR, 'dt.key rt-text')
        for dt_element in dt_elements:
            if dt_element.text == 'Box Office (Gross USA)':
                dd_element = dt_element.find_element(By.XPATH,'./../following-sibling::dd')
                boxoffice_element = dd_element.find_element(By.TAG_NAME,'rt-text')
                break
    except NoSuchElementException:
        pass
    boxoffice_text = boxoffice_element.text if boxoffice_element else None
    print("Box Office:", boxoffice_text)

    # Lọc Certificate
    certificate_element = None
    try:
        dt_elements = driver.find_elements(By.CSS_SELECTOR, 'dt.key rt-text')
        for dt_element in dt_elements:
            if dt_element.text == 'Rating':
                dd_element = dt_element.find_element(By.XPATH,'./../following-sibling::dd')
                certificate_element = dd_element.find_element(By.TAG_NAME,'rt-text')
                break
    except NoSuchElementException:
        pass
    certificate_text = certificate_element.text if certificate_element else None
    print("Certificate (MPAA):", certificate_text)

    # Lọc original_language
    original_language_element = None
    try:
        dt_elements = driver.find_elements(By.CSS_SELECTOR, 'dt.key rt-text')
        for dt_element in dt_elements:
            if dt_element.text == 'Original Language':
                dd_element = dt_element.find_element(By.XPATH,'./../following-sibling::dd')
                original_language_element = dd_element.find_element(By.TAG_NAME,'rt-text')
                break
    except NoSuchElementException:
        pass
    original_language_text = original_language_element.text if original_language_element else None
    print("Original Language:", original_language_text)

    # Lọc companies
    companies_text = None  # Set a default value
    companies_elements = None
    try:
        dt_elements = driver.find_elements(By.CSS_SELECTOR, 'dt.key rt-text')
        for dt_element in dt_elements:
            if dt_element.text == 'Production Co':
                dd_element = dt_element.find_element(By.XPATH,'./../following-sibling::dd')
                companies_elements = dd_element.find_elements(By.TAG_NAME,'rt-text')
                break
    except NoSuchElementException:
        pass

    if companies_elements is not None:
        companies_text = ''.join([element.text for element in companies_elements])
        print("Companies:", companies_text)

    # Lọc movie category
    movie_category_elements = None
    try:
        dt_element = driver.find_element(By.XPATH, '//dt[@class="key"]/rt-text[text()="Genre"]')
        dd_element = dt_element.find_element(By.XPATH, './../following-sibling::dd')
        movie_category_elements = dd_element.find_elements(By.TAG_NAME, 'rt-link')
    except NoSuchElementException:
        pass
    movie_category_text = ', '.join(set([element.text for element in movie_category_elements])) if movie_category_elements else None
    print("Movie Category:", movie_category_text)

    # Lọc director 
    director_elements = None
    try:
        dt_elements = driver.find_elements(By.CSS_SELECTOR, 'dt.key rt-text')
        for dt_element in dt_elements:
            if dt_element.text == 'Director':
                dd_element = dt_element.find_element(By.XPATH, './../following-sibling::dd')
                director_elements = dd_element.find_elements(By.TAG_NAME, 'rt-link')
                break
    except NoSuchElementException:
        pass 
    diretor_text = ', '.join([element.text for element in director_elements]) if director_elements else None
    print("Directors:", diretor_text)

    # Lọc writing credits
    writing_credits_elements = None
    try:
        dt_elements = driver.find_elements(By.CSS_SELECTOR, 'dt.key rt-text')
        for dt_element in dt_elements:
            if dt_element.text == 'Screenwriter':
                dd_element = dt_element.find_element(By.XPATH,'./../following-sibling::dd')
                writing_credits_elements = dd_element.find_elements(By.TAG_NAME,'rt-link')
                break
    except NoSuchElementException:
        pass
    writing_credits_text = None
    if writing_credits_elements is not None:
        writing_credits_text = ', '.join([element.text for element in writing_credits_elements if element.text])
    print("Writing Credits:", writing_credits_text)

    # Thêm dữ liệu đã lọc vào danh sách
    movies_name.append(movie_title)
    release_date.append(release_date_text)
    film_actors.append(actor_names_str)
    duration.append(duration_text)
    country.append(None)
    companies.append(companies_text)
    movie_category.append(movie_category_text)
    ratings.append(rating)
    directors.append(diretor_text)
    certificate.append(certificate_text)
    budget.append(None)
    boxoffice.append(boxoffice_text)
    original_language.append(original_language_text)
    taglines.append(None)
    writing_credits.append(writing_credits_text)
    descriptions.append(descriptions_text)  # Keep this line for adding to the list

# Lọc movie
movies = []
movie_elements = driver.find_element(By.CSS_SELECTOR, 'h1').text
movies.append(movie_elements)

# Lọc movie link
movie_link_elements = driver.find_elements(By.CSS_SELECTOR,'a[data-qa="discovery-media-list-item-caption"]')
movie_links = [element.get_attribute('href') for element in movie_link_elements]

# Khởi tạo danh sách rỗng cho các thuộc tính
movies_name = []
release_date = []
film_actors = []
duration = []
country = []
companies = []
movie_category = []
ratings = []
directors = []
certificate = []
budget= []
boxoffice= []
original_language = []
taglines = []
writing_credits = []
descriptions = []

# Lặp qua các liên kết phim và trích xuất thông tin
count = 0
for movie_link in movie_links:
    if count >= 1000:  # Thay đổi giới hạn từ 50 thành 1000
        break
    extract_data_from_movie_page(driver, movie_link)
    count += 1
# Hàm lọc
def filter_array(array_elements):
    array = []
    for element in array_elements:
        array_processed = element.text.replace('\n', '').strip()
        array.append(array_processed)
    return array

# Hàm xử lý giá trị thiếu
def fill_missing_values(array, length):
    while len(array) < length:
        array.append(None)
    return array

data = pd.DataFrame()

# Thay đổi các giá trị thiếu bằng None hoặc một giá trị mặc định khác
movies_name = fill_missing_values(movies_name, len(movies))
ratings = fill_missing_values(ratings, len(movies))
movie_category = fill_missing_values(movie_category, len(movies))
film_actors = fill_missing_values(film_actors, len(movies))
release_date = fill_missing_values(release_date, len(movies))
duration = fill_missing_values(duration, len(movies))
original_language = fill_missing_values(original_language, len(movies))
companies = fill_missing_values(companies, len(movies))
descriptions = fill_missing_values(descriptions, len(movies))
boxoffice = fill_missing_values(boxoffice, len(movies))
certificate = fill_missing_values(certificate, len(movies))
directors = fill_missing_values(directors, len(movies))
writing_credits = fill_missing_values(writing_credits, len(movies))


Movie Link: https://www.rottentomatoes.com/m/dont_move_2024
Movie: Don't Move
Actors: Adam Schindler, Brian Netto, Kelsey Asbille, Finn Wittrock, Daniel Francis, Dylan Beam
Descriptions: When a killer injects her with a paralytic agent, a woman must run, fight and hide before her body completely shuts down.
Duration: 1h 32m
Rating: 68%
Release Date: Oct 25, 2024
Box Office: None
Certificate (MPAA): R (Language|Some Strong Violence)
Original Language: English
Companies: Raimi Productions,Capstone Studios,Hammerstone Studios
Movie Category: Mystery & Thriller, Horror
Directors: Adam Schindler, Brian Netto
Writing Credits: T.J. Cimfel, David White
Movie Link: https://www.rottentomatoes.com/m/woman_of_the_hour
Movie: Woman of the Hour
Actors: Anna Kendrick, Anna Kendrick, Tony Hale, Kelley Jakle, Max Lloyd-Jones, Jedidiah Goodacre
Descriptions: An aspiring actress crosses paths with a prolific serial killer in '70s LA when they're cast on an episode of "The Dating Game." Based on a true st

In [182]:
len(movie_links)

152

In [183]:
movie_links

['https://www.rottentomatoes.com/m/dont_move_2024',
 'https://www.rottentomatoes.com/m/woman_of_the_hour',
 'https://www.rottentomatoes.com/m/the_wild_robot',
 'https://www.rottentomatoes.com/m/trap_2024',
 'https://www.rottentomatoes.com/m/the_substance',
 'https://www.rottentomatoes.com/m/joker_folie_a_deux',
 'https://www.rottentomatoes.com/m/strange_darling',
 'https://www.rottentomatoes.com/m/caddo_lake',
 'https://www.rottentomatoes.com/m/transformers_one',
 'https://www.rottentomatoes.com/m/speak_no_evil_2024',
 'https://www.rottentomatoes.com/m/longlegs',
 'https://www.rottentomatoes.com/m/late_night_with_the_devil',
 'https://www.rottentomatoes.com/m/beetlejuice_beetlejuice',
 'https://www.rottentomatoes.com/m/alien_romulus',
 'https://www.rottentomatoes.com/m/smile_2022',
 'https://www.rottentomatoes.com/m/am_i_racist',
 'https://www.rottentomatoes.com/m/venom_2018',
 'https://www.rottentomatoes.com/m/oddity',
 'https://www.rottentomatoes.com/m/its_whats_inside',
 'https://ww

In [184]:
len(movies)

1

In [185]:
movies

['Best Movies to Stream at Home (2024)']

In [186]:
len(movies_name)

152

In [187]:
movies_name

["Don't Move",
 'Woman of the Hour',
 'The Wild Robot',
 'Trap',
 'The Substance',
 'Joker: Folie à Deux',
 'Strange Darling',
 'Caddo Lake',
 'Transformers One',
 'Speak No Evil',
 'Longlegs',
 'Late Night with the Devil',
 'Beetlejuice Beetlejuice',
 'Alien: Romulus',
 'Smile',
 'Am I Racist?',
 'Venom',
 'Oddity',
 "It's What's Inside",
 'Canary Black',
 'Deadpool & Wolverine',
 'Piece By Piece',
 'Terrifier',
 'MaXXXine',
 'The Remarkable Life of Ibelin',
 'Brothers',
 'The Shadow Strays',
 'It Ends With Us',
 'Blink Twice',
 "Salem's Lot",
 'John Wick',
 'MadS',
 'Rebel Ridge',
 'Pearl',
 'Azrael',
 'Wolfs',
 'Immaculate',
 'The Beast Within',
 'A Quiet Place: Day One',
 'X',
 'I Saw the TV Glow',
 'Hit Man',
 'The Babadook',
 'Monkey Man',
 'Cuckoo',
 'New Life',
 'Talk to Me',
 'Daddio',
 'Halloween',
 'Reagan',
 'The First Omen',
 'Twisters',
 'The Watchers',
 'Hereditary',
 'Beetlejuice',
 'Civil War',
 'Sinister',
 "Tim Burton's The Nightmare Before Christmas",
 'His House',


In [188]:
len(ratings)

152

In [189]:
ratings

['68%',
 '91%',
 '98%',
 '57%',
 '90%',
 '32%',
 '95%',
 '76%',
 '89%',
 '83%',
 '86%',
 '97%',
 '77%',
 '80%',
 '79%',
 '71%',
 '30%',
 '96%',
 '78%',
 'N/A',
 '78%',
 '84%',
 '62%',
 '72%',
 '98%',
 '41%',
 '90%',
 '57%',
 '74%',
 '45%',
 '86%',
 '93%',
 '95%',
 '93%',
 '73%',
 '67%',
 '71%',
 '40%',
 '87%',
 '94%',
 '84%',
 '95%',
 '98%',
 '89%',
 '78%',
 '94%',
 '94%',
 '76%',
 '96%',
 '18%',
 '83%',
 '75%',
 '32%',
 '90%',
 '83%',
 '81%',
 '63%',
 '95%',
 '100%',
 '92%',
 '68%',
 '88%',
 '41%',
 '32%',
 '82%',
 '17%',
 '79%',
 '95%',
 '70%',
 'N/A',
 '38%',
 '41%',
 '91%',
 '49%',
 '98%',
 '87%',
 '99%',
 '96%',
 '10%',
 '80%',
 '83%',
 '81%',
 '19%',
 '41%',
 '85%',
 '90%',
 '67%',
 'N/A',
 '56%',
 '85%',
 '72%',
 '73%',
 '79%',
 '92%',
 '78%',
 '85%',
 '17%',
 '91%',
 '99%',
 '78%',
 '77%',
 '33%',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'N/A',
 'N/A',
 '',
 '',
 '',
 '',
 '',
 '

In [190]:
len(movie_category)

152

In [191]:
movie_category

['Mystery & Thriller, Horror',
 'Mystery & Thriller, Crime, Drama',
 'Animation, Kids & Family, Adventure',
 'Mystery & Thriller',
 'Drama, Horror',
 'Musical, Crime, Drama',
 'Mystery & Thriller, Horror',
 'Mystery & Thriller',
 'Animation, Adventure, Fantasy, Sci-Fi, Action',
 'Mystery & Thriller, Drama, Horror',
 'Mystery & Thriller, Horror',
 'Mystery & Thriller, Horror',
 'Comedy, Fantasy',
 'Horror, Sci-Fi',
 'Mystery & Thriller, Horror',
 'Documentary, Comedy',
 'Action, Adventure, Fantasy, Sci-Fi',
 'Mystery & Thriller, Horror',
 'Mystery & Thriller, Comedy',
 'Action, Drama',
 'Action, Adventure, Comedy',
 'Animation, Kids & Family, Comedy, Biography',
 'Mystery & Thriller, Horror, Holiday',
 'Mystery & Thriller, Horror',
 'Documentary, Biography',
 'Action, Comedy, Crime',
 'Mystery & Thriller, Action, Crime, Drama',
 'Drama, Romance',
 'Mystery & Thriller',
 'Mystery & Thriller, Horror',
 'Mystery & Thriller, Action',
 'Mystery & Thriller, Horror',
 'Action, Drama',
 'Myster

In [192]:
len(film_actors)

152

In [193]:
film_actors

['Adam Schindler, Brian Netto, Kelsey Asbille, Finn Wittrock, Daniel Francis, Dylan Beam',
 'Anna Kendrick, Anna Kendrick, Tony Hale, Kelley Jakle, Max Lloyd-Jones, Jedidiah Goodacre',
 "Christopher Sanders, Lupita Nyong'O, Pedro Pascal, Catherine O'Hara, Bill Nighy, Kit Connor",
 'M. Night Shyamalan, Josh Hartnett, Ariel Donoghue, Saleka Shyamalan, Hayley Mills, Alison Pill',
 'Coralie Fargeat, Demi Moore, Margaret Qualley, Dennis Quaid, Hugo Diego Garcia, Gore Abrams',
 'Todd Phillips, Joaquin Phoenix, Lady Gaga, Zazie Beetz, Brendan Gleeson, Catherine Keener',
 'JT Mollner, Willa Fitzgerald, Kyle Gallner, Ed Begley Jr., Barbara Hershey, Steven Michael Quezada',
 "Celine Held, Logan George, Dylan O'Brien, Eliza Scanlen, Lauren Ambrose, Eric Lange",
 'Josh Cooley, Chris Hemsworth, Brian Tyree Henry, Scarlett Johansson, Keegan-Michael Key, Steve Buscemi',
 'James Watkins, James McAvoy, Mackenzie Davis, Scoot McNairy, Aisling Franciosi, Alix West Lefler',
 'Oz Perkins, Maika Monroe, Nic

In [194]:
len(release_date)

152

In [195]:
release_date

['Oct 25, 2024',
 'Oct 18, 2024',
 'Oct 15, 2024',
 'Aug 30, 2024',
 'Oct 31, 2024',
 'Oct 29, 2024',
 'Oct 1, 2024',
 'Oct 10, 2024',
 'Oct 22, 2024',
 'Oct 1, 2024',
 'Aug 23, 2024',
 'Apr 19, 2024',
 'Oct 8, 2024',
 'Oct 15, 2024',
 'Nov 15, 2022',
 'Oct 28, 2024',
 'Jun 18, 2019',
 'Aug 20, 2024',
 'Oct 4, 2024',
 'Oct 24, 2024',
 'Oct 1, 2024',
 'Oct 29, 2024',
 'Mar 27, 2018',
 'Aug 2, 2024',
 'Oct 25, 2024',
 'Oct 17, 2024',
 'Oct 17, 2024',
 'Sep 24, 2024',
 'Sep 17, 2024',
 'Oct 3, 2024',
 'Jun 7, 2016',
 'Oct 18, 2024',
 'Sep 6, 2024',
 'Oct 25, 2022',
 'Oct 25, 2024',
 'Sep 27, 2024',
 'Apr 16, 2024',
 'Aug 13, 2024',
 'Jul 30, 2024',
 'Apr 14, 2022',
 'Jun 14, 2024',
 'Jun 7, 2024',
 'Nov 5, 2015',
 'Apr 23, 2024',
 'Sep 17, 2024',
 'May 3, 2024',
 'Sep 12, 2023',
 'Jul 30, 2024',
 'Mar 18, 2017',
 'Oct 15, 2024',
 'May 28, 2024',
 'Aug 13, 2024',
 'Jun 28, 2024',
 'Aug 21, 2018',
 'Aug 15, 2008',
 'May 24, 2024',
 'Nov 21, 2015',
 'Jan 1, 2014',
 'Oct 30, 2020',
 'Oct 25, 

In [196]:
len(duration)

152

In [197]:
duration

['1h 32m',
 '1h 29m',
 '1h 42m',
 '1h 45m',
 '2h 21m',
 '2h 18m',
 '1h 36m',
 '1h 45m',
 '1h 44m',
 '1h 50m',
 '1h 41m',
 '1h 33m',
 '1h 45m',
 '1h 59m',
 '1h 56m',
 '1h 41m',
 '1h 52m',
 '1h 38m',
 '1h 44m',
 '1h 43m',
 '2h 8m',
 '1h 33m',
 '1h 25m',
 '1h 41m',
 '1h 44m',
 '1h 29m',
 '2h 24m',
 '2h 10m',
 '1h 42m',
 '1h 54m',
 '1h 41m',
 '1h 28m',
 '2h 11m',
 '1h 42m',
 '1h 25m',
 '1h 48m',
 '1h 29m',
 '1h 37m',
 '1h 40m',
 '1h 45m',
 '1h 40m',
 '1h 53m',
 '1h 33m',
 '1h 53m',
 '1h 42m',
 '1h 25m',
 '1h 35m',
 '1h 41m',
 '1h 31m',
 '2h 21m',
 '1h 59m',
 '2h 2m',
 '1h 42m',
 '2h 7m',
 '1h 32m',
 '1h 49m',
 '1h 49m',
 '1h 15m',
 '1h 33m',
 '1h 42m',
 '2h 2m',
 '2h 11m',
 '1h 35m',
 '1h 50m',
 '1h 22m',
 '1h 46m',
 '1h 46m',
 '1h 46m',
 '2h 6m',
 '1h 49m',
 '1h 39m',
 '1h 44m',
 '1h 40m',
 '1h 30m',
 '1h 44m',
 '1h 37m',
 '1h 54m',
 '1h 39m',
 '1h 42m',
 '1h 56m',
 '1h 49m',
 '2h 6m',
 '1h 36m',
 '1h 34m',
 '2h 56m',
 '1h 36m',
 '1h 28m',
 '1h 37m',
 '1h 41m',
 '2h 15m',
 '2h 44m',
 '2h 

In [198]:
len(original_language)

152

In [199]:
original_language

['English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'Norwegian',
 'English',
 'Indonesian',
 'English',
 'English',
 'English',
 'English',
 'French',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'Australian English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'British English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'French (France)',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 None,
 'English',
 'Spanish',
 'English',
 'English',
 'English',
 '

In [200]:
len(companies)

152

In [201]:
companies

['Raimi Productions,Capstone Studios,Hammerstone Studios',
 'BoulderLight Pictures,AGC Studios',
 'DreamWorks Animation',
 'Blinding Edge Pictures',
 'Working Title Films',
 'Joint Effort',
 'Miramax,Spooky Pictures',
 'Blinding Edge Pictures,K Period Media',
 'Entertainment One,Hasbro Entertainment,Paramount Animation,Nickelodeon Movies',
 'Blumhouse Productions,Universal Pictures',
 'Saturn Films,Range Media Partners,Oddfellows Entertainment',
 'VicScreen,Future Pictures,Good Fiend Films,AGC Studios,Shudder,IFC Films,Spooky Pictures,Image Nation',
 'Tommy Harper,Plan B Entertainment,Marc Toberoff,Tim Burton Productions',
 'Brandywine Productions,Scott Free Productions',
 'Paramount Players,Temple Hill Entertainment',
 'Digital Astronaut,DailyWire+',
 'Pascal Pictures,Columbia Pictures,Tencent Pictures,Marvel Entertainment,Sony Pictures Entertainment (SPE)',
 'Keeper Pictures,Shudder,Nowhere',
 'Such Content,Edith Productions,Boldly Go Productions',
 'Anton,MP Film Production',
 '20th

In [202]:
len(descriptions)

152

In [203]:
descriptions

['When a killer injects her with a paralytic agent, a woman must run, fight and hide before her body completely shuts down.',
 'An aspiring actress crosses paths with a prolific serial killer in \'70s LA when they\'re cast on an episode of "The Dating Game." Based on a true story.',
 'The epic adventure follows the journey of a robot--ROZZUM unit 7134, "Roz" for short -- that is shipwrecked on an uninhabited island and must learn to adapt to the harsh surroundings, gradually building relationships with the animals on the island and becoming the adoptive parent of an orphaned gosling.',
 "A father and teen daughter attend a pop concert, where they realize they're at the center of a dark and sinister event.",
 "Have you ever dreamt of a better version of yourself? You, only better in every way. You should try this new product, it's called The Substance. IT CHANGED MY LIFE. With The Substance, you can generate another you: younger, more beautiful, more perfect. You just have to share time

In [204]:
len(boxoffice)

152

In [205]:
boxoffice

[None,
 None,
 None,
 '$42.7M',
 '$14.5M',
 '$57.8M',
 None,
 None,
 '$58.0M',
 '$36.8M',
 '$74.2M',
 '$10.0M',
 '$288.6M',
 '$105.3M',
 '$105.9M',
 '$12.3M',
 '$213.5M',
 '$1.2M',
 None,
 None,
 '$636.6M',
 '$8.9M',
 None,
 '$14.7M',
 None,
 None,
 None,
 '$148.3M',
 '$22.8M',
 None,
 '$43.0M',
 None,
 None,
 '$8.2M',
 '$422.5K',
 None,
 '$15.7M',
 '$51.9K',
 '$138.9M',
 '$11.6M',
 '$4.5M',
 None,
 '$933.1K',
 '$25.0M',
 '$6.2M',
 None,
 '$47.6M',
 '$978.8K',
 '$47.5M',
 '$26.5M',
 '$20.1M',
 '$267.7M',
 '$19.1M',
 '$44.1M',
 '$73.5M',
 '$68.6M',
 '$48.1M',
 '$5.8M',
 None,
 '$40.8M',
 '$335.4M',
 '$50.1M',
 '$1.6M',
 '$137.3M',
 None,
 None,
 '$159.5M',
 None,
 '$64.1M',
 None,
 None,
 None,
 '$94.2M',
 '$20.6M',
 '$176.0M',
 None,
 None,
 '$544.7K',
 '$15.5M',
 '$21.7M',
 '$25.9M',
 '$92.8M',
 None,
 None,
 '$369.3M',
 '$653.0M',
 None,
 None,
 '$10.2M',
 '$327.5M',
 '$5.0M',
 '$188.0M',
 '$187.7M',
 None,
 '$4.2M',
 '$67.2M',
 '$18.6M',
 '$25.1M',
 '$55.1M',
 '$31.6M',
 '$108.1M',


In [206]:
len(certificate)

152

In [207]:
certificate

['R (Language|Some Strong Violence)',
 'R (Language|A Sexual Reference|Some Drug Use|Violent Content)',
 'PG (Thematic Elements|Action/Peril)',
 'PG-13 (Some Violent Content|Brief Strong Language)',
 'R (Graphic Nudity|Gore|Language|Strong Violent Content)',
 'R (Language Throughout|Brief Full Nudity|Some Sexuality|Some Strong Violence)',
 'R (Sexual Material|Language|Drug Use|Strong/Bloody Violent Content)',
 'PG-13 (Thematic Elements|Some Disturbing/Bloody Images|Brief Strong Language)',
 'PG (Language|Animated Action Throughout|Sci-Fi Violence)',
 'R',
 'R (Disturbing Images|Bloody Violence|Some Language)',
 'R (Language|A Sexual Reference|Some Gore|Violent Content)',
 'PG-13 (Macabre and Bloody Images|Brief Drug Use|Some Suggestive Material|Strong Language|Violent Content)',
 'R (Language|Bloody Violent Content)',
 'R',
 'PG-13',
 'PG-13 (Intense Sci-Fi Violence|Action|Language)',
 'R (Language|Some Bloody Images/Gore)',
 None,
 'R',
 'R (Language Throughout|Gore|Sexual References|

In [208]:
len(directors)

152

In [209]:
directors

['Adam Schindler, Brian Netto',
 'Anna Kendrick',
 'Christopher Sanders',
 'M. Night Shyamalan',
 'Coralie Fargeat',
 'Todd Phillips',
 'JT Mollner',
 'Celine Held, Logan George',
 'Josh Cooley',
 'James Watkins',
 'Oz Perkins',
 'Colin Cairnes, Cameron Cairnes',
 'Tim Burton',
 'Fede Alvarez',
 'Parker Finn',
 'Justin Folk',
 'Ruben Fleischer',
 'Damian McCarthy',
 'Greg Jardin',
 'Pierre Morel',
 'Shawn Levy',
 'Morgan Neville',
 'Damien Leone',
 'Ti West',
 'Benjamin Ree',
 'Max Barbakow',
 'Timo Tjahjanto',
 'Justin Baldoni',
 'Zoë Kravitz',
 'Gary Dauberman',
 'Chad Stahelski',
 'David Moreau',
 'Jeremy Saulnier',
 'Ti West',
 'E.L. Katz',
 'Jon Watts',
 'Michael Mohan',
 'Alexander J. Farrell',
 'Michael Sarnoski',
 'Ti West',
 'Jane Schoenbrun',
 'Richard Linklater',
 'Jennifer Kent',
 'Dev Patel',
 'Tilman Singer',
 'John Rosman',
 'Danny Philippou, Michael Philippou',
 'Christy Hall',
 'John Carpenter',
 'Sean McNamara',
 'Arkasha Stevenson',
 'Lee Isaac Chung',
 'Ishana Shyam

In [210]:
len(writing_credits)

152

In [211]:
writing_credits

['T.J. Cimfel, David White',
 'Ian MacAllister McDonald',
 'Christopher Sanders',
 'M. Night Shyamalan',
 'Coralie Fargeat',
 'Scott Silver, Todd Phillips',
 'JT Mollner',
 'Celine Held, Logan George',
 'Eric Pearson, Andrew Barrer, Gabriel Ferrari',
 'James Watkins',
 'Oz Perkins',
 'Colin Cairnes, Cameron Cairnes',
 'Alfred Gough, Miles Millar',
 'Fede Alvarez, Rodo Sayagues Mendez',
 'Parker Finn',
 'Matt Walsh, Brian A. Hoffman, Justin Folk',
 'Jeff Pinkner, Scott Rosenberg, Kelly Marcel',
 'Damian McCarthy',
 'Greg Jardin',
 'Matthew Kennedy, Matthew Kennedy',
 'Ryan Reynolds, Rhett Reese, Paul Wernick, Zeb Wells, Shawn Levy',
 None,
 'Damien Leone',
 'Ti West',
 'Mats Steen',
 'Macon Blair',
 'Timo Tjahjanto',
 'Christy Hall',
 'Zoë Kravitz, E.T. Feigenbaum',
 'Gary Dauberman',
 'Derek Kolstad',
 'David Moreau',
 'Jeremy Saulnier',
 'Ti West, Mia Goth',
 'Simon Barrett',
 'Jon Watts',
 'Andrew Lobel',
 'Greer Ellison, Alexander J. Farrell',
 'Michael Sarnoski',
 'Ti West',
 'Jane

***$\Rightarrow$ Ta thấy được khi crawl data thì có những trường hợp bị chênh lệch dữ liệu, hầu hết là dòng cuối của các thuộc tính ngoại trừ movie_link đầy đủ ( thuộc tính movies thì sẽ là none ở đầu ) sẽ là none nên ta sẽ bỏ dòng cuối đi.***

In [None]:
# Thay đổi các giá trị bị dư thừa

# Tạo DataFrame từ dữ liệu đã lọc
data = pd.DataFrame({
    'Link': movie_links,  # Fixed variable name here
    'Movie_Name': movies_name,
    'Ratings': ratings,
    'Movie_category': movie_category,
    'Film_Actor': film_actors, 
    'Release_Date': release_date,
    'Duration': duration,
    'Original_Language': original_language,
    'Companies': companies,
    'Description': descriptions,  # Changed variable name here
    'Box Office': boxoffice,
    'Certificate (MPAA)': certificate,
    'Directors': directors,
    'Writing_Credits': writing_credits
})

# Lưu DataFrame vào một tệp CSV
data.to_csv('Tomatoes_Top_Movies.csv', index=False)