In [31]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from webdriver_manager.chrome import ChromeDriverManager
import nltk
from nltk.corpus import stopwords
import string
from tqdm import tqdm
import re

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [44]:
stop_words = set(stopwords.words('english'))
table = str.maketrans('', '', string.punctuation)
emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # Emoticons
                           u"\U0001F300-\U0001F5FF"  # Symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # Transport & map symbols
                           u"\U0001F700-\U0001F77F"  # Alchemical symbols
                           u"\U0001F780-\U0001F7FF"  # Geometric shapes
                           u"\U0001F800-\U0001F8FF"  # Supplemental arrows
                           u"\U0001F900-\U0001F9FF"  # Supplemental symbols & pictographs
                           u"\U0001FA00-\U0001FA6F"  # Symbols & pictographs extended-A
                           u"\U0001FA70-\U0001FAFF"  # Symbols & pictographs extended-B
                           u"\U00002702-\U000027B0"  # Dingbats
                           u"\U000024C2-\U0001F251" 
                           "]+", flags=re.UNICODE)

# Remove stopwords，punctuations，html tags，emojis and transform into lowercase
def preprocess_text(text):
    text = emoji_pattern.sub(r'', text)
    text = text.lower()
    text = text.replace(",", " , ").replace(".", " . ").replace("-", " - ").replace("/", " / ")
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    words = text.split()
    filtered_sentence = " ".join(word.translate(table) for word in words if word not in stop_words)
    return filtered_sentence.strip()

def scrape_imdb_reviews(movie_id):
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    url = f"https://www.imdb.com/title/{movie_id}/reviews"
    driver.get(url)

    # Wait for the page to load
    wait = WebDriverWait(driver, 10)

    # Click the btn to load more reviews
    while True:
        try:
            load_more_button = driver.find_element(By.CSS_SELECTOR, "button.ipc-see-more__button")
            load_more_button.click()
            wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, ".ipl-load-more__load-indicator")))
            time.sleep(2)  
        except Exception:
            break  

    # Get the full HTML 
    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()  

    #Extract movie name
    movie_name = soup.select_one("[data-testid='subtitle']")

    # Extract review data
    title_reviews = []
    content_reviews = []
    review_date = []

    for review_block in soup.select("article.user-review-item"):
        try:
            title = review_block.select_one("h3.ipc-title__text").text.strip()
            content = review_block.select_one("div.ipc-html-content-inner-div").text.strip()
            date = review_block.select_one("li.review-date").text.strip()
            title_reviews.append(title)
            content_reviews.append(content)
            review_date.append(date)
        except AttributeError:
            print("Skipping a re view that cannot be parsed")

    # check the number of extracted reviews
    print(f"number of review titles: {len(title_reviews)}")
    print(f"number of review content: {len(content_reviews)}")
    print(f"number of review date: {len(review_date)}")

    processed_titles = [preprocess_text(title) for title in title_reviews]
    processed_reviews = [preprocess_text(review) for review in content_reviews]

    reviews_df = pd.DataFrame({
        "Movie Name": movie_name,
        "Review Title": processed_titles,
        "Review Content": processed_reviews,
        "Review Date": review_date
    })

    csv_filename = f"{movie_id}_Cleaned_Reviews.csv"
    reviews_df.to_csv(csv_filename, index=False)

    print(reviews_df.head()) 

In [45]:
# Access the IMDB review page
movie_id = "tt6320628" 
scrape_imdb_reviews(movie_id)

Skipping a re view that cannot be parsed
Skipping a re view that cannot be parsed
number of review titles: 21
number of review content: 21
number of review date: 21
                  Movie Name                Review Title  \
0  Spider-Man: Far from Home            teen movie twist   
1  Spider-Man: Far from Home  one best spider  man movie   
2  Spider-Man: Far from Home                     amazing   
3  Spider-Man: Far from Home            effort execution   
4  Spider-Man: Far from Home                      adored   

                                      Review Content   Review Date  
0  movie breath fresh air  fun romp europe favour...   Sep 7, 2019  
1  amazing movie  many surprises movie  3d effect...  Jul 15, 2019  
2  movie much better 1st installment  cgi good gy...  Sep 11, 2019  
3  movie hands best mcu movie yet  beginning end ...   Jul 3, 2019  
4  like folks absoultley adored movie ton action ...  Jul 13, 2019  


In [50]:
# ✅ Function to get IMDb Top 250 Movie IDs
def get_imdb_top_250():
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in the background
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    url = "https://www.imdb.com/chart/top/"
    driver.get(url)
    time.sleep(3)  # Allow time for the page to load

    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()

    movies = []
    # for movie in soup.select("div.sc-ee514ad1-0.kYZRWL.cli-poster-container"):
    #     movie_title = movie.select_one("h3.ipc-title__text")
    #     movie_id = movie["href"].split("/")[2]  # Extracts the movie ID (e.g., tt0111161)
    #     movies.append({"movie_id": movie_id, "title": movie_title})

    for movie in soup.select("h3.ipc-title__text"):  
        movie_title = movie.text.strip()
        movies.append({"Movie Title": movie_title})

    return movies[:5]  # Return top 5 movies (change this if needed)

get_imdb_top_250()

[]

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

data = []

res = requests.get("https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm.I")
#print(res)
soup = BeautifulSoup(res.content, "html.parser")

for card in soup.select('li.ipc-metadata-list-summary-item'):
    data.append({
        "title": card.select_one('h3.ipc-title__text').text.strip()
        # "year": card.select_one('.titleColumn span').text,
        # 'rating': card.select_one('td[class="ratingColumn imdbRating"]').get_text(strip=True)
            })

df = pd.DataFrame(data)
print(df)
#df.to_csv('out.csv', index=False)

Empty DataFrame
Columns: []
Index: []


In [47]:
# importing the module 
import imdb 
   
# creating instance of IMDb 
ia = imdb.IMDb() 
   
# name  
name = "3 idiots"
   
# searching the name  
search = ia.search_movie(name) 
  
  
# loop for printing the name and id 
for i in range(len(search)): 
      
    # getting the id 
    id = search[i].movieID 
      
    # printing it 
    print(search[i]['title'] + " : " + id ) 

3 Idiots : 1187043
3 Idiots : 3685624
3 Idiots : 28238283
3 Idiots w/GUNS : 0222441
3 Idiots and a Wise Man : 21612358
Three Idiots to a Team : 29720863
Mugguru Monagallu : 15121916
3 Idiots on Wheels : 6689378
Scotch Mist - A Tale of Three English Idiots in Search of Britain's Northernmost Monsters : 31444403
Kidnap in Rome : 1575673
3 Idiots : 12049418
3 Idiots : 33501685
3 Idiot Heroes : 30247415
Three Idiots : 34207697
Three Idiots : 16345748
The Idiots : 0154421
3 Idiots Try Candy! : 8474256
God and 3 Idiots : 25393152
Confessions of Three Idiots : 21124554
Idiots Are People Three! : 2179303


In [None]:
stop_words = set(stopwords.words('english'))
table = str.maketrans('', '', string.punctuation)
emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # Emoticons
                           u"\U0001F300-\U0001F5FF"  # Symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # Transport & map symbols
                           u"\U0001F700-\U0001F77F"  # Alchemical symbols
                           u"\U0001F780-\U0001F7FF"  # Geometric shapes
                           u"\U0001F800-\U0001F8FF"  # Supplemental arrows
                           u"\U0001F900-\U0001F9FF"  # Supplemental symbols & pictographs
                           u"\U0001FA00-\U0001FA6F"  # Symbols & pictographs extended-A
                           u"\U0001FA70-\U0001FAFF"  # Symbols & pictographs extended-B
                           u"\U00002702-\U000027B0"  # Dingbats
                           u"\U000024C2-\U0001F251" 
                           "]+", flags=re.UNICODE)

# Remove stopwords，punctuations，html tags，emojis and transform into lowercase
def preprocess_text(text):
    text = emoji_pattern.sub(r'', text)
    text = text.lower()
    text = text.replace(",", " , ").replace(".", " . ").replace("-", " - ").replace("/", " / ")
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    words = text.split()
    filtered_sentence = " ".join(word.translate(table) for word in words if word not in stop_words)
    return filtered_sentence.strip()

processed_titles = [preprocess_text(title) for title in title_reviews]
processed_reviews = [preprocess_text(review) for review in content_reviews]

reviews_df = pd.DataFrame({
    "Movie Name": movie_name,
    "Review Title": processed_titles,
    "Review Content": processed_reviews,
    "Review Date": review_date
})

csv_filename = f"{movie_id}_Cleaned_Reviews.csv"
reviews_df.to_csv(csv_filename, index=False)

print(reviews_df.head())  

                  Movie Name                Review Title  \
0  Spider-Man: Far from Home            teen movie twist   
1  Spider-Man: Far from Home  one best spider  man movie   
2  Spider-Man: Far from Home                     amazing   
3  Spider-Man: Far from Home            effort execution   
4  Spider-Man: Far from Home                      adored   

                                      Review Content   Review Date  
0  movie breath fresh air  fun romp europe favour...   Sep 7, 2019  
1  amazing movie  many surprises movie  3d effect...  Jul 15, 2019  
2  movie much better 1st installment  cgi good gy...  Sep 11, 2019  
3  movie hands best mcu movie yet  beginning end ...   Jul 3, 2019  
4  like folks absoultley adored movie ton action ...  Jul 13, 2019  
