### Importing Selenium and the required tools for web scraping

In [2]:
import os

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

### Creating new WebDriver connections for opening the sites in Firefox using its WebDriver tool (geckodriver)

In [9]:
# Path to the geckodriver binary used to drive Firefox.
GECKODRIVER_PATH = 'geckodriver.exe'

# Two independent Firefox sessions are used: `driver` walks the genre result
# lists, while `driver1` opens each movie's review page so that the result
# list (and its open info popup) in `driver` is never navigated away from.
# Each session gets its own Firefox-specific service object; the Chrome
# service class is not meant to launch geckodriver.
service = FirefoxService(executable_path=GECKODRIVER_PATH)
driver = webdriver.Firefox(service=service)
service1 = FirefoxService(executable_path=GECKODRIVER_PATH)
driver1 = webdriver.Firefox(service=service1)

### Opening the IMDb search page to get the genre list

In [10]:
# Opening the search section to get the genre list
driver.get("https://www.imdb.com/search/title/")

# Expand the genre filter. The click is issued through JavaScript once the
# label is reported clickable.
genre_toggle = WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable((By.XPATH, '//*[@id="genreAccordion"]/div[1]/label'))
)
driver.execute_script("arguments[0].click();", genre_toggle)

# Wait for the expanded genre section to be visible before reading it:
# WebElement.text only returns rendered text, so reading the buttons too early
# would yield empty genre names.
section_xpath = '//*[@id="accordion-item-genreAccordion"]/div/section'
section_element = WebDriverWait(driver, 20).until(
    EC.visibility_of_element_located((By.XPATH, section_xpath))
)

# Find all genre button elements within the section
button_elements = section_element.find_elements(By.TAG_NAME, "button")

# Lower-cased button captions; they are later inserted into the search URL.
genre_list = [button.text.lower() for button in button_elements]

# Remove the short genre because it is not present in movies.
# Guarded so the cell does not fail if the site no longer lists that genre.
if 'short' in genre_list:
    genre_list.remove('short')

### Creating dataframes to collect the movie data for each genre and exporting to csv files

In [11]:
# Define the columns for DataFrames
columns = ['name', 'year', 'runtime', 'agerating', 'summary', 'userrating', 'noofusers', 'metascore', 'director', 'star1', 'star2', 'star3']

review_columns = ['name', 'reviewrate', 'reviewdate', 'reviewtext']
movie_reviews = {element: [] for element in review_columns}
# Names of movies whose reviews were already collected (a movie can appear
# under several genres).
movie_set = set()

# Number of DataFrames
num_genre = len(genre_list)

# Scraping limits and timeouts
MOVIES_PER_GENRE = 20    # result-list entries read per genre
REVIEWS_PER_MOVIE = 10   # review slots recorded per movie
WAIT_SECONDS = 20        # explicit-wait timeout for Selenium conditions
OUTPUT_DIR = 'genre_dump_csv'

# XPath building blocks. `{}` is replaced by a 1-based position.
# The result list entry is addressed through two spellings of the same path
# (via the "__next" id and via the absolute path), exactly as the lookups
# below have always used them.
RESULT_ITEM_BY_ID = '//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul/li[{}]'
RESULT_ITEM_ABS = '/html/body/div[2]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul/li[{}]'
POPUP_ROOT = '/html/body/div[4]/div[2]/div'
POPUP_BODY = POPUP_ROOT + '/div[2]/div/div'
REVIEW_ITEM = '/html/body/div[2]/div/div[2]/div[3]/div[1]/section/div[2]/div[2]/div[{}]'


def _text_or_default(root, by, locator, default=''):
    """Return the text of the first element matching the locator under `root`.

    `root` may be a driver or an element. Any lookup failure yields `default`
    instead of raising, because most scraped fields are optional. Only
    `Exception` is caught so that Ctrl-C / kernel interrupts still stop the run.
    """
    try:
        return root.find_element(by, locator).text
    except Exception:
        return default


def _item_or_default(items, index, default=''):
    """Return `items[index]`, or `default` when the list is too short."""
    return items[index] if index < len(items) else default


def _js_click_when_clickable(browser, xpath):
    """Wait until the element at `xpath` is clickable, then click it via JavaScript."""
    element = WebDriverWait(browser, WAIT_SECONDS).until(
        EC.element_to_be_clickable((By.XPATH, xpath))
    )
    browser.execute_script("arguments[0].click();", element)


def _collect_reviews(review_browser, movie_name, reviews_link):
    """Open `reviews_link` and append REVIEWS_PER_MOVIE rows to `movie_reviews`.

    A row is appended for every slot even when the page has fewer reviews;
    missing values are stored as empty strings so all columns stay aligned.
    """
    review_browser.get(reviews_link)
    for slot in range(1, REVIEWS_PER_MOVIE + 1):
        review_xpath = REVIEW_ITEM.format(slot)
        try:
            container = review_browser.find_element(By.XPATH, review_xpath)
            review_date = container.find_element(By.CLASS_NAME, 'review-date').text
        except Exception:
            review_date = ''
        movie_reviews['name'].append(movie_name)
        movie_reviews['reviewdate'].append(review_date)
        movie_reviews['reviewtext'].append(
            _text_or_default(review_browser, By.XPATH, review_xpath + '/div[1]/div[1]/a'))
        movie_reviews['reviewrate'].append(
            _text_or_default(review_browser, By.XPATH, review_xpath + '/div[1]/div[1]/div[1]/span'))


def _scrape_movie(list_browser, review_browser, position):
    """Scrape one entry of the currently loaded result list.

    Parameters:
        list_browser: driver showing the genre result list.
        review_browser: second driver used to open the movie's review page.
        position: 1-based index of the entry in the result list.

    Returns:
        dict with one value for every name in `columns`.

    Selenium exceptions propagate when a mandatory element (the list entry,
    its title link, the info button, the popup title or the popup close
    button) cannot be found; optional fields fall back to '' ('0' for
    'noofusers').
    """
    item_by_id = RESULT_ITEM_BY_ID.format(position)
    item_abs = RESULT_ITEM_ABS.format(position)
    details_xpath = item_by_id + '/div[1]/div/div/div[1]/div[2]'

    # Scroll the entry into view once it is visible
    details_element = WebDriverWait(list_browser, WAIT_SECONDS).until(
        EC.visibility_of_element_located((By.XPATH, details_xpath))
    )
    list_browser.execute_script("arguments[0].scrollIntoView();", details_element)

    # Build the review-page URL from the title link with its query string removed.
    # The trailing slash is normalised so the URL is well-formed either way.
    title_link = list_browser.find_element(
        By.XPATH, item_abs + '/div[1]/div/div/div[1]/div[2]/div[1]/a').get_attribute('href')
    title_link = title_link.split('?')[0]
    reviews_link = title_link.rstrip('/') + '/reviews?sort=submissionDate&dir=desc&ratingFilter=0'

    # The entry's text is split into lines and fields are taken by position.
    # NOTE(review): the positions assume every entry lists the same fields in
    # the same order; an entry lacking one of them would shift the later
    # values -- confirm against the live page.
    detail_lines = list_browser.find_element(By.XPATH, details_xpath).text.split('\n')
    record = {
        'year': _item_or_default(detail_lines, 1),
        'runtime': _item_or_default(detail_lines, 2),
        'agerating': _item_or_default(detail_lines, 3),
        'userrating': _item_or_default(detail_lines, 4),
        'metascore': _item_or_default(detail_lines, 7),
    }

    # The first five characters of the vote text are dropped
    # (presumably a label prefix -- TODO confirm); '0' is stored when absent.
    votes_text = _text_or_default(
        list_browser, By.XPATH, item_by_id + '/div[1]/div/div/div[2]/div[2]', default=None)
    record['noofusers'] = '0' if votes_text is None else votes_text[5:]

    # Clicking the info button to get more details of the movie
    _js_click_when_clickable(list_browser, item_abs + '/div[2]/button')

    # Best-effort wait for the popup content: the awaited element may be
    # missing for some movies, so a timeout here is deliberately tolerated.
    try:
        WebDriverWait(list_browser, WAIT_SECONDS).until(
            EC.visibility_of_element_located((By.XPATH, POPUP_BODY + '/div[3]/div[1]/ul/li'))
        )
    except Exception:
        pass

    # Getting name, summary, director, stars of the movie
    movie_name = list_browser.find_element(
        By.XPATH, POPUP_BODY + '/div[1]/div[2]/div[1]/a/h3').text
    if movie_name not in movie_set:
        movie_set.add(movie_name)
        _collect_reviews(review_browser, movie_name, reviews_link)

    record['name'] = movie_name
    record['summary'] = _text_or_default(list_browser, By.XPATH, POPUP_BODY + '/div[2]')
    record['director'] = _text_or_default(list_browser, By.XPATH, POPUP_BODY + '/div[3]/div[1]/ul/li')
    for star_no in (1, 2, 3):
        record['star' + str(star_no)] = _text_or_default(
            list_browser, By.XPATH, POPUP_BODY + '/div[3]/div[2]/ul/li[' + str(star_no) + ']/a')

    # Click the close button of the more info popup
    _js_click_when_clickable(list_browser, POPUP_ROOT + '/div[1]/button')
    return record


# The CSV files are written into this folder; create it if it is missing.
os.makedirs(OUTPUT_DIR, exist_ok=True)

try:
    # Create and export DataFrames for each genre
    for i in range(num_genre):
        genre = genre_list[i]
        # Opening the link for each genre to get the top 20 movies (sorted by number of votes)
        driver.get("https://www.imdb.com/search/title/?title_type=feature&genres="+genre+"&sort=num_votes,desc")

        # Creating a dictionary to store the movie details
        data = {element: [] for element in columns}

        for j in range(1, MOVIES_PER_GENRE + 1):
            movie_record = _scrape_movie(driver, driver1, j)
            # Append a complete record at once so every column keeps the same length
            for column in columns:
                data[column].append(movie_record[column])

        # Converting to dataframe and exporting to csv
        df = pd.DataFrame(data)
        df.to_csv(OUTPUT_DIR + '/' + genre + '.csv', index=False)

    df = pd.DataFrame(movie_reviews)
    df.to_csv(OUTPUT_DIR + '/movie_reviews.csv', index=False)
finally:
    # Closing the webdriver connections, even when scraping failed part-way,
    # so no browser/geckodriver processes are left behind.
    try:
        driver.quit()
    finally:
        driver1.quit()