#### Imports

In [42]:
from selenium import webdriver
import pandas as pd 
import time
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from bs4 import BeautifulSoup
import requests

### Automation with Selenium

#### Setup Driver 

In [43]:
driver = webdriver.Chrome(executable_path= 'C:\webdrivers\chromedriver.exe')
driver.get('https://imdb.com')
driver.implicitly_wait(5) # seconds


  driver = webdriver.Chrome(executable_path= 'C:\webdrivers\chromedriver.exe')


In [65]:
# maximize window
driver.maximize_window()

# dropdown
dropdown = driver.find_element(By.CLASS_NAME, 'ipc-icon--arrow-drop-down')
dropdown.click()

# avanced search from dropdown menu
element = driver.find_element(By.LINK_TEXT,'Advanced Search')
element.click()


# click on avanced title search
adv_title = driver.find_element(By.LINK_TEXT,'Advanced Title Search')
adv_title.click() 

# select feature film
feature_film = driver.find_element(By.ID,'title_type-1')
feature_film.click() 

# select tv movie
tv_movie = driver.find_element(By.ID,'title_type-2')
tv_movie.click()

# min date
min_date = driver.find_element(By.NAME,'release_date-min')
min_date.click()
min_date.send_keys('1990')
# max date
max_date = driver.find_element(By.NAME,'release_date-max')
max_date.click()
max_date.send_keys('2020')

# rating min
rating_min = driver.find_element(By.NAME,'user_rating-min')
rating_min.click()
dropdown_2 = Select(rating_min)
dropdown_2.select_by_visible_text('1.0')

# rating max
rating_max = driver.find_element(By.NAME,'user_rating-max')
rating_max.click()
dropdown_3 = Select(rating_max)
dropdown_3.select_by_visible_text('10')

# oscar nominated
oscar_nominated = driver.find_element(By.ID,'groups-7')
oscar_nominated.click()

# color
color = driver.find_element(By.ID,'colors-1')
color.click()

# language
language = driver.find_element(By.NAME,'languages')
dropdown_4 = Select(language)
dropdown_4.select_by_visible_text('English')

# 250 results
results_count = driver.find_element(By.ID,'search-count')
dropdown_5 = Select(results_count)
dropdown_5.select_by_index(2)

# submit
submit = driver.find_element(By.XPATH, '(//button[@type="submit"])[2]')
submit.click()




### Extract Information using BS4

In [77]:
# current URL
soup_url = driver.current_url 
soup_url

'https://www.imdb.com/search/title/?title_type=feature,tv_movie&release_date=1990-01-01,2020-12-31&user_rating=1.0,10.0&groups=oscar_nominee&colors=color&languages=en&count=250'

#### Prepare soup 

In [80]:
# get request
response = requests.get(soup_url)
soup_url
# soup object
soup = BeautifulSoup(response.content, 'html.parser')

In [81]:
soup


<!DOCTYPE html>

<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>
<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
</script>
<script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>
<title>Feature Film/TV Movie,
Released between 1990-01-01 and 2020-12-31,
User Rating between 1 and 10,
Oscar-Nominated,
Color,
English
(Sorted by Popularity Ascending) - IMDb</title>
<script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>
<script>
    if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
</script>
<script>
    if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
    }
</script>
<link href="https://www.imdb.com/search/title/

### Start gathering information 

In [82]:
# result items (start point)
list_items = soup.find_all('div', {'class':'lister-item'})
len(list_items)

250

#### Extract Data
- title
- year
- rating
- genre
- duration

In [72]:
# make list of parameters and run through loop comprehension
movie_title = [result.find('h3').find('a').get_text() for result in list_items]
year = [result.find('h3').find('span', {'class':'lister-item-year'}).get_text().replace('(', '').replace(')', '') for result in list_items]
duration = [result.find('span', {'class':'runtime'}).get_text() for result in list_items]
genre = [result.find('span', {'class':'genre'}).get_text().strip() for result in list_items]
rating = [result.find('div', {'class':'ratings-imdb-rating'}).get_text().strip() for result in list_items]

In [73]:
# create dataframe
imdb_df = pd.DataFrame({'Movie Title': movie_title, 'Year': year, 'Duration':duration,
                       'Genre': genre, 'Rating':rating})

In [74]:
imdb_df

Unnamed: 0,Movie Title,Year,Duration,Genre,Rating
0,Knives Out,2019,130 min,"Comedy, Crime, Drama",7.9
1,Avatar,2009,162 min,"Action, Adventure, Fantasy",7.9
2,Black Panther,2018,134 min,"Action, Adventure, Sci-Fi",7.3
3,Interstellar,2014,169 min,"Adventure, Drama, Sci-Fi",8.6
4,The Shawshank Redemption,1994,142 min,Drama,9.3
...,...,...,...,...,...
245,8 Mile,2002,110 min,"Drama, Music",7.2
246,The Favourite,2018,119 min,"Biography, Comedy, Drama",7.5
247,Lost in Translation,2003,102 min,"Comedy, Drama",7.7
248,Moonlight,I 2016,111 min,Drama,7.4


####  Import dataframe to CSV

In [76]:
imdb_df.to_csv('IMDb_Data_multiple_pages.csv', index=False)