# IMDB - Web Scraping with Beautiful Soup

### Step 1: Import Libraries

In [16]:
from bs4 import BeautifulSoup # For parsing the HTML page
import pandas as pd # Data Manipulation and export
from requests import get # Request URL to access content

### Step 2: Create a variable url which contains the link

In [17]:
# Release year to search for. Use 2000 or later: the year parsing further
# down locates the year by searching for the first '2' in the label.
year = '2020'
url = 'http://www.imdb.com/search/title?release_date=' + year + '&sort=num_votes,desc&page=1'

# Bound the wait with a timeout so a stalled connection cannot hang the kernel,
# and fail fast on HTTP errors so an error page is never parsed as results.
response = get(url, timeout=30)
response.raise_for_status()

### Step 3: Print the response

In [18]:
# Sanity check: show only the first 500 characters of the fetched HTML
# rather than dumping the whole page into the notebook output.
print(response.text[0:500])



<!DOCTYPE html>
<html
    xmlns:og="http://ogp.me/ns#"
    xmlns:fb="http://www.facebook.com/2008/fbml">
    <head>
         

        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">




        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>

<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
</script>
  <script>(function(t){ (t.events = t.events || {})["csm_head_pre_


### Step 3 (continued): Use Beautiful Soup to parse the data

In [19]:
# Parse the downloaded page with Python's built-in HTML parser.
html_soup = BeautifulSoup(markup=response.text, features='html.parser')

In [20]:
# Each search result sits in a <div> whose class attribute is exactly
# "lister-item mode-advanced"; collect all of them.
movie_containers = html_soup.find_all(
    name='div',
    attrs={'class': 'lister-item mode-advanced'},
)

In [21]:
# Keep a handle on the first result for interactive inspection of its markup.
# An empty result list means nothing matched the selector (for example the
# request was blocked or the page layout changed), so say so explicitly
# instead of surfacing a bare "list index out of range".
if not movie_containers:
    raise IndexError(
        "No 'lister-item mode-advanced' containers found in the response; "
        "check the request and the page markup."
    )
first_movie = movie_containers[0]

### Step 4: Initialize the lists

In [22]:
# One accumulator per output column; the scraping loop appends to each of
# them once per title so the four lists stay aligned by position.
names, years, imdb_ratings, votes = [], [], [], []

### Step 5: Start storing the data

In [23]:
# Walk every result container and append one entry per column list.
# Every branch appends exactly once, so the four lists stay the same length
# even when a title is missing a field (None is stored in that case).
for container in movie_containers:
    # Title text lives in the link inside the <h3> header.
    names.append(container.h3.a.text)

    # Use a dedicated name here: reusing `year` would overwrite the search
    # year chosen earlier, which is later used to build the export filename.
    year_label = container.h3.find('span', class_='lister-item-year text-muted unbold').text
    # The label wraps the year in extra characters, so locate the first '2'
    # and take the four characters starting there.
    year_start = year_label.find('2')
    years.append(year_label[year_start:year_start + 4] if year_start != -1 else None)

    # A container may have no <strong> rating element; store None rather
    # than crash on `None.text`.
    rating_tag = container.strong
    imdb_ratings.append(float(rating_tag.text) if rating_tag is not None else None)

    # The vote count is read from the data-value attribute of the
    # <span name="nv"> element.
    votes_tag = container.find('span', attrs={'name': 'nv'})
    votes.append(int(votes_tag['data-value']) if votes_tag is not None else None)

### Step 6: Verify the stored data

In [24]:
# Print each collected column in turn to eyeball the scraped values.
for collected in (names, years, imdb_ratings, votes):
    print(collected)

['Tenet', "The Queen's Gambit", 'Soul', 'Wonder Woman 1984', 'Birds of Prey', 'The Invisible Man', 'Extraction', 'A Quiet Place Part II', 'The Trial of the Chicago 7', 'Enola Holmes', 'The Old Guard', 'Bad Boys for Life', 'Mulan', 'Promising Young Woman', 'Palm Springs', 'Scam 1992: The Harshad Mehta Story', 'Nomadland', 'Borat Subsequent Moviefilm', 'Ted Lasso', 'Onward', 'Dil Bechara', 'I Care a Lot', 'Another Round', 'The Devil All the Time', 'Love and Monsters', 'Sonic the Hedgehog', 'The Father', 'Greenland', 'The Last Dance', 'Sadak 2', 'Soorarai Pottru', 'The Hunt', 'Bridgerton', 'The Haunting of Bly Manor', 'Eurovision Song Contest: The Story of Fire Saga', 'Greyhound', 'Spenser Confidential', 'Project Power', 'Dara of Jasenovac', 'The Undoing', 'The Midnight Sky', 'News of the World', 'Hamilton', 'The Social Dilemma', "I'm Thinking of Ending Things", 'Underwater', 'Tiger King', 'The Outsider', 'Coolie No. 1', 'Bloodshot']
['2020', '2020', '2020', '2020', '2020', '2020', '2020'

### Step 7: Store the data

In [25]:

# Assemble the four aligned lists into one table, one row per title.
df = pd.DataFrame({'Name': names, 'Year': years, 'Rating': imdb_ratings, 'Vote': votes})

# Export without the index column. The `encoding` argument is intentionally
# omitted: it was deprecated in pandas 1.5 and removed in pandas 2.0, where
# passing it raises TypeError.
df.to_excel('movies of ' + year + '.xlsx', index=False)

### Step 8: Check the final output

In [26]:
# Show the first five rows as a quick check of the assembled table.
df.head(n=5)

Unnamed: 0,Name,Year,Rating,Vote
0,Tenet,2020,7.4,432810
1,The Queen's Gambit,2020,8.6,361473
2,Soul,2020,8.1,283259
3,Wonder Woman 1984,2020,5.4,235966
4,Birds of Prey,2020,6.1,212769
