# IMDB - Web Scraping with Beautiful Soup

### Step 1: Import Libraries

In [1]:
from bs4 import BeautifulSoup # For parsing the HTML page
import pandas as pd # Data Manipulation and export
from requests import get # Request URL to access content

### Step 2: Create a variable url which contains the link

In [2]:
# Release year to scrape. Kept as a string because it is spliced into both the
# search URL below and the output file name later in the notebook.
# (Original note: "enter any year above 2000" - TODO confirm whether earlier
# years are actually unsupported.)
year = '2000'
# Title search for that release year, sorted by vote count descending, page 1.
url = 'http://www.imdb.com/search/title?release_date={}&sort=num_votes,desc&page=1'.format(year)
# requests applies no timeout unless one is given, so without it a stalled
# connection would block this cell forever.
response = get(url, timeout=30)
# Fail right here on a 4xx/5xx answer instead of parsing an error page and
# getting a confusing failure (or empty data) in a later cell.
response.raise_for_status()

 ### Step 3: Print the response

In [3]:
# Peek at the first 500 characters of the response body to confirm HTML came back.
page_head = response.text[:500]
print(page_head)



<!DOCTYPE html>
<html
    xmlns:og="http://ogp.me/ns#"
    xmlns:fb="http://www.facebook.com/2008/fbml">
    <head>
         

        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">




        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>

<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
</script>
  <script>(function(t){ (t.events = t.events || {})["csm_head_pre_


 ### Step 3: Use Beautiful Soup to parse the data

In [4]:
# Build a navigable tree from the downloaded page using Python's built-in HTML parser.
html_soup = BeautifulSoup(markup=response.text, features='html.parser')

In [5]:
# Collect every <div> whose class attribute is exactly 'lister-item mode-advanced';
# the scraping loop further down treats each match as one title's container.
movie_containers = html_soup.find_all(name='div', class_='lister-item mode-advanced')

In [6]:
# Keep the first matched container around for interactive inspection; no later
# cell visible in this notebook reads it. Raises IndexError if the search
# above matched nothing.
first_movie = movie_containers[0]

### Step 4: Initialize the lists

In [7]:
# One accumulator per scraped column: title, IMDb rating, and vote count.
# Three separate list objects are created, not one list under three names.
names, imdb_ratings, votes = [], [], []

### Step 5: Start storing the data

In [8]:
# Start from empty lists every time this cell runs; appending to lists created
# in an earlier cell would duplicate every row when the cell is re-run.
names, imdb_ratings, votes = [], [], []

for container in movie_containers:
    # Extract all three fields before appending any of them, so a parse error
    # on one title cannot leave the lists with different lengths.
    # NOTE(review): assumes every container has an <h3><a> title, a <strong>
    # rating and a <span name="nv" data-value=...>; a missing tag raises here.
    name = container.h3.a.text
    imdb_rating = float(container.strong.text)
    vote = int(container.find('span', attrs={'name': 'nv'})['data-value'])

    names.append(name)
    imdb_ratings.append(imdb_rating)
    votes.append(vote)

### Step 6: Verify the stored data

In [9]:
# Dump the three collected lists, one per line, to eyeball the scrape.
print(names, imdb_ratings, votes, sep='\n')

['Gladiator', 'Memento', 'Snatch', 'Requiem for a Dream', 'X-Men', 'Cast Away', 'American Psycho', 'Unbreakable', 'Mission: Impossible II', 'Meet the Parents', 'O Brother, Where Art Thou?', 'Gone in 60 Seconds', 'Almost Famous', 'The Patriot', 'Crouching Tiger, Hidden Dragon', 'Scary Movie', 'Final Destination', 'Amores Perros', 'Pitch Black', 'The Beach', 'Me, Myself & Irene', 'How the Grinch Stole Christmas', 'Remember the Titans', 'Traffic', 'What Women Want', 'Miss Congeniality', "The Emperor's New Groove", 'Erin Brockovich', 'Chocolat', 'Chicken Run', "Charlie's Angels", 'Battle Royale', 'High Fidelity', 'Road Trip', 'The Perfect Storm', 'In the Mood for Love', "Dude, Where's My Car?", 'Billy Elliot', 'Scream 3', 'Hollow Man', 'Malcolm in the Middle', 'Shanghai Noon', 'What Lies Beneath', 'The 6th Day', 'The Whole Nine Yards', 'Pay It Forward', 'Curb Your Enthusiasm', 'Men of Honor', 'Gilmore Girls', 'Coyote Ugly']
[8.5, 8.4, 8.3, 8.3, 7.4, 7.8, 7.6, 7.3, 6.1, 7.0, 7.7, 6.5, 7.9, 

### Step 7: Store the data

In [10]:

# Assemble the three parallel lists into one table, one row per title.
df = pd.DataFrame({'Name': names, 'Rating': imdb_ratings, 'Vote': votes})
# Write the table to an .xlsx file named after the scraped year, without the
# index column. The former `encoding='utf-8'` argument is dropped: pandas
# deprecated it in 1.5 and removed it in 2.0 (where passing it raises
# TypeError), and it was never used by the .xlsx writers.
df.to_excel('Watchlist of {}.xlsx'.format(year), index=False)

### Step 8: Check the final output

In [11]:
# Show the first five rows (the default for head) as this cell's output.
df.head(n=5)

Unnamed: 0,Name,Rating,Vote
0,Gladiator,8.5,1409027
1,Memento,8.4,1175781
2,Snatch,8.3,816189
3,Requiem for a Dream,8.3,799839
4,X-Men,7.4,590443
