# IMDB - Web Scraping with Beautiful Soup

### Step 1: Import Libraries

In [1]:
from bs4 import BeautifulSoup # For parsing the HTML page
import pandas as pd # Data Manipulation and export
from requests import get # Request URL to access content

### Step 2: Create a variable url which contains the link

In [2]:
url = 'https://www.imdb.com/search/title/?title_type=feature&release_date=2000-01-01,2021-09-30&count=100'
response = get(url)

 ### Step 3: Print the response

In [3]:
print(response.text[:500])



<!DOCTYPE html>
<html
    xmlns:og="http://ogp.me/ns#"
    xmlns:fb="http://www.facebook.com/2008/fbml">
    <head>
         

        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">




        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>

<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
</script>
  <script>(function(t){ (t.events = t.events || {})["csm_head_pre_


 ### Step 3: Use Beautiful Soup to parse the data

In [4]:
html_soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')

In [6]:
first_movie = movie_containers[0]

### Step 4: Initiliaze the lists

In [7]:
names = []
years = []
imdb_ratings = []
votes = []

### Step 5: Start storing the data

In [8]:
for countainer in movie_containers:
    name = countainer.h3.a.text
    names.append(name)
    year = countainer.h3.find('span', class_ = 'lister-item-year text-muted unbold').text
    start_year = year.find('2')
    years.append(year[start_year:start_year+4])
    imdb_rating = float(first_movie.strong.text)
    imdb_ratings.append(imdb_rating)
    vote = int(first_movie.find('span', attrs = {'name':'nv'})['data-value'])
    votes.append(vote)

### Step 6: Verify the stored data

In [9]:
print(names)
print(years)
print(imdb_ratings)
print(votes)    

['Dune', 'Shang-Chi and the Legend of the Ten Rings', 'Free Guy', 'Cry Macho', 'Malignant', 'The Many Saints of Newark', 'Old', 'Candyman', 'Kate', 'No Time to Die', 'Venom: Let There Be Carnage', 'Intrusion', 'Dear Evan Hansen', 'The Suicide Squad', 'Cinderella', 'Halloween Kills', 'Cruella', 'The Last Duel', 'The Tragedy of Macbeth', 'The Voyeurs', 'Spencer', 'Nightbooks', 'The Guilty', "Everybody's Talking About Jamie", 'Black Widow', 'The Card Counter', 'Prisoners of the Ghostland', 'The Green Knight', 'The Father Who Moves Mountains', 'My Son', 'Copshop', 'F9: The Fast Saga', 'Gunpowder Milkshake', 'The Stronghold', 'The Starling', 'Reminiscence', 'Man on Fire', "Don't Breathe 2", 'The Eyes of Tammy Faye', 'The French Dispatch', 'Last Night in Soho', 'Escape Room: Tournament of Champions', '365 Days', 'Venom', 'Jungle Cruise', 'Once Upon a Time... In Hollywood', 'The Power of the Dog', 'After We Fell', "Harry Potter and the Sorcerer's Stone", 'Knives Out', 'Mortal Kombat', 'Avenge

### Step 7: Store the data

In [10]:
df = pd.DataFrame({'Name':names, 'Year':years, 'Rating':imdb_ratings, 'Vote':votes})
df.to_excel('movies.xlsx', index=False, encoding='utf-8')

### Step 8: Check the final output

In [11]:
df.head()

Unnamed: 0,Name,Year,Rating,Vote
0,Dune,2021,8.4,54479
1,Shang-Chi and the Legend of the Ten Rings,2021,8.4,54479
2,Free Guy,2021,8.4,54479
3,Cry Macho,2021,8.4,54479
4,Malignant,2021,8.4,54479
