## **Scrapping top 50 movies data from IMDB**
Here, we are going to scrap the data of top 50 movies from [IMDB](https://www.imdb.com/list/ls055386972/).

In [1]:
### importing libraries
import requests
from bs4 import BeautifulSoup

# url of the webpage that we want to scrap
url = 'https://www.imdb.com/list/ls055386972/'
# making get request
r = requests.get(url)
# getting page content
content = r.content
soup = BeautifulSoup(content, 'html.parser')

In [2]:
# getting the title of the page
title = soup.title
title.string

'The 50 Best Movies Ever Made - IMDb'

In [3]:
for d in soup.find_all('div', class_='lister-item-content'):
    print('FILM :',d.find('a').get_text(), end='  ')
    print('TIME :',d.find('span', class_='runtime').get_text(), end='  ')
    print('YEAR :',d.find('span', class_='lister-item-year text-muted unbold').get_text(), end='  ')
    try:
        print('CERT :',d.find('span', class_='certificate').get_text(), end='  ')
    except:
        print('CERT :',None,end='  ')
    print('RATE :', d.find('span', class_='ipl-rating-star__rating').get_text(), end='  ') 
    print('GENRE :',d.find('span', class_='genre').get_text(), end='  ')
    try:
        print('META :',d.find('span', class_='metascore favorable').get_text(), end='  ')
    except:
        print('META :',None,end='  ')
    for ddd in d.find_all('span'):
        if (ddd.get('name') == 'nv'):
            print(ddd.get('data-value'))
    print('\n\n')

FILM : The Godfather  TIME : 175 min  YEAR : (1972)  CERT : A  RATE : 9.2  GENRE : 
Crime, Drama              META : 100          1628465
134,966,411



FILM : Schindler's List  TIME : 195 min  YEAR : (1993)  CERT : A  RATE : 8.9  GENRE : 
Biography, Drama, History              META : 94          1218207
96,898,818



FILM : 12 Angry Men  TIME : 96 min  YEAR : (1957)  CERT : U  RATE : 9  GENRE : 
Crime, Drama              META : 96          693484
4,360,000



FILM : La vita è bella  TIME : 116 min  YEAR : (1997)  CERT : U  RATE : 8.6  GENRE : 
Comedy, Drama, Romance              META : None  626528
57,598,247



FILM : Il buono, il brutto, il cattivo  TIME : 161 min  YEAR : (1966)  CERT : A  RATE : 8.8  GENRE : 
Western              META : 90          691081
6,100,000



FILM : The Shawshank Redemption  TIME : 142 min  YEAR : (1994)  CERT : A  RATE : 9.3  GENRE : 
Drama              META : 80          2353292
28,341,469



FILM : The Pursuit of Happyness  TIME : 117 min  YEAR : (2006)

In [4]:
# list to store data
FILM = []
TIME = []
YEAR = []
CERTIFICATE = []
IMDB = []
META = []
VOTES = []
GROSS = []
GENRE = []
for d in soup.find_all('div', class_='lister-item-content'):
    FILM.append(d.find('a').get_text())
    TIME.append(d.find('span', class_='runtime').get_text())
    YEAR.append(d.find('span', class_='lister-item-year text-muted unbold').get_text())
    try:
        CERTIFICATE.append(d.find('span', class_='certificate').get_text())
    except:
        CERTIFICATE.append(None)
    IMDB.append(d.find('span', class_='ipl-rating-star__rating').get_text()) 
    GENRE.append(d.find('span', class_='genre').get_text())
    try:
        META.append(d.find('span', class_='metascore favorable').get_text())
    except:
        META.append(None)
    st = ''    
    for ddd in d.find_all('span'):
        if (ddd.get('name') == 'nv'):
            st += ddd.get('data-value') + ' '
    VOTES.append(st)

In [5]:
# checking whether data is missing or not
print(len(FILM))
print(len(TIME))
print(len(YEAR))
print(len(CERTIFICATE))
print(len(IMDB))
print(len(GENRE))
print(len(META))
print(len(VOTES))  # <----- contains both votes and gross earned

50
50
50
50
50
50
50
50


In [6]:
import pandas as pd
# making dataframe
imdb = pd.DataFrame({
    'film':FILM, 'time':TIME, 'year':YEAR, 'certificate':CERTIFICATE, 'imdb_rating':IMDB,
    'genre':GENRE, 'metascore':META, 'votes':VOTES
}, index=range(1,51))
imdb.head(50)

Unnamed: 0,film,time,year,certificate,imdb_rating,genre,metascore,votes
1,The Godfather,175 min,(1972),A,9.2,"\nCrime, Drama",100.0,"1628465 134,966,411"
2,Schindler's List,195 min,(1993),A,8.9,"\nBiography, Drama, History",94.0,"1218207 96,898,818"
3,12 Angry Men,96 min,(1957),U,9.0,"\nCrime, Drama",96.0,"693484 4,360,000"
4,La vita è bella,116 min,(1997),U,8.6,"\nComedy, Drama, Romance",,"626528 57,598,247"
5,"Il buono, il brutto, il cattivo",161 min,(1966),A,8.8,\nWestern,90.0,"691081 6,100,000"
6,The Shawshank Redemption,142 min,(1994),A,9.3,\nDrama,80.0,"2353292 28,341,469"
7,The Pursuit of Happyness,117 min,(2006),U,8.0,"\nBiography, Drama",64.0,"450794 163,566,459"
8,Shichinin no samurai,207 min,(1954),U,8.6,"\nAction, Adventure, Drama",98.0,"316976 269,061"
9,The Intouchables,112 min,(2011),UA,8.5,"\nBiography, Comedy, Drama",,"764687 13,182,281"
10,Central do Brasil,110 min,(1998),R,8.0,\nDrama,80.0,"36556 5,595,428"


In [7]:
# saving data to a csv file
# imdb.to_csv('imdb_tp_50.csv', index=False)