In [1]:
import os
import urllib.request

import requests
from bs4 import BeautifulSoup
import pandas as pd
import cv2, urllib, numpy as np
import matplotlib.pyplot as plt

In [7]:
#create scraper class 
class IMDBScraper():
    """Scrape IMDb's horror title-search listing into a pandas DataFrame.

    The constructor downloads ``page_num`` search-result pages and parses them
    with BeautifulSoup. The public methods then work on those parsed pages:

    * ``get_movie_info`` appends one row per listed title to
      ``movies_dataframe`` (title, image URL, release date, certificate,
      duration, genre, rating, description, votes, directors, stars).
    * ``get_movie_images`` downloads the image referenced by each row's
      ``movie_image`` value, saves it under ``movie_images/`` and adds a
      ``movies_local_paths`` column holding the saved path (``None`` when no
      image could be saved for that row).
    * ``save_data`` writes the dataframe to ``imdb_data.json``.
    """

    # Search URL template; ``{}`` receives the 1-based offset of the first result.
    SEARCH_URL = 'https://www.imdb.com/search/title/?genres=horror&start={}&explore=title_type,genres&ref_=adv_nxt'
    # Step between the ``start`` offsets of two consecutive result pages.
    RESULTS_PER_PAGE = 50
    # Seconds to wait on a single HTTP request before giving up.
    REQUEST_TIMEOUT = 30
    # Directory (relative to the working directory) that receives the images.
    IMAGE_DIR = 'movie_images'

    # Columns produced by ``_parse_fields``, in dataframe column order.
    _FIELD_NAMES = (
        'movie_title', 'movie_image', 'release_date', 'certificate',
        'duration', 'genre', 'rating', 'description', 'votes',
    )
    # (column, opening marker, closing marker) for values that sit on the
    # same line as their opening tag.
    _SAME_LINE_FIELDS = (
        ('release_date', '<span class="lister-item-year text-muted unbold">', '</span>'),
        ('certificate', '<span class="certificate">', '</span>'),
        ('duration', '<span class="runtime">', '</span>'),
        ('rating', '<div class="inline-block ratings-imdb-rating" data-value="', '" name="ir"'),
    )

    def __init__(self, page_num):
        """Download and parse ``page_num`` consecutive search-result pages.

        Args:
            page_num: Number of result pages to fetch, starting at the first.

        Raises:
            requests.exceptions.RequestException: If a page cannot be fetched
                (including exceeding ``REQUEST_TIMEOUT``).
        """
        self.urls = {}
        self.pages = {}
        self.movie_info = {}
        self.movies_dataframe = pd.DataFrame()
        self.movie_links = {}
        self.page_num = page_num

        # The three dictionaries share the same "page_num: N" keys (N is 1-based).
        for page_index in range(1, page_num + 1):
            key = "page_num: {}".format(page_index)
            start = self.RESULTS_PER_PAGE * (page_index - 1) + 1
            self.urls[key] = self.SEARCH_URL.format(start)
            # A timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(self.urls[key], timeout=self.REQUEST_TIMEOUT)
            soup = BeautifulSoup(response.text, "html.parser")
            self.pages[key] = soup
            self.movie_links[key] = [
                'https://www.imdb.com' + header.find_all('a')[0].attrs['href']
                for header in soup.find_all(attrs={'class': 'lister-item-header'})
            ]

    @staticmethod
    def _between(text, start, end):
        """Return the part of ``text`` between ``start`` and the next ``end``.

        Returns ``None`` when either marker is missing. An empty ``start``
        anchors the extraction at the beginning of ``text``.
        """
        begin = text.find(start)
        if begin == -1:
            return None
        begin += len(start)
        stop = text.find(end, begin)
        if stop == -1:
            return None
        return text[begin:stop]

    @classmethod
    def _parse_fields(cls, lines):
        """Extract the per-title scalar fields from one listing's HTML lines.

        Every field defaults to ``None``; a later matching line overwrites an
        earlier one. Lines whose markers are incomplete are ignored.
        """
        record = dict.fromkeys(cls._FIELD_NAMES)
        for pos, line in enumerate(lines):
            found = {}
            if 'a href="/title/' in line and 'img' in line:
                found['movie_title'] = cls._between(line, 'img alt="', '" class=')
                image = cls._between(line, 'loadlate="', '" src="https:')
                if image is not None:
                    # Drop the thumbnail-size token from the image URL.
                    image = image.replace('UX67_CR0,0,67,98_AL_', '')
                found['movie_image'] = image
            for field, start, end in cls._SAME_LINE_FIELDS:
                found[field] = cls._between(line, start, end)
            # The genre text is on the line after its opening tag.
            if '<span class="genre">' in line and pos + 1 < len(lines):
                found['genre'] = cls._between(lines[pos + 1], '', '            </span>')
            # The description closes on the line before the credits paragraph.
            if '<p class="">' in line and pos > 0:
                found['description'] = cls._between(lines[pos - 1], '', '</p>')
            # The vote count is two lines below the votes paragraph tag.
            if '<p class="sort-num_votes-visible">' in line and pos + 2 < len(lines):
                found['votes'] = cls._between(
                    lines[pos + 2], '<span data-value="', '" name="nv">')
            record.update({k: v for k, v in found.items() if v is not None})
        return record

    @classmethod
    def _credit_names(cls, lines, start_markers, end_markers=()):
        """Collect the linked names listed after a credits marker line.

        Args:
            lines: HTML lines of one listing.
            start_markers: Candidate marker lines (exact match) opening the
                section, in priority order.
            end_markers: Candidate marker lines closing the section, in
                priority order; the section runs to the end when none match.

        Returns:
            The names joined with ``','`` (possibly ``''``), or ``None`` when
            no start marker is present.
        """
        begin = next((lines.index(m) + 1 for m in start_markers if m in lines), None)
        if begin is None:
            return None
        end = next((lines.index(m) for m in end_markers if m in lines), len(lines))
        names = []
        for line in lines[begin:end]:
            if '<a href="/name/' in line:
                name = cls._between(line, '/">', '</a>')
                if name is not None:
                    names.append(name)
        return ','.join(names)

    def get_movie_info(self):
        """Parse every downloaded page and append one row per title.

        Rows are appended to ``movies_dataframe`` and the index is renumbered
        from zero. The dataframe is left untouched when no listing is found.
        """
        records = []
        for page in self.pages.values():
            for block in page.find_all(attrs={'class': 'lister-item mode-advanced'}):
                # One list element per line of the listing's HTML.
                lines = str(block).split('\n')
                record = self._parse_fields(lines)
                record['directors'] = self._credit_names(
                    lines,
                    ('    Directors:', '    Director:'),
                    ('    Stars:', '    Star:'),
                )
                record['stars'] = self._credit_names(
                    lines, ('    Stars:', '    Star:'))
                records.append(record)

        if not records:
            return
        # Build the new rows in one go instead of concatenating row by row.
        new_rows = pd.DataFrame(records)
        if self.movies_dataframe.shape == (0, 0):
            self.movies_dataframe = new_rows
        else:
            self.movies_dataframe = pd.concat(
                [self.movies_dataframe, new_rows], axis=0, ignore_index=True)

    def _save_image(self, url, path):
        """Download ``url`` and write it to ``path``; return ``True`` on success.

        Returns ``False`` when ``url`` is missing, the response is empty, the
        bytes cannot be decoded as an image, or the file cannot be written.
        Network errors raised by ``urlopen`` are propagated.
        """
        if not isinstance(url, str) or not url:
            return False
        # The context manager closes the HTTP response as soon as it is read.
        with urllib.request.urlopen(url, timeout=self.REQUEST_TIMEOUT) as response:
            image_array = np.asarray(bytearray(response.read()))
        if image_array.size == 0:
            return False
        img = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
        if img is None:
            return False
        # imdecode yields BGR and imwrite expects BGR, so no colour conversion
        # is applied; converting to RGB here would swap red and blue on disk.
        return bool(cv2.imwrite(path, img))

    def get_movie_images(self):
        """Save each row's image locally and record the paths.

        Images are written to ``movie_images/movie_number_<row position>.jpg``.
        A ``movies_local_paths`` column is added to ``movies_dataframe``; rows
        whose image could not be saved get ``None``.

        Raises:
            KeyError: If ``movies_dataframe`` has no ``movie_image`` column
                (``get_movie_info`` has not produced any rows yet).
        """
        image_urls = list(self.movies_dataframe['movie_image'])
        # cv2.imwrite does not create missing directories.
        os.makedirs(self.IMAGE_DIR, exist_ok=True)
        movie_paths = []
        for image_num, url in enumerate(image_urls):
            path = self.IMAGE_DIR + '/' + 'movie_number_' + str(image_num) + '.jpg'
            movie_paths.append(path if self._save_image(url, path) else None)
        self.movies_dataframe["movies_local_paths"] = movie_paths

    def save_data(self):
        """Write ``movies_dataframe`` to ``imdb_data.json`` (4-space indent)."""
        self.movies_dataframe.to_json('imdb_data.json', indent=4)

        

        

In [8]:
# Build a scraper that fetches a single page of search results
scraper_object = IMDBScraper(page_num=1)

# Fill the dataframe with the details parsed from the fetched page
scraper_object.get_movie_info()
# Download the movie images and add their local paths to the dataframe
scraper_object.get_movie_images()
# Leave the dataframe as the last expression so the cell displays it
scraper_object.movies_dataframe


In [6]:
# Write the scraped dataframe to a JSON file via the scraper's save_data method.
# NOTE(review): this cell's execution count (6) is lower than the cells above it (7, 8),
# so it was last run out of order; re-run the notebook top to bottom to confirm.
scraper_object.save_data()