In [None]:
# Mount Google Drive at /content/gdrive (only works inside Google Colab).
from google.colab import drive

drive.mount(
    mountpoint="/content/gdrive",
    force_remount=True,
)

Mounted at /content/gdrive


In [None]:
# import library
import bs4
import requests
import time
import random as ran
import sys
import pandas as pd

In [None]:
# function for scraping one movie's attributes from its result block
def _extract_or_none(extract):
    """Return ``extract()``, or ``None`` when it raises an ordinary exception."""
    try:
        return extract()
    except Exception:
        # A missing tag shows up as AttributeError/IndexError/TypeError and an
        # unparsable number as ValueError; all of them mean "field unavailable".
        # KeyboardInterrupt/SystemExit are deliberately NOT swallowed.
        return None


def scrape_mblock(movie_block):
    """Extract one movie's fields from a single search-result block.

    Args:
        movie_block: Parsed element for one movie; it is queried through
            ``find`` / ``findAll`` (a BeautifulSoup tag in this notebook).

    Returns:
        dict with, in this order, the keys ``name``, ``year``, ``director``,
        ``rating``, ``m_score``, ``vote`` and ``gross`` (each ``None`` when it
        cannot be extracted), followed by one 0/1 flag per genre. ``None-type``
        is set to 1 when no genre element is found.
    """
    movieb_data = {}

    # Name of the movie: text of the first link in the block.
    movieb_data['name'] = _extract_or_none(
        lambda: movie_block.find('a').get_text())

    # Release year: [1:-1] drops the first and last character of the text
    # (presumably the surrounding parentheses — confirm against the page).
    movieb_data['year'] = _extract_or_none(
        lambda: str(movie_block.find('span', {'class': 'lister-item-year'}).contents[0][1:-1]))

    # Director: first link inside the third <p> of the block.
    movieb_data['director'] = _extract_or_none(
        lambda: str(movie_block.findAll('p')[2].find('a').get_text()))

    # Rating, read from the rating div's data-value attribute.
    movieb_data['rating'] = _extract_or_none(
        lambda: float(movie_block.find('div', {'class': 'inline-block ratings-imdb-rating'}).get('data-value')))

    # Meta score: text of the first span inside the metascore div.
    movieb_data['m_score'] = _extract_or_none(
        lambda: float(movie_block.find('div', {'class': 'inline-block ratings-metascore'}).find('span').text.strip()))

    # Votes and gross are the first and second span with name="nv".
    # They are parsed independently so that a block without a gross figure
    # still keeps its vote count.
    nv_values = _extract_or_none(lambda: [
        span.get('data-value')
        for span in movie_block.find('p', {'class': 'sort-num_votes-visible'}).findAll('span')
        if span.get('name') == "nv"
    ]) or []
    for position, field in enumerate(('vote', 'gross')):
        movieb_data[field] = _extract_or_none(
            lambda: int(nv_values[position].replace(",", "")))

    # One 0/1 flag per genre, all starting at 0.
    # NOTE(review): 'Film-nor', 'Game-show' and 'Talkshow' look like
    # misspellings of 'Film-Noir', 'Game-Show' and 'Talk-Show'. If the site
    # uses the latter, such a movie adds a new key instead of setting these
    # flags. The keys are left untouched because they become output columns —
    # confirm before renaming.
    genres = {
        'Action': 0,
        'Adventure': 0,
        'Animation': 0,
        'Biography': 0,
        'Comedy': 0,
        'Crime': 0,
        'Documentary': 0,
        'Drama': 0,
        'Family': 0,
        'Fantasy': 0,
        'Film-nor': 0,
        'Game-show': 0,
        'History': 0,
        'Horror': 0,
        'Music': 0,
        'Musical': 0,
        'Mystery': 0,
        'News': 0,
        'Reality-TV': 0,
        'Romance': 0,
        'Sci-Fi': 0,
        'Sport': 0,
        'Talkshow': 0,
        'Thriller': 0,
        'War': 0,
        'Western': 0,
        'None-type': 0,
    }
    movieb_data.update(genres)

    try:
        genres_block = str(movie_block.find('span', {'class': 'genre'}).text)
        # The genre text is a ", "-separated list padded with whitespace.
        for g in genres_block.split(', '):
            movieb_data[g.strip()] = 1
    except Exception:
        movieb_data['None-type'] = 1

    return movieb_data

In [None]:
# function for scraping all movie blocks within a single search result page.
def scrape_m_page(movie_blocks):
    """Apply ``scrape_mblock`` to every block of one result page.

    Args:
        movie_blocks: Sequence of movie blocks found on the page.

    Returns:
        list with one ``scrape_mblock`` result per block, in page order.
    """
    return [scrape_mblock(single_block) for single_block in movie_blocks]

In [None]:
# function to scrape movie data page by page until the target count is reached.
def scrape_this(link, t_count, timeout=30):
    """Scrape consecutive result pages until ``t_count`` results are covered.

    Args:
        link: Search URL to which the 1-based offset of the first wanted
            result is appended for every request.
        t_count: Number of results to cover. Whole pages are appended, so the
            returned list can be longer than ``t_count``.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        list of per-movie dicts (as returned by ``scrape_m_page``) in page
        order. The list is shorter than ``t_count`` when scraping stops early:
        a failed request, a page without a readable result range, or a page
        whose range does not advance.
    """
    target = t_count
    remaining_mcount = target
    next_start = 1
    movie_data = []

    while remaining_mcount > 0:
        url = link + str(next_start)
        try:
            # Without a timeout a stalled connection would hang the run forever.
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as err:
            print("\n" + "request failed for " + url + ": " + str(err))
            break

        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        movie_blocks = soup.findAll('div', {'class': 'lister-item-content'})

        try:
            # The second child of the nav description holds the range text;
            # it is split on "-" and on the first space to get start and end.
            range_text = soup.find("div", {"class": "nav"}).find("div", {"class": "desc"}).contents[1].get_text()
            bounds = range_text.split("-")
            # Commas are stripped so values such as "1,001" parse as integers.
            current_mcount_start = int(bounds[0].replace(',', ''))
            current_mcount_end = int(bounds[1].split(" ")[0].replace(',', ''))
        except (AttributeError, IndexError, TypeError, ValueError):
            # No readable range (e.g. past the last result or an unexpected
            # page): keep whatever this page contained and stop.
            movie_data.extend(scrape_m_page(movie_blocks))
            print("\n" + "no result range found at " + url + "; stopping early")
            break

        if current_mcount_end < next_start:
            # The page did not advance past what was already requested;
            # continuing would loop forever and duplicate rows.
            print("\n" + "result range did not advance at " + url + "; stopping early")
            break

        movie_data.extend(scrape_m_page(movie_blocks))

        remaining_mcount = target - current_mcount_end
        print('\r' + "currently scraping movies from: " + str(current_mcount_start) + " - "+str(current_mcount_end), "| remaining count: " + str(remaining_mcount), flush=True, end ="")
        next_start = current_mcount_end + 1
        # Random pause between requests.
        time.sleep(ran.randint(0, 10))

    return movie_data

In [None]:
# scraping the top movies from 1991 to 2020 on the list
# Number of results requested for every year (the same for all years).
top_movies = 500
films = []

# One search per release year, 1991 through 2020 inclusive, sorted by US
# box-office gross (descending) according to the query string.
for year in range(1991, 2021):
  print("year : ", year)
  base_scraping_link = ("https://www.imdb.com/search/title/?release_date=" + str(year) + "-01-01,"
                        + str(year) + "-12-31&title_type=feature&sort=boxoffice_gross_us,desc&start=")
  year_film = scrape_this(base_scraping_link, top_movies)
  films.extend(year_film)
  print('\r'+"List of top " + str(top_movies) +" movies:" + "\n", end="\n")

# One row per scraped movie, built once from the accumulated dicts.
df = pd.DataFrame(films)

print(df)

In [None]:
# Write the scraped table to a CSV file. The path is relative, so the file is
# created in the notebook's current working directory.
# NOTE(review): Drive is mounted at /content/gdrive but this path does not
# point into it — confirm the intended destination.
df.to_csv(path_or_buf='movies_1991_2020.csv')