In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [None]:
class Scraping:
  """Field extractors for the movie entries of one search-result page.

  ``main_content`` is an indexable sequence holding one parsed element per
  movie (e.g. the result of ``soup.find_all(class_="lister-item-content")``).
  Every ``get_*`` method takes the position of a movie in that sequence and
  returns the extracted text, or ``np.nan`` when the field cannot be read
  (missing element, index out of range, ...).
  """

  def __init__(self, main_content):
    self.main_content = main_content

  @staticmethod
  def _safe_extract(extractor):
    """Call ``extractor()`` and return its result, or ``np.nan`` if it raises."""
    try:
      return extractor()
    # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt and
    # SystemExit still propagate and a long scrape can be interrupted.
    except Exception:
      return np.nan

  def get_title(self, movie_index):
    """Return the text of the first link inside the ``lister-item-header`` element."""
    return self._safe_extract(
      lambda: self.main_content[movie_index]
      .find(class_="lister-item-header").find('a').get_text()
    )

  def get_release(self, movie_index):
    """Return the text of the movie entry's second ``<span>``."""
    return self._safe_extract(
      lambda: self.main_content[movie_index].find_all('span')[1].get_text()
    )

  def get_certificate(self, movie_index):
    """Return the text of the ``certificate`` element."""
    return self._safe_extract(
      lambda: self.main_content[movie_index].find(class_="certificate").get_text()
    )

  def get_duration(self, movie_index):
    """Return the text of the ``runtime`` element."""
    return self._safe_extract(
      lambda: self.main_content[movie_index].find(class_="runtime").get_text()
    )

  def get_categories(self, movie_index):
    """Return the ``genre`` text with newlines removed and outer whitespace stripped."""
    # Index by ``movie_index``: reading entry 0 here would give every movie
    # the genres of the first movie on the page.
    return self._safe_extract(
      lambda: self.main_content[movie_index]
      .find(class_="genre").get_text().replace("\n", "").strip()
    )

  def get_rate(self, movie_index):
    """Return the text of the ``<strong>`` inside the ``ratings-imdb-rating`` element."""
    return self._safe_extract(
      lambda: self.main_content[movie_index]
      .find(class_="ratings-imdb-rating").find('strong').get_text()
    )

  def get_metascore(self, movie_index):
    """Return the stripped text of the ``metascore`` element."""
    return self._safe_extract(
      lambda: self.main_content[movie_index].find(class_="metascore").get_text().strip()
    )

  def get_summary(self, movie_index):
    """Return the text of the third ``text-muted`` element, newlines removed and stripped."""
    return self._safe_extract(
      lambda: self.main_content[movie_index]
      .find_all(class_="text-muted")[2].get_text().replace("\n", "").strip()
    )

  def get_votes(self, movie_index):
    """Return the text of the second ``<span>`` in the first ``sort-num_votes-visible`` element."""
    return self._safe_extract(
      lambda: self.main_content[movie_index]
      .find_all(class_="sort-num_votes-visible")[0].find_all('span')[1].get_text()
    )

  def get_profit(self, movie_index):
    """Return the text of the fifth ``<span>`` in the first ``sort-num_votes-visible`` element."""
    return self._safe_extract(
      lambda: self.main_content[movie_index]
      .find_all(class_="sort-num_votes-visible")[0].find_all('span')[4].get_text()
    )


class IMDB_Scraping:
  """Collect IMDb search results into a pandas DataFrame.

  The search URL is fixed to feature films released between 2019-01-01 and
  2019-12-31. Result pages of ``page_length`` entries are requested until
  ``movies_quantity`` entries have been asked for.
  """

  # Seconds to wait for a response before ``requests`` raises, so a stalled
  # connection cannot block the notebook indefinitely.
  REQUEST_TIMEOUT = 30

  # Output columns, in order. Column ``x`` is filled by ``Scraping.get_x``.
  COLUMNS = (
    "title", "release", "certificate", "duration", "categories",
    "rate", "metascore", "summary", "votes", "profit",
  )

  def __init__(self, movies_quantity, page_length):
    self.page_length = page_length
    self.movies_quantity = movies_quantity
    # These per-field lists are not written by ``loop_pages`` (it returns a
    # DataFrame instead); they are kept so existing attribute access still works.
    self.title = []
    self.release = []
    self.certificate = []
    self.duration = []
    self.categories = []
    self.rate = []
    self.metascore = []
    self.summary = []
    self.votes = []
    self.profit = []

  def _page_url(self, start):
    """Build the search URL for the result page beginning at offset ``start``."""
    return (
      'https://www.imdb.com/search/title/?title_type=feature&release_date=2019-01-01,2019-12-31&count='
      + str(self.page_length) + '&start=' + str(start) + '&ref_=adv_nxt'
    )

  def loop_pages(self):
    """Download every result page and return one DataFrame row per movie found.

    Returns:
      pandas.DataFrame with the columns listed in ``COLUMNS``. Fields that
      could not be extracted hold whatever ``Scraping`` returns for them.

    Raises:
      Whatever ``requests.get`` raises, including a timeout after
      ``REQUEST_TIMEOUT`` seconds.
    """
    rows = []
    # NOTE(review): offsets start at 0, as before; confirm whether the site
    # expects a 1-based ``start`` parameter.
    for start in range(0, self.movies_quantity, self.page_length):
      page = requests.get(self._page_url(start), timeout=self.REQUEST_TIMEOUT)
      soup = BeautifulSoup(page.content, 'html.parser')
      entries = soup.find_all(class_="lister-item-content")
      site_data = Scraping(entries)

      # Read only entries that exist on this page, and never more than were
      # requested in total; otherwise short pages add rows of missing values.
      wanted = min(len(entries), self.movies_quantity - start)
      for movie_index in range(wanted):
        rows.append({
          column: getattr(site_data, "get_" + column)(movie_index)
          for column in self.COLUMNS
        })

    # Passing ``columns`` keeps the column set and order even when no rows were found.
    return pd.DataFrame(rows, columns=list(self.COLUMNS))

In [None]:
# Ask for 9000 search results, 250 per page, and gather them into a DataFrame.
result = IMDB_Scraping(movies_quantity=9000, page_length=250)
df = result.loop_pages()

In [None]:
# Last expression of the cell, so the notebook renders the scraped DataFrame.
df

Unnamed: 0,title,release,certificate,duration,categories,rate,metascore,summary,votes,profit
0,The Gentlemen,(2019),R,113 min,"Action, Comedy, Crime",7.9,51,An American expat tries to sell off his highly...,200177,
1,"Yes, God, Yes",(2019),R,78 min,"Action, Comedy, Crime",6.1,71,"After an innocent AOL chat turns racy, a Catho...",5085,
2,Knives Out,(2019),PG-13,130 min,"Action, Comedy, Crime",7.9,82,A detective investigates the death of a patria...,409844,$165.36M
3,Parasite,(2019),R,132 min,"Action, Comedy, Crime",8.6,96,Greed and class discrimination threaten the ne...,500637,$53.37M
4,Midsommar,(2019),R,148 min,"Action, Comedy, Crime",7.1,72,A couple travels to Sweden to visit a rural ho...,186818,$27.33M
...,...,...,...,...,...,...,...,...,...,...
8995,H2O,(2019),,135 min,,,,Worried about the water scarcity in his villag...,,
8996,Lu xing gong lue,(2019),,,,,,"In the beautiful Gulangyu Island, there is a s...",,
8997,Goodbye Tomorrow,(2019),,51 min,,,,A man searches desperately for answers as he w...,,
8998,Good Luck 2019,(2019),,58 min,,,,Add a Plot,,


In [None]:
# Save the scraped table to disk. ``index`` is left at its default, so the
# row index is written as well.
df.to_csv(path_or_buf="imdb_dataset.csv")