In [5]:
import random
from bs4 import BeautifulSoup
import requests
import random
import re
from itertools import chain


In [6]:
# Starting page for the scrape: an IMDb title search whose query string selects
# the "top_250" group and sorts by user rating.
url = "https://www.imdb.com/search/title/?groups=top_250&sort=user_rating"


In [7]:
def get_year(bs4_expression):
    """Return the first run of digits found in a piece of text, as an int.

    Parameters
    ----------
    bs4_expression : str
        Text to search, e.g. ``"(1994)"`` or ``"(I) (2019)"``.

    Returns
    -------
    int
        The first group of consecutive ASCII digits in ``bs4_expression``.

    Raises
    ------
    ValueError
        If ``bs4_expression`` contains no digits.
    """
    pattern = r"[0-9]+"
    match = re.search(pattern, bs4_expression)
    if match is None:
        # re.search returns None when nothing matches; report the offending
        # text instead of failing later with an opaque AttributeError.
        raise ValueError("no year digits found in %r" % (bs4_expression,))
    return int(match.group())


In [8]:
class Soup:
    """One downloaded and parsed listing page plus helpers that extract movie fields.

    Every extraction method returns a plain list built from ``self.soup``.
    The lists are matched up by position elsewhere in the notebook, so each
    method is expected to yield one entry per movie on the page.
    """

    # Seconds to wait for the server. Without a timeout, requests.get can
    # block indefinitely on an unresponsive host.
    DEFAULT_TIMEOUT = 30

    def __init__(self, url, timeout=DEFAULT_TIMEOUT):
        """Download ``url`` and keep the parsed document.

        Parameters
        ----------
        url : str
            Address of the page to fetch.
        timeout : float, optional
            Seconds to wait for the HTTP response.
        """
        # Parsed document that all extraction methods read from.
        self.soup = self.get_soup(url, timeout)

    def get_soup(self, url, timeout=DEFAULT_TIMEOUT):
        """Fetch ``url`` and parse the response body with ``html.parser``.

        Parameters
        ----------
        url : str
            Address of the page to fetch.
        timeout : float, optional
            Seconds to wait for the HTTP response.

        Returns
        -------
        BeautifulSoup
            The parsed document.

        Raises
        ------
        requests.HTTPError
            If the server answers with a 4xx/5xx status.
        """
        response = requests.get(url, timeout=timeout)
        # Fail here on an error status rather than parsing an error page and
        # crashing later in an extraction method with a confusing IndexError.
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")

    def get_movie_names_from_page(self):
        """Return the ``alt`` text of every ``<img>`` element in the document.

        NOTE(review): this assumes every image on the page is a movie poster
        whose alt text is the title -- confirm against the page markup.
        """
        return [image["alt"] for image in self.soup.find_all("img")]

    def get_all_directors_of_page(self):
        """Return the text of the first link inside each ``<p class="">`` element.

        NOTE(review): presumably the first link in each such paragraph is the
        director -- confirm against the page markup.
        """
        credit_paragraphs = self.soup.find_all("p", class_="")
        return [paragraph.find_all("a")[0].text
                for paragraph in credit_paragraphs]

    def get_all_ratings_of_page(self):
        """Return the text of each ``<strong>`` element after the first two, as floats.

        NOTE(review): the first two ``<strong>`` elements are skipped,
        presumably because they are not ratings -- confirm.
        """
        rating_tags = self.soup.find_all("strong")[2:]
        return [float(tag.text) for tag in rating_tags]

    def get_all_initial_release_dates_of_page(self):
        """Return the year parsed by ``get_year`` from each year ``<span>`` on the page."""
        year_tags = self.soup.find_all(
            "span", class_="lister-item-year text-muted unbold")
        return [get_year(tag.text) for tag in year_tags]

    def get_all_brief_summaries_of_page(self):
        """Return the left-stripped text of every second ``<p class="text-muted">``.

        Only the paragraphs at odd positions (1, 3, 5, ...) are used.
        NOTE(review): presumably the even positions hold other metadata --
        confirm against the page markup.
        """
        muted_paragraphs = self.soup.find_all("p", class_="text-muted")
        return [paragraph.text.lstrip()
                for paragraph in muted_paragraphs[1::2]]




class All_Soup ():
    """A run of consecutive listing pages, reached by following next-page links.

    The ``get_all_*`` methods call the matching per-page method on every
    page and join the per-page lists into one flat list, in page order.
    """

    # Site root that the relative "next page" href is appended to.
    BASE_URL = "https://www.imdb.com"
    # Number of pages fetched when the caller does not specify one.
    DEFAULT_PAGE_COUNT = 5

    def __init__(self, url, page_count=DEFAULT_PAGE_COUNT):
        """Fetch the page at ``url`` and the pages that follow it.

        Parameters
        ----------
        url : str
            Address of the first listing page.
        page_count : int, optional
            Total number of pages to collect, counting the first one.
        """
        # First page; the remaining pages are found from its next-page link.
        self.initial_Soup = Soup(url)
        # All collected pages in order, starting with ``initial_Soup``.
        self.all_Soups = self.get_all_movie_soups(page_count)

    def get_all_movie_soups(self, page_count=DEFAULT_PAGE_COUNT):
        """Return ``page_count`` pages, starting from ``self.initial_Soup``.

        Each further page is fetched from the next-page link of the one
        before it.

        Raises
        ------
        ValueError
            If ``page_count`` is smaller than 1.
        IndexError
            If a page that still needs a successor has no next-page link.
        """
        if page_count < 1:
            raise ValueError(
                "page_count must be at least 1, got %r" % (page_count,))

        all_soups = [self.initial_Soup]
        while len(all_soups) < page_count:
            # The link to page N+1 only exists in the markup of page N.
            all_soups.append(self.get_next_page_soup(all_soups[-1].soup))
        return all_soups

    def make_next_page_url(self, soup):
        """Build the absolute URL of the page after ``soup``.

        Parameters
        ----------
        soup : BeautifulSoup
            Parsed document of the current page.

        Raises
        ------
        IndexError
            If the document contains no next-page link.
        """
        next_links = soup.find_all(
            "a",  class_="lister-page-next next-page")
        if not next_links:
            # Same exception type as indexing an empty list, but with a
            # message that says what is actually missing.
            raise IndexError("no next-page link found in the document")
        return self.BASE_URL + next_links[0]["href"]

    def get_next_page_soup(self, soup):
        """Fetch and wrap the page that follows ``soup``."""
        return Soup(self.make_next_page_url(soup))

    @staticmethod
    def _flatten(per_page_lists):
        """Join an iterable of lists into one flat list, preserving order."""
        return list(chain.from_iterable(per_page_lists))

    def get_all_movie_names(self):
        """Return the movie names of every page as one flat list."""
        return self._flatten(
            page.get_movie_names_from_page() for page in self.all_Soups)

    def get_all_directors(self):
        """Return the directors of every page as one flat list."""
        return self._flatten(
            page.get_all_directors_of_page() for page in self.all_Soups)

    def get_all_ratings(self):
        """Return the ratings of every page as one flat list."""
        return self._flatten(
            page.get_all_ratings_of_page() for page in self.all_Soups)

    def get_all_initial_release_dates(self):
        """Return the release years of every page as one flat list."""
        return self._flatten(
            page.get_all_initial_release_dates_of_page()
            for page in self.all_Soups)

    def get_all_brief_summary(self):
        """Return the brief summaries of every page as one flat list."""
        return self._flatten(
            page.get_all_brief_summaries_of_page() for page in self.all_Soups)


In [9]:
# Building All_Soup downloads the starting page and then follows its
# next-page links, so this cell performs network requests.
all_soups = All_Soup(url)


In [10]:
def get_all_attributes(all_soups):
    """Collect every scraped column into a single dictionary.

    Parameters
    ----------
    all_soups : All_Soup
        Object exposing the ``get_all_*`` accessor methods.

    Returns
    -------
    dict
        Maps "Ratings", "Names", "Directors", "Release Dates" and
        "Brief Summary" to the list returned by the matching accessor.
    """
    # A dict literal evaluates its values top to bottom, so the accessors
    # are called in the order they are listed here.
    return {
        "Ratings": all_soups.get_all_ratings(),
        "Names": all_soups.get_all_movie_names(),
        "Directors": all_soups.get_all_directors(),
        "Release Dates": all_soups.get_all_initial_release_dates(),
        "Brief Summary": all_soups.get_all_brief_summary(),
    }

def get_movies_after_a_particular_year(all_attributes, year):
    """Return the names of movies released strictly after ``year``.

    Parameters
    ----------
    all_attributes : dict
        Must contain the parallel lists ``"Names"`` and ``"Release Dates"``.
    year : int
        Exclusive lower bound; a movie released in ``year`` is not included.

    Returns
    -------
    list
        Matching names, in their original order.
    """
    names = all_attributes["Names"]
    release_dates = all_attributes["Release Dates"]
    # Pair the columns instead of indexing a fixed 0..249 range, so the
    # function works for however many movies were actually scraped.
    return [name
            for name, released in zip(names, release_dates)
            if int(released) > year]

def print_random_movie_attributes(all_attributes):
    """Print name, rating, director, release date and summary of one random movie.

    Parameters
    ----------
    all_attributes : dict
        Must contain the parallel lists "Names", "Ratings", "Directors",
        "Release Dates" and "Brief Summary".

    Raises
    ------
    ValueError
        If any of the five lists is empty (there is nothing to pick from).
    """
    columns = ("Names", "Ratings", "Directors", "Release Dates", "Brief Summary")
    # Draw the index from the shortest column rather than a fixed 0..249
    # range, so it is valid for every list regardless of how many movies
    # were scraped.
    movie_count = min(len(all_attributes[column]) for column in columns)
    index = random.randrange(movie_count)

    print("Name:",all_attributes["Names"][index])
    print("Rating:",all_attributes["Ratings"][index])
    print("Director:",all_attributes["Directors"][index])
    print("Release Date:",all_attributes["Release Dates"][index])
    print("Brief Summary:",all_attributes["Brief Summary"][index])

In [11]:
# Gather every scraped column once, then print one randomly chosen movie
# as a quick sanity check of the scrape.
all_attributes = get_all_attributes(all_soups)
print_random_movie_attributes(all_attributes)


Name: Citizen Kane
Rating: 8.3
Director: Orson Welles
Release Date: 1941
Brief Summary: Following the death of publishing tycoon Charles Foster Kane, reporters scramble to uncover the meaning of his final utterance: 'Rosebud.'


In [12]:
def get_every_movies_data(all_attributes):
    """Turn the column-wise attribute lists into one row tuple per movie.

    Parameters
    ----------
    all_attributes : dict
        Must contain the parallel lists "Names", "Ratings", "Directors",
        "Release Dates" and "Brief Summary".

    Returns
    -------
    list of tuple
        ``(id, name, rating, director, release_date, summary)`` for each
        movie, where ``id`` counts up from 1 in list order.
    """
    # Zip the columns rather than indexing a fixed 0..249 range, so the
    # number of rows follows the data that was actually scraped.
    rows = zip(
        all_attributes["Names"],
        all_attributes["Ratings"],
        all_attributes["Directors"],
        all_attributes["Release Dates"],
        all_attributes["Brief Summary"],
    )
    return [(movie_id,) + row for movie_id, row in enumerate(rows, start=1)]

In [13]:
# One (id, name, rating, director, release_date, summary) tuple per movie.
movies_data = get_every_movies_data(all_attributes)

In [14]:
# Check the Python type of the first stored release date.
type(all_attributes["Release Dates"][0])

int

In [19]:
# Names of movies whose release year is strictly greater than 2020.
movies_after_2020 = get_movies_after_a_particular_year(all_attributes, 2020)


In [16]:
# List the column names available in all_attributes.
all_attributes.keys()

dict_keys(['Ratings', 'Names', 'Directors', 'Release Dates', 'Brief Summary'])

In [20]:
# Count how many titles passed the year filter.
len(movies_after_2020)

5

In [18]:
# Pick one of the filtered titles at random.
random.choice(movies_after_2020)

'Hamilton'

In [40]:
import sqlite3

# Connect to the database (the file is created if it does not exist yet)
conn = sqlite3.connect('Top250Movies.db')
try:
    cursor = conn.cursor()

    # Create a table to store the data.
    # IF NOT EXISTS keeps this cell re-runnable: a plain CREATE TABLE raises
    # "table already exists" the second time the cell is executed.
    cursor.execute('''CREATE TABLE IF NOT EXISTS top250movies
                    (id INTEGER PRIMARY KEY,
                     name TEXT,
                     rating REAL,
                     director TEXT,
                     release_date INTEGER,
                     summary TEXT)''')

    # Insert the data into the table.
    # OR REPLACE overwrites a row whose id is already present, so a re-run
    # refreshes the rows instead of failing on the primary-key constraint.
    data = movies_data
    cursor.executemany(
        'INSERT OR REPLACE INTO top250movies VALUES (?, ?, ?, ?, ?, ?)', data)

    # Save the changes
    conn.commit()
finally:
    # Close the connection even when one of the statements above raised,
    # so the database file is not left open by the kernel.
    conn.close()
