In [40]:
import pandas as pd
import os

In [41]:
class IMDb:
    def __init__(self):
        assert(os.path.exists("raw/title.ratings.tsv"))
        assert(os.path.exists("raw/title.basics.tsv"))
        
        ratings_df = pd.read_csv('raw/title.ratings.tsv', sep='\t')
        titles_df = pd.read_csv('raw/title.basics.tsv', sep='\t')
        
        self.data_df = pd.merge(titles_df, ratings_df, on='tconst')
        
    def get_movie_by_title(self, title: str):
        """returns a movie, given a title

        Args:
            title (str): title of the movie (capitalisation ignored)

        Returns:
            DataFrame: the movie. if multiple have the same name, the most reviewed one is returned
        """
        result = self.data_df[(data_df['primaryTitle'].str.lower() == title.lower()) & (self.data_df['titleType'] == 'movie')]
        if not result.empty:
            most_reviewed = result.sort_values(by='numVotes', ascending=False).iloc[0]
            return most_reviewed[['primaryTitle', 'startYear', 'averageRating', 'numVotes']]
        else:
            return None
        
    def get_top_rated_movies(self, number: int, min_votes: int, year=None):
        """get the top movies according to rating

        Args:
            number (int): the number of movies to return
            min_votes (int): the minimum number of votes
            year (int, optional): only return movies published during this year. Defaults to None.

        Returns:
            DataFrame: the top `number` movies, published in `year` with >= `min_votes` votes
        """
        movies_df = self.data_df[(data_df['titleType'] == 'movie') & (self.data_df['numVotes'] >= min_votes)] # filter
        
        if year is not None:
            movies_df = movies_df[movies_df["startYear"].astype(int) == year]
        
        sorted_movies = movies_df.sort_values(by='averageRating', ascending=False)
        
        return sorted_movies.head(number)[['primaryTitle', 'startYear', 'averageRating', 'numVotes']]

In [42]:
imdb = IMDb()

  titles_df = pd.read_csv('raw/title.basics.tsv', sep='\t')


In [43]:
imdb.get_movie_by_title("Interstellar")

primaryTitle     Interstellar
startYear                2014
averageRating             8.7
numVotes              2271208
Name: 411533, dtype: object

In [44]:
imdb.get_top_rated_movies(10, 100000, 2024)

Unnamed: 0,primaryTitle,startYear,averageRating,numVotes
756513,Dune: Part Two,2024,8.5,591800
1085636,The Wild Robot,2024,8.2,127389
618517,Wicked,2024,7.7,119378
1360679,Deadpool & Wolverine,2024,7.6,461256
944378,Inside Out 2,2024,7.6,202465
588013,Furiosa: A Mad Max Saga,2024,7.5,265608
1277845,Nosferatu,2024,7.4,131011
833616,The Substance,2024,7.3,230191
803816,Challengers,2024,7.1,141898
856124,Alien: Romulus,2024,7.1,221033
