In [4]:
import pandas as pd


class Film:
    """Plain container for one movie record.

    Args:
        movie_id: Identifier of the movie (the notebook passes the ``movieId``
            column value).
        imdb_id: IMDb identifier (the notebook passes the ``imdbId`` column
            value).
        title: Movie title.
        release_year: Release year; stored exactly as given, no conversion.
        genres: Pipe-separated genre string such as ``"Action|Comedy"``.
            ``None``, a float (e.g. the NaN pandas uses for a missing cell)
            or an empty string yield an empty genre list. Any other
            non-string iterable is copied into a list.

    Attributes:
        movie_id, imdb_id, title, release_year: The constructor arguments,
            stored unchanged.
        genres: List of genre names.
    """

    def __init__(self, movie_id, imdb_id, title, release_year, genres):
        self.movie_id = movie_id
        self.imdb_id = imdb_id
        self.title = title
        self.release_year = release_year

        if isinstance(genres, str):
            # "".split("|") would give [""], which is not a real genre.
            self.genres = genres.split("|") if genres else []
        elif genres is None or isinstance(genres, float):
            # Missing value: None, or the float NaN pandas produces for
            # an empty CSV cell. Calling .split on it would raise.
            self.genres = []
        else:
            self.genres = list(genres)


# Load the two raw tables.
df_movies = pd.read_csv("./Data/raw/movies.csv")
df_links = pd.read_csv("./Data/raw/links.csv")

# Join them on movieId and discard the tmdbId column in one chained step.
merged_df = df_movies.merge(df_links, on="movieId").drop(columns=["tmdbId"])

# Keep a handle on the untouched titles so both derived columns are
# computed from the same input.
raw_titles = merged_df["title"]

# "year" holds the four digits of the first parenthesised "(YYYY)" group.
merged_df["year"] = raw_titles.str.extract(r"\((\d{4})\)")

# "title" has every "(YYYY)" group, plus surrounding whitespace, removed.
merged_df["title"] = raw_titles.str.replace(r"\s*\(\d{4}\)\s*", "", regex=True)

# Show five random rows as a quick sanity check.
print(merged_df.sample(5))

       movieId                    title                             genres  \
43318   164767      Princess of Thieves  Action|Adventure|Children|Romance   
19372   100570               U.S. Seals                    Action|Thriller   
40524   158767              Orgasm Inc.                        Documentary   
64162   209557                    Nazar                              Drama   
26778   127015  The Man Who Lived Again                      Horror|Sci-Fi   

        imdbId  year  
43318   272790  2001  
19372   195366  2000  
40524  1439562  2009  
64162   102515  1990  
26778    27938  1936  


In [5]:
# Draw five random rows from the cleaned frame.
subset_df = merged_df.sample(5)

# Turn each sampled row into a Film object.
films_list = [
    Film(
        record["movieId"],
        record["imdbId"],
        record["title"],
        record["year"],
        record["genres"],
    )
    for _, record in subset_df.iterrows()
]

# Print a one-line summary per film.
for film in films_list:
    print(f"Title: {film.title}, Year: {film.release_year}, Genres: {film.genres}")

Title: Black Clover: Sword of the Wizard King, Year: 2023, Genres: ['Action', 'Adventure', 'Animation', 'Fantasy']
Title: Killers, Year: 1997, Genres: ['Action', 'Horror', 'Thriller']
Title: Woman in the Dark, Year: 1934, Genres: ['Crime', 'Drama']
Title: ¡Three Amigos!, Year: 1986, Genres: ['Comedy', 'Western']
Title: The Last Mercenary, Year: 2021, Genres: ['Action', 'Comedy']


In [12]:
def _load_movie_catalog():
    """Read movies.csv and links.csv and return the merged, cleaned frame.

    The result has the ``tmdbId`` column removed, a ``year`` column holding
    the four digits of the first ``(YYYY)`` group found in the raw title, and
    a ``title`` column with every ``(YYYY)`` group stripped out.
    """
    movies = pd.read_csv("./Data/raw/movies.csv")
    links = pd.read_csv("./Data/raw/links.csv")

    catalog = movies.merge(links, on="movieId").drop(columns=["tmdbId"])

    raw_titles = catalog["title"]
    catalog["year"] = raw_titles.str.extract(r"\((\d{4})\)")
    catalog["title"] = raw_titles.str.replace(r"\s*\(\d{4}\)\s*", "", regex=True)
    return catalog


def get_films_by_title(title_query):
    """Return the films whose cleaned title contains ``title_query``.

    The match is a case-insensitive *literal* substring test, so characters
    such as ``(``, ``+`` or ``[`` in the query are matched as-is rather than
    being interpreted as regular-expression syntax.

    Args:
        title_query: Text to look for. ``None``, an empty string or a
            whitespace-only string short-circuits to an empty result
            without touching the CSV files.

    Returns:
        A list of ``Film`` objects, one per matching row, in frame order.

    Side effects:
        For a non-empty query, rebinds the module-level ``film_cache`` to the
        returned list. The early return for an empty query leaves
        ``film_cache`` untouched.
    """
    global film_cache

    # Nothing to search for: skip the file reads entirely.
    if not title_query or not title_query.strip():
        return []

    catalog = _load_movie_catalog()

    # regex=False: the query is plain text; a regex would raise re.error on
    # input like "C++ (" and silently mis-match on "." or "?".
    is_match = catalog["title"].str.contains(
        title_query, case=False, regex=False, na=False
    )

    films_list = [
        Film(row.movieId, row.imdbId, row.title, row.year, row.genres)
        for row in catalog[is_match].itertuples(index=False)
    ]

    # Keep the most recent result set available at module level.
    film_cache = films_list

    return films_list

# Example search: list every film whose title contains "the fast and".
fast_and_matches = get_films_by_title("the fast and")
for film in fast_and_matches:
    print(f"Title: {film.title}, Year: {film.release_year}, Genres: {film.genres}")

Title: Tom and Jerry: The Fast and the Furry, Year: 2005, Genres: ['Animation', 'Children', 'Comedy']
Title: The Fast and the Fierce, Year: 2017, Genres: ['Action', 'Thriller']
