In [19]:
# import packages
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Data loading & cleaning

In [20]:
def load_data(file):
    """Read a movie CSV and return its cleaned title/overview columns.

    Parameters
    ----------
    file : str or path-like or file-like
        Anything accepted by ``pandas.read_csv`` (e.g. "movies_sample.csv").
        The CSV must contain "original_title" and "overview" columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly the columns ["original_title", "overview"].
        Rows whose overview is missing or equal to the filler text
        "No overview found." are removed, and the remaining overviews are
        lowercased. The original row labels are kept (the index is not reset).

    Raises
    ------
    KeyError
        If either required column is absent from the CSV.
    """
    raw = pd.read_csv(file)

    # Keep only rows with a real overview: not null and not the filler text.
    overview = raw["overview"]
    has_overview = overview.notna() & (overview != "No overview found.")

    # Take an explicit copy so the column assignment below writes to an
    # independent frame instead of a filtered slice (which can emit
    # SettingWithCopyWarning on pandas versions without copy-on-write).
    movies = raw.loc[has_overview, ["original_title", "overview"]].copy()

    # Format the overview to lowercase
    movies["overview"] = movies["overview"].str.lower()

    return movies

# 2. Vectorization using TF-IDF

In [21]:
def vectorize_text(texts):
    """Fit a TF-IDF vectorizer on `texts` and return it with the fitted matrix.

    Parameters
    ----------
    texts : iterable of str
        The documents to vectorize (the notebook passes the movie overviews).

    Returns
    -------
    tuple
        ``(vectorizer, matrix)`` where `vectorizer` is the fitted
        ``TfidfVectorizer`` (built with English stop words removed) and
        `matrix` is the result of ``vectorizer.fit_transform(texts)``.
    """
    # English stop words are dropped so common filler words carry no weight.
    vectorizer = TfidfVectorizer(stop_words='english')

    # Learn the vocabulary and transform the documents in one pass.
    document_matrix = vectorizer.fit_transform(texts)

    return vectorizer, document_matrix

# 3. Calculate cosine similarity

In [22]:
def compute_similarity(user_input, tfidf, tfidf_matrix):
    """Score every row of `tfidf_matrix` against the user's description.

    Parameters
    ----------
    user_input : str
        Free-text description; it is lowercased before being vectorized.
    tfidf : object with a ``transform`` method
        The fitted vectorizer used to project `user_input`.
    tfidf_matrix : matrix
        The vectorized documents to compare against.

    Returns
    -------
    The result of ``cosine_similarity(query_vector, tfidf_matrix)``; callers
    in this notebook treat it as a 2-D array with a single row.
    """
    # A single-element list because the vectorizer works on collections of documents.
    query_vector = tfidf.transform([user_input.lower()])

    return cosine_similarity(query_vector, tfidf_matrix)

# 4. Print recommendation

In [23]:
def recommend_movies(user_input, movies, tfidf, tfidf_matrix, top_n=5):
    """Print the `top_n` movies whose overview best matches `user_input`.

    Parameters
    ----------
    user_input : str
        Free-text description of the desired movie.
    movies : pandas.DataFrame
        Frame with an "original_title" column. Its rows must be in the same
        order as the rows of `tfidf_matrix`, because results are looked up by
        position. The frame is NOT modified.
    tfidf : fitted vectorizer
        Passed through to ``compute_similarity``.
    tfidf_matrix : matrix
        Passed through to ``compute_similarity``.
    top_n : int, default 5
        Number of recommendations to print; must be at least 1. If it exceeds
        the number of movies, all movies are printed.

    Returns
    -------
    None
        The title and cosine similarity of each recommendation are printed,
        highest similarity first.

    Raises
    ------
    ValueError
        If `top_n` is smaller than 1 (a slice of ``[-0:]`` would otherwise
        silently select every row).
    """
    if top_n < 1:
        raise ValueError("top_n must be at least 1.")

    # One similarity score per movie, as a flat array.
    scores = compute_similarity(user_input, tfidf, tfidf_matrix).flatten()

    # Attach the scores to a new frame so the caller's `movies` keeps its
    # original columns no matter how many times this function is called.
    scored = movies.assign(cosine_similarity=scores)

    # Get the positions of the recommended movies: argsort is ascending, so
    # take the last `top_n` entries and reverse them for best-first order.
    top_indices = scores.argsort()[-top_n:][::-1]
    recommended_movies = scored.iloc[top_indices]

    # Print the recommended movies (title, cosine similarity)
    print(recommended_movies[["original_title", "cosine_similarity"]])

# 5. Parse user input

In [24]:
def parse_input():
    """Prompt for a movie description and a positive recommendation count.

    The description is returned exactly as typed. The count prompt repeats
    until the user enters an integer that is at least 1; non-integer and
    non-positive entries each print a short message before re-prompting.

    Returns
    -------
    tuple
        ``(description, n_top)`` with `description` a str and `n_top` an
        int greater than 0.
    """
    description = input("Please enter your movie description:")

    n_top = None
    while n_top is None:
        try:
            candidate = int(input("Please enter your number of movies to recommend:"))
        except ValueError:
            # Not parseable as an integer: ask again.
            print("Please enter an integer.")
            continue

        if candidate <= 0:
            print("The number needs to be at least 1.")
            continue

        n_top = candidate

    return description, n_top

# Display:

In [25]:
# Load the cleaned titles and overviews
movies = load_data("movies_sample.csv")

# Vectorizing the movie overview
tfidf, tfidf_matrix = vectorize_text(movies['overview'])

# Prompt the user to enter the movie description and number of movies to recommend
description, n_top = parse_input()

# Print movie recommendation, using the count the user asked for
# (previously the answer was collected but a hard-coded 5 was passed instead)
recommend_movies(description, movies, tfidf, tfidf_matrix, top_n=n_top)

Please enter your movie description: I like action movies set in space.
Please enter your number of movies to recommend: 5


         original_title  cosine_similarity
221           Space Cop           0.149704
478  Monsters vs Aliens           0.146310
71            Dreamboat           0.138510
211               疯狂的赛车           0.123857
251         Šakalí léta           0.106885
