In [4]:
# (a) Import every library the notebook uses, in one place.
import pandas as pd  # reads movies.csv and holds the search results
from sentence_transformers import SentenceTransformer  # embeds plots and queries
from sklearn.metrics.pairwise import cosine_similarity  # scores a query against plot embeddings
import numpy as np  # ranks the similarity scores (argsort)

# Confirmation message so a failed import is obvious when running cell by cell.
print("Libraries imported successfully.")


Libraries imported successfully.


In [6]:
# (b) Load the movies.csv dataset into a pandas DataFrame
# The path is relative, so the file is resolved against the kernel's current
# working directory. Later cells read the 'title' and 'plot' columns of `df`.
df = pd.read_csv('movies.csv')

# Display the first 5 rows to confirm it loaded correctly
# (head() returns at most 5 rows; a smaller dataset simply shows fewer).
print("Dataset loaded. Here are the first 5 rows:")
display(df.head())

Dataset loaded. Here are the first 5 rows:


Unnamed: 0,title,plot
0,Spy Movie,A spy navigates intrigue in Paris to stop a te...
1,Romance in Paris,A couple falls in love in Paris under romantic...
2,Action Flick,A high-octane chase through New York with expl...


In [8]:
# (c) Initialize the sentence transformer model 'all-MiniLM-L6-v2'
# `model` is reused by search_movies() to embed each query.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Create embeddings for the movie plots. This may take a minute.
# Plots are encoded in DataFrame row order, so plot_embeddings[i] belongs to
# df.iloc[i]; search_movies() relies on that when it maps similarity ranks
# back to rows. Re-run this cell whenever `df` is reloaded or changed.
print("Creating embeddings for all movie plots...")
plot_embeddings = model.encode(df['plot'].tolist(), show_progress_bar=True)

print("Embeddings created successfully.")

Creating embeddings for all movie plots...


Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.49it/s]

Embeddings created successfully.





In [10]:
def search_movies(query, top_n=5):
    """
    Searches for movies based on a query using semantic similarity.

    The query is embedded with the notebook-level `model` and compared by
    cosine similarity against the precomputed `plot_embeddings`. Matching rows
    are then looked up positionally in `df`, so `plot_embeddings[i]` must
    correspond to `df.iloc[i]`.

    Args:
        query: Free-text search string.
        top_n: Maximum number of movies to return. If it exceeds the number
            of movies, all movies are returned; 0 returns an empty result.

    Returns:
        A copy of the matching rows of `df`, ordered from most to least
        similar, with an added 'similarity_score' column.

    Raises:
        ValueError: If `top_n` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    query_embedding = model.encode([query])
    similarities = cosine_similarity(query_embedding, plot_embeddings)[0]

    # Sort descending and slice from the front. Slicing the tail with
    # `[-top_n:]` returns *every* row when top_n is 0, because `-0` is `0`
    # and `[0:]` is the whole array. The stable sort keeps tied scores in
    # their original row order.
    top_n_indices = np.argsort(-similarities, kind="stable")[:top_n]

    results_df = df.iloc[top_n_indices].copy()
    results_df['similarity_score'] = similarities[top_n_indices]

    return results_df

print("search_movies function defined.")

search_movies function defined.


In [12]:
# (e) Test the function with the query 'spy thriller in Paris'
test_query = 'spy thriller in Paris'

# Call the search function to get the top 5 results.
# top_n is an upper bound: a dataset with fewer than 5 movies yields fewer rows.
top_results = search_movies(test_query, top_n=5)

# Display the results.
# Only the title, plot and score columns are shown; rows are already ordered
# from most to least similar by search_movies().
print(f"Top 5 movie results for query: '{test_query}'")
display(top_results[['title', 'plot', 'similarity_score']])

Top 5 movie results for query: 'spy thriller in Paris'


Unnamed: 0,title,plot,similarity_score
0,Spy Movie,A spy navigates intrigue in Paris to stop a te...,0.769684
1,Romance in Paris,A couple falls in love in Paris under romantic...,0.38803
2,Action Flick,A high-octane chase through New York with expl...,0.256777
