In [3]:
#Install and import libraries
!pip install sentence-transformers pandas scikit-learn

# Import the libraries
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np



In [4]:
# Load the movie dataset
movies_df = pd.read_csv("movies.csv")

# Fail fast if the columns used by the later cells are missing
missing_columns = {"title", "plot"} - set(movies_df.columns)
assert not missing_columns, f"movies.csv is missing required columns: {sorted(missing_columns)}"

# Display the first few rows
movies_df.head()


Unnamed: 0,title,plot
0,Spy Movie,A spy navigates intrigue in Paris to stop a te...
1,Romance in Paris,A couple falls in love in Paris under romantic...
2,Action Flick,A high-octane chase through New York with expl...


In [11]:
#Create embeddings using all-MiniLM-L6-v2
# Load the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode movie plots into numerical embeddings.
# Missing plots are replaced with "" first; otherwise astype(str) would turn
# NaN into the literal text "nan" and that word would be embedded instead.
embeddings = model.encode(
    movies_df['plot'].fillna("").astype(str).tolist(),
    show_progress_bar=True
)

# Row i of `embeddings` must describe row i of `movies_df`
assert embeddings.shape[0] == len(movies_df), "Expected one embedding per movie."
print("Embeddings shape:", embeddings.shape)

Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12.53it/s]

Embeddings shape: (3, 384)





In [14]:
#Implement search_movies(query, top_n)
def search_movies(query, top_n=10):
    """
    Return the movies whose plots are most similar to a free-text query.

    The query is embedded with the notebook-level ``model`` and compared, via
    cosine similarity, against the notebook-level ``embeddings``; the matching
    rows are then looked up positionally in ``movies_df``.

    Parameters
    ----------
    query : str
        Non-empty search text.
    top_n : int, default 10
        Maximum number of movies to return. Values larger than the number of
        movies return every movie; 0 returns an empty frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``plot`` and ``similarity``, ordered by descending
        similarity, with a fresh 0-based index.

    Raises
    ------
    ValueError
        If ``query`` is not a non-empty string, or ``top_n`` is not a
        non-negative integer.
    RuntimeError
        If ``embeddings`` and ``movies_df`` differ in length (e.g. the data
        was reloaded without re-running the embedding cell).
    """
    if not isinstance(query, str) or query.strip() == "":
        raise ValueError("Query must be a non-empty string.")

    # A negative top_n would silently drop rows from the END of the ranking
    # via the slice below, and a float would fail with an opaque slice error.
    # bool is a subclass of int, so it is rejected explicitly.
    if (
        isinstance(top_n, bool)
        or not isinstance(top_n, (int, np.integer))
        or top_n < 0
    ):
        raise ValueError("top_n must be a non-negative integer.")

    # Guard against stale notebook state: positional lookup below is only
    # meaningful when there is exactly one embedding per movie row.
    if len(embeddings) != len(movies_df):
        raise RuntimeError(
            "embeddings and movies_df are out of sync; re-run the embedding cell."
        )

    query_embedding = model.encode([query])

    similarities = cosine_similarity(query_embedding, embeddings)[0]

    # Stable sort on the negated scores: best match first, and movies with
    # equal scores keep their original dataset order (deterministic output).
    top_indices = np.argsort(-similarities, kind="stable")[:top_n]

    results = movies_df.iloc[top_indices].copy()
    results['similarity'] = similarities[top_indices]

    return results[['title', 'plot', 'similarity']].reset_index(drop=True)


In [13]:
# Smoke-test the search with the query "spy thriller in Paris"
search_movies(query="spy thriller in Paris", top_n=5)

Unnamed: 0,title,plot,similarity
0,Spy Movie,A spy navigates intrigue in Paris to stop a te...,0.769684
1,Romance in Paris,A couple falls in love in Paris under romantic...,0.38803
2,Action Flick,A high-octane chase through New York with expl...,0.256777
