In [2]:
import pandas as pd

# Build a small in-memory corpus of movie reviews for the semantic-search demo.
# The texts are a hand-written sample (not loaded from IMDb); consecutive entries
# appear to be paraphrase pairs, which gives every review one obvious near-duplicate.
# For a real dataset, load one from disk or via the Hugging Face Datasets library.
reviews = [
    "This movie was an absolute masterpiece! The visuals and story captivated me from start to finish.",
    "An unforgettable experience with stunning visuals and a captivating story. Truly a masterpiece!",
    "I didn't enjoy the movie; the plot was too predictable and lacked originality.",
    "The movie had a predictable plot and felt like it was missing originality. Not enjoyable for me.",
    "A thrilling adventure with twists and turns that kept me on the edge of my seat!",
    "What a rollercoaster! The thrilling plot twists kept me engaged till the very end.",
    "The acting was phenomenal, and every scene was well-executed.",
    "Impressive performances from the cast, with each scene executed perfectly.",
    "A waste of time – I couldn’t sit through the entire movie.",
    "I couldn’t finish the movie; it was such a waste of time.",
    "Great film! The character development was strong, and the soundtrack was amazing.",
    "Loved it! The characters grew so well, and the music complemented the story perfectly.",
    "Not the best film I’ve seen, but the cinematography was beautiful.",
    "While the story wasn’t compelling, I appreciated the beautiful cinematography.",
    "A fun and lighthearted film that’s perfect for a relaxing evening.",
    "An easy-going movie with a charming vibe, great for unwinding after a long day.",
    "Incredible storyline with twists I didn’t see coming!",
    "An unexpected plot that took me by surprise! Amazing story!",
    "The film left me feeling emotional and deeply moved.",
    "A powerful and emotional film that resonated with me.",
    "It was a boring and drawn-out film with no real climax.",
    "Slow-paced and uneventful, this movie dragged without a proper climax.",
    "A cinematic masterpiece that deserves all the praise it’s been getting.",
    "This film is a true work of art and deserves every bit of acclaim.",
    "The plot was confusing and hard to follow, even for an avid moviegoer.",
    "Even as someone who loves complex plots, I found this one confusing and hard to follow.",
    "The suspense was well-built, and the payoff was worth it.",
    "I loved the suspense in this movie, and the climax delivered exactly what I hoped for.",
    "The comedy was on point and had me laughing the entire time.",
    "It was hilarious! I was laughing throughout thanks to the spot-on comedy."
]

# One row per review, kept verbatim in the 'review' column.
df = pd.DataFrame(reviews, columns=['review'])
print(df.head())


                                              review
0  This movie was fantastic! I loved every moment...
1     An absolute masterpiece with stunning visuals.
2     Not my favorite movie, but it had its moments.
3  Terrible plot and acting. I wouldn't recommend...
4             Quite enjoyable with a good storyline.


In [3]:
import re

def preprocess(text):
    """Normalize a piece of text before it is embedded.

    The text is lowercased, dash-like separators and slashes are turned into
    spaces, every remaining character that is not an ASCII letter, digit or
    whitespace is dropped, and runs of whitespace are collapsed to a single
    space with leading/trailing whitespace trimmed.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string (possibly empty).

    Note:
        Non-ASCII letters (for example accented characters) are removed by
        the character filter, as are apostrophes ("wouldn't" -> "wouldnt").
    """
    text = text.lower()
    # Replace hyphens, Unicode dashes and slashes with a space *before* the
    # character filter; deleting them outright would fuse neighbouring words
    # ("well-executed" -> "wellexecuted").
    text = re.sub(r'[-\u2010-\u2015/]', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Removing characters can leave doubled or dangling spaces behind.
    return re.sub(r'\s+', ' ', text).strip()

# Store the normalized text next to the raw review, then preview the first rows
df['cleaned_review'] = df['review'].map(preprocess)
print(df['cleaned_review'].head())


0    this movie was fantastic i loved every moment ...
1        an absolute masterpiece with stunning visuals
2         not my favorite movie but it had its moments
3      terrible plot and acting i wouldnt recommend it
4                quite enjoyable with a good storyline
Name: cleaned_review, dtype: object


In [4]:
from sentence_transformers import SentenceTransformer

# Instantiate the 'paraphrase-MiniLM-L6-v2' Sentence-Transformers checkpoint
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Embed the whole cleaned corpus with a single encode() call
sbert_embeddings = sbert_model.encode(list(df['cleaned_review']))


  from tqdm.autonotebook import tqdm, trange
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [5]:
from transformers import DistilBertModel, DistilBertTokenizer
import torch

# Fetch the tokenizer and the encoder weights from the same
# 'distilbert-base-uncased' checkpoint so their vocabularies agree
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

def get_distilbert_embedding(text):
    """Embed one text with DistilBERT by averaging its token vectors.

    Args:
        text: The string to embed.

    Returns:
        A NumPy array holding the mean of the model's final hidden states
        over the token dimension, with size-1 dimensions squeezed away
        (a 1-D vector for a single input string).
    """
    # truncation=True caps the input at the tokenizer's configured maximum
    # length; without it an over-long text produces more positions than the
    # model supports and the forward pass raises.
    inputs = distilbert_tokenizer(text, return_tensors="pt", truncation=True)
    # Inference only: skip gradient tracking to save memory and time.
    with torch.no_grad():
        outputs = distilbert_model(**inputs)
    # Unmasked mean over the token axis; this is exact here because a single
    # string is never padded.
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Embed the cleaned reviews one at a time, keeping one vector per review in order
distilbert_embeddings = list(map(get_distilbert_embedding, df['cleaned_review']))


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [6]:
from transformers import AutoModel, AutoTokenizer

# Instantiate the 'all-MiniLM-L6-v2' Sentence-Transformers checkpoint
minilm_model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed the whole cleaned corpus with a single encode() call
minilm_embeddings = minilm_model.encode(list(df['cleaned_review']))


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

def semantic_search(query, embeddings, model, reviews):
    """Print the review whose embedding is most similar to a query.

    Args:
        query: Free-text search string. It is normalized with ``preprocess``
            before being embedded; the text as given is what gets printed.
        embeddings: Precomputed review embeddings, one per entry of
            ``reviews`` and in the same order (a 2-D array or a list of
            1-D vectors).
        model: Encoder used for the query. ``distilbert_model`` is routed
            through ``get_distilbert_embedding``; any other object must
            expose ``encode(list_of_str)``.
        reviews: pandas Series of review texts, addressed by position.

    Returns:
        None. The query, the best-matching review and its cosine similarity
        are written to stdout.

    Raises:
        ValueError: If ``embeddings`` is empty or its length differs from
            ``reviews`` (typically stale notebook state after editing the
            corpus without re-running the embedding cells).
    """
    if len(embeddings) == 0:
        raise ValueError("embeddings is empty; nothing to search")
    if len(embeddings) != len(reviews):
        raise ValueError(
            f"embeddings ({len(embeddings)}) and reviews ({len(reviews)}) "
            "have different lengths; re-run the embedding cell"
        )

    # Keep the caller's text intact for the report; only the embedded copy
    # goes through the same normalization as the indexed reviews.
    cleaned_query = preprocess(query)

    # Identity check: the raw DistilBERT model has no encode() method, so it
    # needs the dedicated helper; reshape makes its 1-D vector a (1, dim) row.
    if model is distilbert_model:
        query_embedding = get_distilbert_embedding(cleaned_query).reshape(1, -1)
    else:
        query_embedding = model.encode([cleaned_query])

    # One similarity per review, flattened from the (1, n_reviews) result
    similarities = cosine_similarity(query_embedding, embeddings).flatten()

    # Position of the best match, then its text and score
    most_similar_idx = similarities.argmax()
    similar_review = reviews.iloc[most_similar_idx]
    similarity_score = similarities[most_similar_idx]

    print(f"Query: {query}")
    print(f"Most Similar Review: {similar_review}")
    print(f"Similarity Score: {similarity_score:.4f}\n")

# Demo: run one query against each of the three embedding indexes

# SBERT (paraphrase-MiniLM-L6-v2)
query = "A masterpiece with stunning visuals and a gripping story."
semantic_search(
    query=query,
    embeddings=sbert_embeddings,
    model=sbert_model,
    reviews=df['review'],
)

# DistilBERT (mean-pooled hidden states)
query = "The plot was confusing and hard to understand."
semantic_search(
    query=query,
    embeddings=distilbert_embeddings,
    model=distilbert_model,
    reviews=df['review'],
)

# MiniLM (all-MiniLM-L6-v2)
query = "I couldn't stop laughing; the comedy was spot-on."
semantic_search(
    query=query,
    embeddings=minilm_embeddings,
    model=minilm_model,
    reviews=df['review'],
)
