# 1. Imports and Environment Setup

In [36]:
import ast
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import re
import torch
import numpy as np
from numpy import dot
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer,
    pipeline
)
from rapidfuzz import process, fuzz
from datasets import Dataset, DatasetDict
from sklearn.metrics import f1_score
import ipywidgets as widgets
from IPython.display import display, HTML
import logging
# Silence every log record at WARNING level and below, process-wide, for all loggers.
logging.disable(logging.WARNING)

# 2. Data Loading and Preparation

In [39]:
# Load the movie table; the path is relative to the kernel's working directory.
df = pd.read_csv("Movies.csv")
def parse_genre_list(genre_string):
    """Turn one raw ``genres`` cell into a Python list.

    Parameters
    ----------
    genre_string : str | list | tuple | None | float
        Normally the textual form of a Python list literal, e.g.
        ``"['Action', 'Drama']"``. Values that are already a list/tuple are
        passed through (so re-applying the function is harmless), and
        missing values (``None``, NaN, blank strings) yield an empty list.

    Returns
    -------
    The literal parsed by ``ast.literal_eval`` for non-blank strings, a list
    copy for list/tuple input, and ``[]`` for missing input.

    Raises
    ------
    ValueError, SyntaxError
        If a non-blank string is not a valid Python literal. This is left to
        propagate on purpose so malformed data is noticed.
    """
    # Already parsed (e.g. the cell was applied twice): just normalise to a list.
    if isinstance(genre_string, (list, tuple)):
        return list(genre_string)
    # Missing cells come through as None/NaN, which literal_eval cannot parse.
    if not isinstance(genre_string, str) or not genre_string.strip():
        return []
    return ast.literal_eval(genre_string)

# Replace each raw "genres" cell with the value returned by parse_genre_list.
df["genres"] = df["genres"].apply(parse_genre_list)

In [41]:
def make_movie_text(row):
    """Build the text that represents one movie: title and overview joined by a space.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys ``"title"`` and ``"overview"``.

    Returns
    -------
    str
        ``"<title> <overview>"`` when both are present. A missing field
        (``None``/NaN) is skipped instead of raising ``TypeError`` on
        ``str + float``; non-string values are converted with ``str``.
    """
    parts = []
    for field in ("title", "overview"):
        value = row[field]
        if isinstance(value, str):
            parts.append(value)
        elif not pd.isna(value):
            # Non-string but present (e.g. a numeric title): keep its text form.
            parts.append(str(value))
    return " ".join(parts)

# Row-wise (axis=1): build one combined text string per movie.
df["combined_text"] = df.apply(make_movie_text, axis=1)

# 3. Generating Sentence Embeddings
We use the SentenceTransformer model "all-MiniLM-L6-v2" to convert each movie's combined text (title + overview) into a vector representation, which allows us to perform semantic searches.


In [44]:
# Load the pretrained sentence-embedding model by its hub identifier.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Encode every movie's combined text in a single call, showing a progress bar.
embeddings = embedding_model.encode(df["combined_text"].tolist(), show_progress_bar=True)
# list(...) splits the result into one entry per movie so each row holds its own embedding.
df["embedding"] = list(embeddings)


Batches:   0%|          | 0/267 [00:00<?, ?it/s]

# 4. Summarization Pipeline
We set up a Hugging Face summarization pipeline using the "distilbart-cnn-12-6" model. 
- `summarize_overview` returns an overview of fewer than 30 words as is, cut to at most 150 characters followed by "..."; longer overviews are summarized by the model.


In [47]:
# Summarization pipeline used by summarize_overview. Model and tokenizer come
# from the same checkpoint; device 0 is used when CUDA is available, otherwise
# -1 is passed.
summarizer = pipeline(
    "summarization", 
    model="sshleifer/distilbart-cnn-12-6", 
    tokenizer="sshleifer/distilbart-cnn-12-6",
    device=0 if torch.cuda.is_available() else -1
)

def summarize_overview(text, max_length=60, min_length=10):
    """Return a short version of a movie overview.

    Parameters
    ----------
    text : str | None | float
        The overview. Missing values (``None``, NaN, or any other non-string)
        are accepted and produce an empty string instead of raising.
    max_length, min_length : int
        Length bounds forwarded to the summarization pipeline.

    Returns
    -------
    str
        * ``""`` for missing input.
        * For overviews of fewer than 30 words: the text itself, cut to
          150 characters with a trailing ``"..."`` when it was longer.
        * Otherwise the ``summary_text`` produced by ``summarizer``.
    """
    # Missing overviews arrive as None/NaN; neither supports slicing or .split().
    if not isinstance(text, str):
        return ""

    # Short texts are not worth a model call; just cap their length.
    if len(text.split()) < 30:
        return text[:150] + ("..." if len(text) > 150 else "")

    summary = summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        # Clip inputs longer than the model's maximum input length instead of failing on them.
        truncation=True,
    )
    return summary[0]["summary_text"]


# 5. Synonyms for Genres and Intents
We define dictionaries mapping canonical genre/intent names to various synonyms. 
- Example: `"action": ["action", "actions", "Action"]`.
These help us handle different user phrasings in fuzzy matching.


In [50]:
# Canonical intent -> phrases accepted for it (read by match_intent_fuzzy).
# The canonical keys are the values that `search` branches on.
intent_synonyms = {
    "best": ["best", "top", "top rated", "highest rated"],
    "newest": ["newest", "latest", "recent"],
    "popular": ["most popular", "popular", "trending", "famous"],
}
# Canonical genre -> phrases accepted for it (read by match_genre_fuzzy).
# NOTE(review): `search` lower-cases the query before matching, so the
# capitalised "Action" variant only matters for callers passing raw text.
genre_synonyms = {
    "action": ["action", "actions","Action"],
    "adventure": ["adventure", "adventurous"],
    "comedy": ["comedy", "comedies", "funny"],
    "drama": ["drama", "dramatic", "dramas"],
    "fantasy": ["fantasy", "fantasies"],
    "science fiction": ["science fiction", "scifi", "sci-fi", "s-f", "sci fi"],
    "romance": ["romance", "romantic", "romcom", "rom-com"],
    "horror": ["horror", "horrors", "scary"],
    "thriller": ["thriller", "thrilling"],
    "crime": ["crime", "criminal"],
    "animation": ["animation", "animated", "cartoon"],
    "mystery": ["mystery", "mysteries"],
    "family": ["family"],
    "war": ["war", "wars"],
}

# 6. Fuzzy Matching Functions
Two functions using `rapidfuzz.process.extractOne` help us:
- **match_genre_fuzzy**: Identify which genre best matches the user's input.
- **match_intent_fuzzy**: Identify if the user is looking for "best", "newest", or "popular" movies.


In [53]:
def match_genre_fuzzy(user_text, threshold=80):
    """Map free text to a canonical genre via fuzzy synonym matching.

    Every synonym in ``genre_synonyms`` is scored against ``user_text`` with
    ``fuzz.token_set_ratio``. If the best score is at least ``threshold``,
    the canonical genre owning that synonym is returned; otherwise ``None``.
    """
    # (canonical genre, synonym) pairs in dictionary order.
    pairs = [
        (canon, synonym)
        for canon, synonyms in genre_synonyms.items()
        for synonym in synonyms
    ]
    candidates = [synonym for _, synonym in pairs]

    hit = process.extractOne(user_text, candidates, scorer=fuzz.token_set_ratio)
    if not hit or hit[1] < threshold:
        return None

    winner = hit[0]
    # First pair whose synonym equals the winning string decides the genre.
    return next((canon for canon, synonym in pairs if synonym == winner), None)

In [55]:
def match_intent_fuzzy(user_text, threshold=80):
    """Map free text to a canonical intent via fuzzy synonym matching.

    Every synonym in ``intent_synonyms`` is scored against ``user_text`` with
    ``fuzz.token_set_ratio``. If the best score is at least ``threshold``,
    the canonical intent owning that synonym is returned; otherwise ``None``.
    """
    # (canonical intent, synonym) pairs in dictionary order.
    pairs = [
        (canon, synonym)
        for canon, synonyms in intent_synonyms.items()
        for synonym in synonyms
    ]
    candidates = [synonym for _, synonym in pairs]

    hit = process.extractOne(user_text, candidates, scorer=fuzz.token_set_ratio)
    if not hit or hit[1] < threshold:
        return None

    winner = hit[0]
    # First pair whose synonym equals the winning string decides the intent.
    return next((canon for canon, synonym in pairs if synonym == winner), None)

# 7. Cosine Similarity and Semantic Search
- **cosine_similarity**: Calculates the cosine similarity between two vectors.
- **semantic_search**: Encodes the query, compares it with each movie's embedding, and returns the top matches.


In [58]:
def cosine_similarity(a, b):
    """Cosine similarity of two vectors: ``dot(a, b) / (|a| * |b|)``.

    Returns ``0.0`` when either vector has zero norm, instead of dividing by
    zero (which would yield NaN and a RuntimeWarning).
    """
    denominator = norm(a) * norm(b)
    if denominator == 0:
        return 0.0
    return dot(a, b) / denominator

def semantic_search(df, query, embedding_model, top_n=10):
    """Return the rows of ``df`` whose embeddings are most similar to ``query``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``"embedding"`` column holding one vector per row.
    query : str
        Text to encode with ``embedding_model.encode``.
    embedding_model : object
        Anything exposing ``encode(text)`` that returns a vector of the same
        length as the stored embeddings.
    top_n : int
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The ``top_n`` rows ordered by descending cosine similarity (ties keep
        their original row order). An empty ``df`` yields an empty frame.
    """
    if len(df) == 0:
        return df.iloc[[]]

    query_emb = np.asarray(embedding_model.encode(query), dtype=float)
    # One (n_rows, dim) matrix so all similarities come from a single product.
    matrix = np.stack([np.asarray(emb, dtype=float) for emb in df["embedding"]])

    denominators = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_emb)
    # Zero-norm vectors get a similarity of 0 instead of NaN from 0/0.
    scores = np.divide(
        matrix @ query_emb,
        denominators,
        out=np.zeros(len(matrix)),
        where=denominators > 0,
    )

    # Stable sort on the negated scores = descending order, ties in row order.
    ranking = np.argsort(-scores, kind="stable")[:top_n]
    return df.iloc[ranking]

# 8. Main Search Function
The `search` function orchestrates how we interpret the user's query:
1. Converts the query to lowercase, determines if it's asking about multiple "movies" or just one.
2. Fuzzy matches for intent (best/newest/popular) and genre.
3. Filters the DataFrame by genre if found.
4. Sorts by appropriate columns or defaults to semantic search.
5. Generates an output list of result dictionaries, each containing a title, genres, summary, etc.


In [61]:
def search(df, query, embedding_model, top_n=10):
    """Answer a free-text movie query with a list of result dictionaries.

    Steps:
      1. Lower-case the query. If it contains the word "movies", up to
         ``top_n`` results are returned; otherwise exactly one.
      2. Detect an intent ("best" / "newest" / "popular") with
         ``match_intent_fuzzy`` and a genre by running ``match_genre_fuzzy``
         on each word (first hit wins).
      3. Keep only rows whose ``genres`` list contains the matched genre
         (case-insensitive), if a genre was found.
      4. With an intent: sort the filtered rows in descending order by the
         matching column. Without one: rank the filtered rows with
         ``semantic_search``. If the genre filter leaves no rows, fall back
         to a semantic search over the whole frame.
      5. Summarize each result's overview and build the output dictionaries.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``title``, ``overview``, ``genres`` and ``embedding`` columns;
        ``vote_average``, ``release_date`` and ``popularity`` are used for
        sorting and copied to the output when present.
    query : str
    embedding_model : object passed through to ``semantic_search``.
    top_n : int
        Maximum number of results for plural ("movies") queries.

    Returns
    -------
    list[dict]
        One dict per result with ``title``, ``genres``,
        ``summary_of_overview`` and, when available, ``vote_average``,
        ``release_date`` (as ``str``) and ``popularity``.

    Notes
    -----
    The input frame is never modified: date parsing for the "newest" intent
    happens on a derived frame. Earlier debugging prints were removed; one of
    them called ``matched_genre.capitalize()`` unconditionally and therefore
    raised ``AttributeError`` for every query without a recognizable genre.
    """
    query_lower = query.lower()
    n_results = top_n if "movies" in query_lower else 1

    intent = match_intent_fuzzy(query_lower)

    # First word that resolves to a known genre decides the filter.
    matched_genre = next(
        (genre for genre in map(match_genre_fuzzy, query_lower.split()) if genre),
        None,
    )

    if matched_genre:
        wanted = matched_genre.lower()
        has_genre = df["genres"].apply(
            lambda genre_list: any(str(g).lower() == wanted for g in genre_list)
        )
        df_filtered = df[has_genre]
    else:
        df_filtered = df

    # Every supported intent sorts descending on one column.
    sort_col = {
        "best": "vote_average",
        "newest": "release_date",
        "popular": "popularity",
    }.get(intent)

    if sort_col is None:
        results = semantic_search(df_filtered, query, embedding_model, top_n=n_results)
    elif len(df_filtered) == 0:
        # The genre filter removed everything: search the full catalogue instead.
        results = semantic_search(df, query, embedding_model, top_n=n_results)
    else:
        ranked = df_filtered
        if sort_col == "release_date" and ranked[sort_col].dtype == object:
            # assign() returns a new frame, so neither the caller's DataFrame
            # nor a filtered view of it is written to.
            ranked = ranked.assign(
                **{sort_col: pd.to_datetime(ranked[sort_col], errors="coerce")}
            )
        results = ranked.sort_values(by=sort_col, ascending=False).head(n_results)
        if len(results) == 0:
            results = semantic_search(df, query, embedding_model, top_n=n_results)

    output = []
    for _, row in results.iterrows():
        item = {
            "title": row["title"],
            "genres": row["genres"],
            "summary_of_overview": summarize_overview(row["overview"]),
        }
        if "vote_average" in row:
            item["vote_average"] = row["vote_average"]
        if "release_date" in row:
            item["release_date"] = str(row["release_date"])
        if "popularity" in row:
            item["popularity"] = row["popularity"]
        output.append(item)

    return output

# 9. Display the Interactive UI

In [64]:
# Name under which the search callback in this cell reads the movie table.
df_movies = df 
# NOTE(review): self-assignment; it has no effect other than requiring that
# embedding_model was already defined by an earlier cell.
embedding_model = embedding_model

# 1. Create widgets
# Free-text box for the user's query.
query_input = widgets.Text(
    value='',
    placeholder='Type your query here, e.g. "Best drama movie"',
    description='Query:',
    layout=widgets.Layout(width='400px')
)

# Button that triggers the search once a click handler is attached.
search_button = widgets.Button(
    description='Search',
    button_style='primary'
)

# Output area into which results and messages are rendered.
results_output = widgets.Output()

# 2. Define callback
def on_search_clicked(b):
    """Button callback: run the query from ``query_input`` and render the results.

    Parameters
    ----------
    b : the widget that fired the event (unused).

    Everything, including the call to ``search``, runs inside the
    ``results_output`` context so that anything it prints is shown in the
    results area. All text taken from the data set is HTML-escaped before
    being interpolated into markup, and optional fields are shown whenever
    they hold a value (a rating or popularity of ``0`` is a value).
    """
    # Local import keeps this cell self-contained; `escape` is stdlib.
    from html import escape

    def optional_line(label, value):
        # Hide the line only when the field is absent, not when it is 0.
        if value is None or value == "":
            return ""
        return "<p><strong>" + label + ":</strong> " + escape(str(value)) + "</p>"

    results_output.clear_output()
    user_query = query_input.value.strip()

    with results_output:
        if not user_query:
            display(HTML("<p style='color:red;'>Please enter a query.</p>"))
            return

        # Call your real search function
        results = search(df_movies, user_query, embedding_model, top_n=10)

        if not results:
            display(HTML("<p>No results found.</p>"))
            return

        for idx, movie in enumerate(results, start=1):
            title = escape(str(movie.get("title", "Unknown Title")))
            genres = ", ".join(escape(str(g)) for g in movie.get("genres", []))
            summary = escape(str(movie.get("summary_of_overview", "")))

            display(HTML(f"""
            <h3>{idx}. {title}</h3>
            <p><strong>Genres:</strong> {genres}</p>
            {optional_line("Rating", movie.get("vote_average", ""))}
            {optional_line("Release Date", movie.get("release_date", ""))}
            {optional_line("Popularity", movie.get("popularity", ""))}
            <p>{summary}</p>
            <hr>
            """))

# Register the callback so it runs on every click of the Search button.
search_button.on_click(on_search_clicked)

# 3. Display UI
# Query box and button side by side, with the results area rendered below them.
ui_box = widgets.HBox([query_input, search_button])
display(ui_box)
display(results_output)


HBox(children=(Text(value='', description='Query:', layout=Layout(width='400px'), placeholder='Type your query…

Output()