In [27]:
import polars as pl
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from ast import literal_eval

In [2]:
# Location of the movie CSV consumed by the loading pipeline; the path is
# relative, so it resolves against the notebook's working directory.
DATASET_PATH = "../data/25k_imdb_movie_dataset.csv"

In [3]:
def concatenate_list(list_):
    """Join the items of a stringified Python list into one space-separated string.

    Parameters
    ----------
    list_ : str or None
        Text containing a Python list literal, e.g. ``"['a', 'b']"``.

    Returns
    -------
    str
        The items joined by single spaces. ``None`` and blank/whitespace-only
        input yield an empty string instead of raising.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the text is not a valid
        Python literal.
    """
    # A missing or blank cell has nothing to parse; literal_eval would raise on it.
    if list_ is None or not list_.strip():
        return ""
    items = literal_eval(list_)
    # str() keeps the join from failing when the literal holds non-string items.
    return " ".join(str(item) for item in items)


def string_to_list(list_):
    """Parse text holding a Python literal (e.g. "['Action', 'Drama']") and return the parsed object."""
    return literal_eval(list_)

In [4]:
# Build the movie table: scan the CSV lazily, derive the cleaned columns,
# compose the text that will be embedded, and materialise with .collect().
df_movies = (
    pl.scan_csv(DATASET_PATH)
    .with_columns(
        [
            # Replace nulls in every existing column with a single space.
            # NOTE(review): the derived columns below are built in this same
            # with_columns call, so they presumably read the un-filled input
            # columns -- confirm how nulls reach the apply() helpers.
            pl.all().fill_null(" "),
            # Source column names ("Plot Kyeword", "Generes") are spelled the
            # way this pipeline reads them from the CSV; do not "correct" them.
            pl.col("Plot Kyeword").apply(concatenate_list).alias("keywords"),
            pl.col("Top 5 Casts").apply(concatenate_list).alias("stars"),
            pl.col("Generes").apply(string_to_list).alias("genres"),
            # Non-strict cast: values that cannot be read as floats become
            # null and are then replaced by 0.
            pl.col("Rating")
            .cast(pl.Float64(), strict=False)
            .fill_null(0)
            .alias("ratings"),
        ]
    )
    # The raw columns are superseded by the cleaned aliases created above.
    .drop(["Plot Kyeword", "Top 5 Casts", "Rating", "Generes"])
    # "text" is the string that is later embedded: overview, keywords and cast.
    .with_columns(pl.format("{} {} {}", "Overview", "keywords", "stars").alias("text"))
    .collect()
)

In [5]:
# Preview the first rows to check the derived columns and their dtypes.
df_movies.head()

movie title,Run Time,User Rating,Overview,Director,Writer,year,path,keywords,stars,genres,ratings,text
str,str,str,str,str,str,str,str,str,str,list[str],f64,str
"""Top Gun: Maver…","""$170,000,000 (…","""187K""","""After more tha…","""Joseph Kosinsk…","""Jim Cash""","""-2022""","""/title/tt17459…","""fighter jet se…","""Jack Epps Jr. …","[""Action"", ""Drama""]",8.6,"""After more tha…"
"""Jurassic World…","""2 hours 27 min…","""56K""","""Four years aft…","""Colin Trevorro…","""Emily Carmicha…","""-2022""","""/title/tt80412…","""dinosaur juras…","""Colin Trevorro…","[""Action"", ""Adventure"", ""Sci-Fi""]",6.0,"""Four years aft…"
"""Top Gun""","""$15,000,000 (e…","""380K""","""As students at…","""Tony Scott""","""Jim Cash""","""-1986""","""/title/tt00920…","""pilot male cam…","""Jack Epps Jr. …","[""Action"", ""Drama""]",6.9,"""As students at…"
"""Lightyear""","""$71,101,257""","""32K""","""While spending…","""Angus MacLane""","""Angus MacLane""","""-2022""","""/title/tt10298…","""galaxy spacesh…","""Jason Headley …","[""Animation"", ""Action"", ""Adventure""]",5.2,"""While spending…"
"""Spiderhead""","""not-released""","""23K""","""In the near fu…","""Joseph Kosinsk…","""George Saunder…","""-2022""","""/title/tt97836…","""discover medic…","""Rhett Reese Pa…","[""Action"", ""Crime"", ""Drama""]",5.4,"""In the near fu…"


In [6]:
# Flatten the per-movie genre lists into one column and keep the distinct values;
# the result is reused later as the choices of the genre dropdown.
unique_genres = df_movies["genres"].explode().unique()
unique_genres

genres
str
"""Thriller"""
"""Comedy"""
"""Animation"""
"""Family"""
"""Fantasy"""
"""Game-Show"""
"""Horror"""
"""Sci-Fi"""
"""Reality-TV"""
"""Musical"""


In [7]:
# Sentence-embedding model shared by the indexing step and the search function,
# so documents and queries are encoded in the same vector space.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

caused by: ["[Errno 2] The file to load file system plugin from does not exist.: '/Users/mmenendezg/Developer/Platzi/.venv/lib/python3.11/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so'"]
caused by: ["dlopen(/Users/mmenendezg/Developer/Platzi/.venv/lib/python3.11/site-packages/tensorflow_io/python/ops/libtensorflow_io.so, 0x0006): tried: '/Users/mmenendezg/Developer/Platzi/.venv/lib/python3.11/site-packages/tensorflow_io/python/ops/libtensorflow_io.so' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/Users/mmenendezg/Developer/Platzi/.venv/lib/python3.11/site-packages/tensorflow_io/python/ops/libtensorflow_io.so' (no such file), '/Users/mmenendezg/Developer/Platzi/.venv/lib/python3.11/site-packages/tensorflow_io/python/ops/libtensorflow_io.so' (no such file)"]


In [8]:
# Sanity check: the value handed to the encoder should be a plain Python str.
type(df_movies["text"][0])

str

In [9]:
# Smoke test: encode one sentence and show the vector as a plain list.
test = "This is a test"
model.encode(test).tolist()


[0.030612563714385033,
 0.013831321150064468,
 -0.020843880251049995,
 0.01632791757583618,
 -0.010231555439531803,
 -0.04798440635204315,
 -0.017313383519649506,
 0.03728753700852394,
 0.04588731750845909,
 0.034404996782541275,
 -0.01995979994535446,
 -0.04465906694531441,
 -0.013102784752845764,
 0.04284125939011574,
 -0.05539330840110779,
 -0.05897992104291916,
 0.013357806950807571,
 -0.04093955084681511,
 -0.04664018750190735,
 0.030635913833975792,
 0.03436756134033203,
 0.060174789279699326,
 -0.059834033250808716,
 0.017685677856206894,
 0.006318037398159504,
 -0.011531688272953033,
 -0.05604173615574837,
 0.02306281588971615,
 0.03552290424704552,
 -0.0007312542293220758,
 -0.004532997962087393,
 0.05712535232305527,
 0.06493887305259705,
 0.022896194830536842,
 0.03908282145857811,
 0.01584312506020069,
 0.07268378883600235,
 0.047734059393405914,
 0.008836328983306885,
 0.03844048082828522,
 0.017816293984651566,
 -0.09784720838069916,
 0.019852932542562485,
 0.026004800572

In [10]:
def get_embeddings(text):
    """Encode `text` with the module-level `model` and return the vector as a plain Python list."""
    vector = model.encode(text)
    return vector.tolist()

In [11]:
# Encode every "text" value in a single batched call instead of invoking the
# model once per row; `encode` accepts a list of sentences and returns one
# vector per sentence, in input order.
df_movies = (
    df_movies.with_columns(
        pl.Series("embeddings", model.encode(df_movies["text"].to_list()).tolist())
    )
    # Add a 0-based row counter and store it as text so it can serve as the vector id.
    .with_row_count(name="ids")
    .with_columns(pl.col("ids").cast(pl.Utf8))
)

In [12]:
# Preview the frame again to confirm the new "ids" and "embeddings" columns.
df_movies.head()

ids,movie title,Run Time,User Rating,Overview,Director,Writer,year,path,keywords,stars,genres,ratings,text,embeddings
str,str,str,str,str,str,str,str,str,str,str,list[str],f64,str,list[f64]
"""0""","""Top Gun: Maver…","""$170,000,000 (…","""187K""","""After more tha…","""Joseph Kosinsk…","""Jim Cash""","""-2022""","""/title/tt17459…","""fighter jet se…","""Jack Epps Jr. …","[""Action"", ""Drama""]",8.6,"""After more tha…","[-0.070956, -0.009481, … 0.038678]"
"""1""","""Jurassic World…","""2 hours 27 min…","""56K""","""Four years aft…","""Colin Trevorro…","""Emily Carmicha…","""-2022""","""/title/tt80412…","""dinosaur juras…","""Colin Trevorro…","[""Action"", ""Adventure"", ""Sci-Fi""]",6.0,"""Four years aft…","[-0.025362, -0.061496, … -0.048473]"
"""2""","""Top Gun""","""$15,000,000 (e…","""380K""","""As students at…","""Tony Scott""","""Jim Cash""","""-1986""","""/title/tt00920…","""pilot male cam…","""Jack Epps Jr. …","[""Action"", ""Drama""]",6.9,"""As students at…","[-0.007393, 0.02565, … 0.049968]"
"""3""","""Lightyear""","""$71,101,257""","""32K""","""While spending…","""Angus MacLane""","""Angus MacLane""","""-2022""","""/title/tt10298…","""galaxy spacesh…","""Jason Headley …","[""Animation"", ""Action"", ""Adventure""]",5.2,"""While spending…","[-0.06645, -0.003073, … -0.02127]"
"""4""","""Spiderhead""","""not-released""","""23K""","""In the near fu…","""Joseph Kosinsk…","""George Saunder…","""-2022""","""/title/tt97836…","""discover medic…","""Rhett Reese Pa…","[""Action"", ""Crime"", ""Drama""]",5.4,"""In the near fu…","[-0.059968, 0.064897, … -0.003551]"


# Pinecone Search

In [13]:
import pinecone
import os

In [14]:
# Read the API key from the environment so it never appears in the notebook;
# raises KeyError if PINECONE_API_KEY is not set.
pinecone_api_key = os.environ["PINECONE_API_KEY"]

In [15]:
# Connect to Pinecone and make sure the target index exists before opening it.
pinecone.init(api_key=pinecone_api_key, environment="gcp-starter")
index_name = "movies-embeddings"
all_index = pinecone.list_indexes()
if index_name not in all_index:
    # First run: size the index from the length of one stored embedding.
    dimension_embeddings = len(df_movies["embeddings"][0])
    pinecone.create_index(index_name, dimension=dimension_embeddings, metric="cosine")
index = pinecone.Index(index_name)

In [16]:
from tqdm.auto import tqdm

# Upload the precomputed vectors to the index in fixed-size batches.
batch_size = 64
total_rows = len(df_movies)
for start in tqdm(range(0, total_rows, batch_size)):
    stop = min(start + batch_size, total_rows)
    chunk = df_movies[start:stop]
    # Each record is an (id, vector, metadata) triple; the metadata is every
    # column except the id, the vector itself and the raw embedded text.
    records = list(
        zip(
            chunk["ids"].to_list(),
            chunk["embeddings"].to_list(),
            chunk.drop(["ids", "embeddings", "text"]).to_dicts(),
        )
    )
    # Update or add the batch in the database.
    _ = index.upsert(vectors=records)

  0%|          | 0/382 [00:00<?, ?it/s]

In [17]:
# Check the index after the upload: vector dimension and stored vector count.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.24402,
 'namespaces': {'': {'vector_count': 24402}},
 'total_vector_count': 24402}

## Search Function

In [28]:
def search(query, genre, rating, top_k):
    """Run a filtered semantic search against the vector index.

    Parameters
    ----------
    query : str
        Free-text description; it is encoded with the module-level `model`.
    genre : str or None
        When truthy, only movies whose "genres" metadata contains this value
        are returned.
    rating : float or None
        Minimum value of the "ratings" metadata field; a falsy value means 0.
    top_k : int or float
        Number of matches to request. Whole-number floats such as 3.0 are
        accepted and converted to int.

    Returns
    -------
    pandas.DataFrame
        One row per match with the columns Title, Overview, Director, Genre,
        Year, Rating and Score. The columns are present even when there are
        no matches.
    """
    query_vector = model.encode(query).tolist()

    conditions = {"ratings": {"$gte": rating if rating else 0}}
    if genre:
        conditions["genres"] = {"$in": [genre]}

    # Numeric UI widgets can deliver floats (e.g. 3.0); the query needs an
    # integer result count.
    top_k = int(top_k)

    responses = index.query(
        vector=query_vector, top_k=top_k, include_metadata=True, filter=conditions
    )

    columns = ["Title", "Overview", "Director", "Genre", "Year", "Rating", "Score"]
    data_responses = [
        {
            "Title": match["metadata"]["movie title"],
            "Overview": match["metadata"]["Overview"],
            "Director": match["metadata"]["Director"],
            "Genre": match["metadata"]["genres"],
            "Year": match["metadata"]["year"],
            "Rating": match["metadata"]["ratings"],
            "Score": match["score"],
        }
        for match in responses["matches"]
    ]

    # Passing the column list keeps the headers when no movie matches.
    return pd.DataFrame(data_responses, columns=columns)

In [29]:
# Example query with no genre or rating filter, asking for the five best matches.
search("a time travel adventure", None, None, 5)

Unnamed: 0,Title,Overview,Director,Genre,Year,Rating,Score
0,Timeline,A group of archaeologists become trapped in th...,Richard Donner,"[Action, Adventure, Sci-Fi]",-2003,5.6,0.669732
1,The Exotic Time Machine,A time machine transports two curious pleasure...,Felicia Sinclair,[Sci-Fi],-1998,3.6,0.668778
2,Norman,A time traveler and his A.I. companion. Norman...,Joel Guelzo,"[Adventure, Sci-Fi, Thriller]",(IV) (2019),3.6,0.663836
3,Zärtliche Chaoten II,3 chaotic unlucky fellows travel back in time ...,Holm Dressler,"[Comedy, Sci-Fi, Romance]",-1988,4.3,0.636989
4,A Sound of Thunder,"A single mistake in the past, by a time travel...",Peter Hyams,"[Action, Adventure, Horror]",-2005,4.2,0.636948


# Interface

In [31]:
import gradio as gr

# Dropdown options: unique() gives no ordering guarantee and the exploded
# column may contain a null, so drop empty entries and sort for a stable,
# readable list.
genres = sorted(genre for genre in unique_genres.to_list() if genre)
# Create the interface: the four inputs map positionally onto
# search(query, genre, rating, top_k).
interface = gr.Interface(
    fn=search,
    inputs=[
        gr.Textbox(lines=5, placeholder="Escribe aqui tu consulta...", label="Consulta"),
        gr.Dropdown(choices=genres, label="Genero de la pelicula"),
        gr.Slider(minimum=1, maximum=10, value=5, label="Puntuacion minima"),
        # precision=0 makes the widget hand `search` an int for top_k.
        gr.Number(
            minimum=1, maximum=10, value=3, precision=0, label="Numero de resultados"
        ),
    ],
    outputs=gr.DataFrame(type="pandas", label="Resultados"),
)

interface.launch()

Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.


