# Busca Inteligente de Filmes

Busca de filmes a partir de palavras-chave ou sentenças. O processo é composto por quatro etapas principais: i) enriquecimento de termos; ii) busca semântica; iii) busca léxica; iv) ranqueamento.

## Enriquecimento de termos
O modelo gpt-3.5-turbo é empregado para determinar o filme que melhor corresponde aos termos fornecidos. Para esse filme, são gerados o título, uma sinopse curta e palavras-chave.

## Busca semântica (vetorial)
Busca vetorial por similaridade de cosseno considerando embeddings gerados pelo método [sentence transformers](https://www.sbert.net), modelo **paraphrase-multilingual-MiniLM-L12-v2**.

## Busca léxica
Implementação do algoritmo BM25 disponível no Elasticsearch.

## Ranqueamento
O algoritmo Reciprocal Rank Fusion ([RRF](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf)) é empregado para ranquear os documentos oriundos das buscas semântica e léxica.



# Instalação de Dependências

In [None]:
# Install the notebook's third-party dependencies (-q keeps pip output quiet).
# gradio: web UI prototype built in the last cell of the notebook.
!pip -q install gradio
# openai, tiktoken, langchain: LLM stack used by the enrichment and retrieval cells.
!pip -q install openai tiktoken langchain
# sentence-transformers: embedding model used for the semantic search.
!pip -q install sentence-transformers
# elasticsearch: Python client for the local Elasticsearch instance.
!pip -q install elasticsearch

# Instalação Local do Elasticsearch

In [None]:
%%bash

# Download and unpack Elasticsearch 8.11.1 into the current working directory,
# removing any previous download/extraction first.
rm -rf elasticsearch*
wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-8.11.1-linux-x86_64.tar.gz
tar -xzf elasticsearch-8.11.1-linux-x86_64.tar.gz
# The server is launched as the `daemon` user in the next cell, so that user
# must own the installation tree.
sudo chown -R daemon:daemon elasticsearch-8.11.1/

# only Google Colab instances
umount /sys/fs/cgroup
# -y: a %%bash cell has no interactive stdin, so without it the confirmation
# prompt aborts the install. apt-get is the script-stable front end of apt.
apt-get install -y cgroup-tools

In [None]:
%%bash --bg

# Start the Elasticsearch server as the unprivileged `daemon` user. The --bg
# flag on the cell magic runs the cell in the background so the notebook can
# continue while the server stays up.
sudo -H -u daemon elasticsearch-8.11.1/bin/elasticsearch

In [None]:
# Run Elasticsearch's bundled password setup tool in `auto` mode.
# NOTE(review): presumably the password it reports for the `elastic` user is
# the value to paste into ELASTIC_PASSWORD in the constants cell -- confirm.
!/content/elasticsearch-8.11.1/bin/elasticsearch-setup-passwords auto

In [None]:
# Smoke test: query the node's root endpoint over HTTPS as user `elastic`,
# trusting the CA certificate found in the installation's config directory.
!curl --cacert /content/elasticsearch-8.11.1/config/certs/http_ca.crt -u elastic -H 'Content-Type: application/json' -XGET https://localhost:9200/?pretty=true

# Constantes

Constantes relativas à configuração do Elasticsearch, além da API Key para utilização dos serviços da OpenAI.

In [None]:
# Path to the CA certificate used to verify the local Elasticsearch HTTPS endpoint.
CA_CERT = "/content/elasticsearch-8.11.1/config/certs/http_ca.crt"

# Password of the `elastic` user (placeholder -- replace with the real value).
ELASTIC_PASSWORD = "Elasticsearch password"

# Names of the Elasticsearch indexes: one for lexical (BM25) search and one
# for semantic (vector) search.
BM25_INDEX = "bm25-ranking"
SEMANTIC_SEARCH_INDEX = "semantic-search"

# Index definition for the lexical search: standard analyzer plus a custom
# BM25 similarity (k1=2.0, b=0.75) applied to the `content` text field.
BM25_CONFIG = {
    "settings": {
        "analysis": {
            "analyzer": {
                "default": {"type": "standard"},
            },
        },
        "similarity": {
            "custom_bm25": {
                "type": "BM25",
                "k1": 2.0,
                "b": 0.75,
            },
        },
    },
    "mappings": {
        "properties": {
            "content": {
                "type": "text",
                # Score this field with the custom BM25 similarity defined above.
                "similarity": "custom_bm25",
            },
        },
    },
}

# OpenAI API key (placeholder -- replace with the real value).
OPENAI_API_KEY = "OpenAI API Key"

# Modelo de Embedding

In [None]:
import torch
from langchain.embeddings import HuggingFaceEmbeddings

# Multilingual sentence-transformers model used to embed the catalog texts
# and the search queries.
model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on CPU-only runtimes instead of failing at model load.
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
# Embeddings are left unnormalized.
encode_kwargs = {'normalize_embeddings': False}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Ingestão do Catálogo de Filmes
Criação dos índices no Elasticsearch para as buscas semânticas e léxicas.

In [None]:
import pandas as pd
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import ElasticsearchStore
from langchain.retrievers import ElasticSearchBM25Retriever
from langchain_core.retrievers import BaseRetriever
from elasticsearch import Elasticsearch


# Load the movie catalog into a DataFrame. Later cells read the columns
# title, director, year, actors, fullDescription, genreList, moods and image.
catalog = pd.read_json('/content/drive/catalog.json')

def format_catalog_row(row):
    """Serialize one catalog row into the text that gets indexed.

    The result starts with a newline and holds one ``  label: value`` line
    per field, in a fixed order. ``get_relevant_movies`` recovers the title
    from the second line of this text, so the layout must not change.

    Args:
        row: Mapping-like catalog row with the keys ``title``, ``director``,
            ``year``, ``actors``, ``fullDescription``, ``genreList`` and
            ``moods`` (the last three list-valued ones are comma-joined;
            only the first ten actors are kept).

    Returns:
        The multi-line text representation of the row.
    """
    labelled_values = (
        ("title", row['title']),
        ("director", row['director']),
        ("year", row['year']),
        ("actors", ', '.join(row['actors'][:10])),
        ("fullDescription", row['fullDescription']),
        ("genreList", ', '.join(row['genreList'])),
        ("moods", ', '.join(row['moods'])),
    )
    # Every field is emitted as "\n  label: value"; there is no trailing newline.
    return "".join(f"\n  {label}: {value}" for label, value in labelled_values)


def ingest_data(es_url: str = "https://localhost:9200") -> list[BaseRetriever]:
    """Rebuild both search indexes from the catalog and return their retrievers.

    Every catalog row is converted to text with ``format_catalog_row`` and
    indexed twice: in the vector index (embedded with ``hf``) and in the BM25
    index (created from ``BM25_CONFIG``). An index that already exists is
    deleted first, so each call starts from scratch.

    Args:
        es_url: URL of the Elasticsearch node to connect to.

    Returns:
        ``[semantic_retriever, bm25_retriever]``, in that order.
    """
    documents = catalog.apply(format_catalog_row, axis=1).to_list()

    # Authenticated client; the server certificate is verified against CA_CERT.
    client = Elasticsearch(
        es_url,
        ca_certs=CA_CERT,
        basic_auth=("elastic", ELASTIC_PASSWORD),
    )

    # --- Semantic (vector) index ---
    if client.indices.exists(index=SEMANTIC_SEARCH_INDEX):
        client.indices.delete(index=SEMANTIC_SEARCH_INDEX)
    semantic_store = ElasticsearchStore(
        embedding=hf,
        index_name=SEMANTIC_SEARCH_INDEX,
        es_connection=client,
    )
    semantic_store.add_texts(documents)

    # --- Lexical (BM25) index ---
    if client.indices.exists(index=BM25_INDEX):
        client.indices.delete(index=BM25_INDEX)
    client.indices.create(
        index=BM25_INDEX,
        mappings=BM25_CONFIG["mappings"],
        settings=BM25_CONFIG["settings"],
    )
    lexical_retriever = ElasticSearchBM25Retriever(client=client, index_name=BM25_INDEX)
    lexical_retriever.add_texts(documents)

    return [semantic_store.as_retriever(), lexical_retriever]

In [None]:
# Build both indexes against the default local node and keep the resulting
# retrievers ([semantic, BM25]) for the search cells below.
retrievers = ingest_data()

# Enriquecimento de Termos

In [None]:
from langchain.chains.openai_functions import create_structured_output_runnable
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Chat model used to turn the user's terms into a concrete movie suggestion.
# temperature=0 asks for the least random output.
llm = ChatOpenAI(
    model="gpt-3.5-turbo-1106",
    temperature=0,
    openai_api_key=OPENAI_API_KEY
)

# Prompt: system role, the user's query, and a reminder to respect the
# structured output format.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Você é uma assistente especialista em filmes.",
        ),
        (
            "human",
            "Utilize o formato provido para recomendar um filme partir da seguinte query: {query}",
        ),
        ("human", "Assegure-se que o filme recomendado siga o formato correto!"),
    ]
)

# JSON schema of the structured answer. The three properties are exactly the
# keys that get_relevant_movies reads from the result.
json_schema = {"title": "Filme",
    "description": "Informações básicas sobre o filme recomendado.",
    "type": "object",
    "properties": {
        "nome": {"title": "Título", "description": "Título do filme", "type": "string"},
        "sinopse": {"title": "Sinpose", "description": "sinpose curta do filme (máximo 50 caracteres)", "type": "string"},
        "keywords": {"title": "Keyoword", "description": "palavras-chave descrevendo o filme (atores, diretor, gênero)", "type": "string"}
    },
    # `required` must name keys of `properties`. The previous value
    # ("title", "description") referred to schema metadata, which left every
    # real field optional even though all three are consumed downstream.
    "required": ["nome", "sinopse", "keywords"],
}

# Runnable that maps {"query": ...} to a dict following json_schema.
query_enricher = create_structured_output_runnable(json_schema, llm, prompt)

In [None]:
# Example call: enrich the single term 'arqueólogo' and display the result.
query_enricher.invoke({'query': 'arqueólogo'})

# Busca Híbrida
Os resultados das buscas semântica e léxica são fundidos em uma única lista ordenada por relevância.

In [None]:
from tqdm.asyncio import tqdm

async def run_queries(queries: list[str], retrievers):
    """Run every query against every retriever concurrently.

    Args:
        queries: Query strings to search for.
        retrievers: Retrievers exposing ``aget_relevant_documents``.

    Returns:
        Dict mapping ``(query, position)`` to the documents returned for that
        lookup. ``position`` is the index of the (query, retriever) pair in
        submission order (queries outermost), which keeps keys unique when
        the same query is sent to several retrievers.
    """
    # One (query, retriever) pair per lookup, queries in the outer loop.
    lookups = [(query, retriever) for query in queries for retriever in retrievers]

    # Launch all lookups at once; tqdm.gather awaits them with a progress bar.
    outcomes = await tqdm.gather(
        *(retriever.aget_relevant_documents(query) for query, retriever in lookups)
    )

    return {
        (query, position): documents
        for position, ((query, _), documents) in enumerate(zip(lookups, outcomes))
    }

In [None]:
async def get_relevant_movies(query: str, k: int = 5):
    """Return the titles of the ``k`` catalog movies that best match ``query``.

    The query is first enriched by the LLM into a movie name, a short
    synopsis and keywords. Each of those is searched with every retriever,
    and the ranked lists are merged with Reciprocal Rank Fusion (RRF).

    Args:
        query: Free-text search terms typed by the user.
        k: Maximum number of titles to return.

    Returns:
        List of at most ``k`` movie titles, best match first. Empty when the
        enrichment step yields no usable text.
    """
    # Await the async variant so the LLM round trip does not block the event
    # loop that also serves the UI.
    movie = await query_enricher.ainvoke({"query": query})

    # Ignore fields the model omitted or left blank rather than raising
    # KeyError or issuing empty searches.
    queries = [movie[field] for field in ("nome", "sinopse", "keywords") if movie.get(field)]
    if not queries:
        return []

    results = await run_queries(queries, retrievers)

    # RRF: each ranked list contributes 1 / (rank + 60) to a document's score
    # (rank is 0-based here); documents are identified by their text.
    fused_scores = {}
    for ranked_docs in results.values():
        for rank, doc in enumerate(ranked_docs):
            fused_scores[doc.page_content] = (
                fused_scores.get(doc.page_content, 0.0) + 1.0 / (rank + 60.0)
            )

    # Stable sort: ties keep first-seen order.
    ranked_contents = sorted(fused_scores, key=fused_scores.get, reverse=True)

    # Indexed texts follow format_catalog_row: the second line is
    # "  title: <title>", so drop the 7-character "title: " prefix. Only the
    # top k documents are parsed.
    return [content.split('\n')[1].strip()[7:] for content in ranked_contents[:k]]


# Intuição - Busca Semântica

In [None]:
from langchain.embeddings import OpenAIEmbeddings

# OpenAI embedding client, used only for this visualization (the search
# itself uses the sentence-transformers model `hf`).
openai_emb = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Genres to compare in the plot.
genres = ['terror', 'comédia', 'infantil', 'aventura', 'drama', 'documentário']

# For each genre, draw 100 random movies whose FIRST genreList entry is that
# genre, then stack the samples into one DataFrame.
# NOTE(review): sample(100) presumably fails when a genre has fewer than 100
# matching movies -- confirm against the catalog.
df_genres = pd.concat(
    [
        catalog[catalog.genreList.apply(lambda x: len(x) > 0 and x[0] == genre)].sample(100)
        for genre in genres
    ]
)
# Keep only the columns needed for the plot.
df_genres = df_genres[['title', 'genreList', 'fullDescription']]
# The main genre is the first entry of genreList (non-empty by construction above).
df_genres['mainGenre'] = df_genres['genreList'].apply(lambda x: x[0])
# Embed every description with the OpenAI model; one vector per row.
df_genres['embeddings'] = openai_emb.embed_documents(df_genres['fullDescription'].to_list())

In [None]:
import numpy as np
from sklearn.manifold import TSNE
# Stack the per-row embedding vectors into a single 2-D array (rows = movies).
embeddings = np.array([np.array(v) for v in df_genres['embeddings']])
# Project the embeddings down to two dimensions with t-SNE; `embeddings` is
# rebound to the projected coordinates.
embeddings = TSNE(n_components=2).fit_transform(embeddings)

# Store the two projected coordinates as plot axes.
df_genres['C1'] = embeddings[:, 0]
df_genres['C2'] = embeddings[:, 1]

In [None]:
import plotly.express as px
# Scatter plot of the 2-D projection, colored by main genre; hovering a point
# shows the movie title.
fig = px.scatter(df_genres, x="C1", y="C2", color="mainGenre", hover_data=['title'])
fig.show()

# Protótipo de Interface usando Gradio

In [None]:
# Front end web app
import gradio as gr

def get_movie_card(id: str):
    """Render one catalog entry as the Markdown card shown in the chat.

    Args:
        id: Label of the movie's row in ``catalog`` (looked up through
            ``catalog.loc``).
            NOTE(review): the chat handler in this file passes the titles
            returned by ``get_relevant_movies``, so this presumably relies on
            ``catalog`` being indexed by title -- confirm.

    Returns:
        Markdown text with the upper-cased title, the year, up to five
        genres, up to five moods (only when there are any), the director,
        the description, up to five actors and the poster image.
    """
    def cell(column):
        # Single place for the (row label, column) lookup.
        return catalog.loc[id, column]

    img_url = cell('image')
    title = cell('title').upper()
    year = cell('year')
    genres = ', '.join(cell('genreList')[:5])
    moods = cell('moods')
    # The moods segment, including its bullet separator, is omitted when empty.
    mood_suffix = ' • ' + ', '.join(moods[:5]) if len(moods) > 0 else ''
    director = cell('director')
    description = cell('fullDescription')
    cast = ', '.join(cell('actors')[:5])

    # Leading spaces are part of the emitted text and are kept verbatim.
    card_lines = [
        f"**{title}**",
        f"   {year} • {genres}{mood_suffix}",
        "  **Diretor**",
        f"  {director}",
        "  **Sinpose**",
        f"   {description}",
        "  **Elenco**",
        f"   {cast}",
        f"   ![Movie Poster]({img_url})",
        "  ",
    ]
    return "\n".join(card_lines)


# Chat-style prototype: a chatbot pane, a text box for the query and a button
# that clears the conversation.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")
    chat_history = []

    async def user(user_message, history):
        """Handle a submitted query.

        Appends the user's message with a fixed reply, followed by one card
        per recommended movie, to ``history`` (mutated in place).

        Returns:
            An update that empties the text box, and the extended history.
        """
        titles = await get_relevant_movies(user_message)
        history.append((user_message, "Veja se você gosta destes títulos aqui:"))
        # Cards are bot-only messages, hence None on the user side.
        history.extend((None, get_movie_card(title)) for title in titles)
        return gr.update(value=""), history

    # Enter in the text box runs the search; the button resets the chatbot.
    msg.submit(fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=False)
    clear.click(fn=lambda: None, inputs=None, outputs=chatbot, queue=False)

if __name__ == "__main__":
    demo.launch(debug=True)