# RAG System

In [None]:
# !pip install datasets sentence_transformers tqdm transformers

In [None]:
import pickle
from typing import List, Callable, Iterator
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

# 1. Introducción

En esta práctica implementarás un sistema RAG para la aumentación de información sobre películas. El objetivo es incorporar contexto de películas a los queries que realice el usuario. Para esto, tienes que descargar el conjunto de datos `The Movies Dataset` disponible [aquí](https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset?select=movies_metadata.csv).

Para la generación de los embeddings, utilizarás modelos pre-entrenados basados en transformers con ayuda del paquete `sentence_transformers`; [aquí](https://sbert.net/index.html) puedes encontrar la documentación del paquete. De forma específica, harás uso del modelo `thenlper/gte-large`, cuya documentación puedes encontrar [aquí](https://huggingface.co/thenlper/gte-large).

# 2. Lectura del Conjunto de Datos

In [None]:
def load_dataset(filepath: str) -> pd.DataFrame:
    """Read `movies_metadata.csv` from `The Movies Dataset` into a DataFrame.

    Only the `title`, `overview` and `budget` columns are read. Rows with a
    missing value in any of those columns are dropped, and the index is then
    renumbered from zero.

    :param filepath: Path to the CSV file.
    :return: A pandas DataFrame with the cleaned data.
    """
    wanted_columns = ["title", "overview", "budget"]
    return (
        pd.read_csv(filepath, usecols=wanted_columns)
        .dropna()
        .reset_index(drop=True)
    )


# Load the movie metadata CSV from the current working directory.
dataset = load_dataset(filepath="movies_metadata.csv")
# Row count after cleaning; the recorded run printed: len(dataset): 44,506
print(f"len(dataset): {len(dataset):,}")
# Last expression of the cell: preview the first five rows.
dataset.head(n=5)

len(dataset): 44,506


Unnamed: 0,budget,overview,title
0,30000000,"Led by Woody, Andy's toys live happily in his ...",Toy Story
1,65000000,When siblings Judy and Peter discover an encha...,Jumanji
2,0,A family wedding reignites the ancient feud be...,Grumpier Old Men
3,16000000,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale
4,0,Just when George Banks has recovered from his ...,Father of the Bride Part II


# 3. Implementación del Sistema RAG

In [None]:
"""
SentenceTransformer carga un modelo de tipo SentenceTransformer
https://sbert.net/docs/package_reference/SentenceTransformer.html
desde Hugging Face HUB. Esta clase de modelo tiene un método `encode` que
recibe una cadena de caracteres o una lista de estas y computa sus
embeddings.

Ejemplo de uso:
>>> from sentence_transformers import SentenceTransformer
>>> model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
>>> sentence = ['This framework generates embeddings for each input sentence']
>>> embedding = model.encode(sentence)
"""

import os

# Secrets must not live in source control: the Hugging Face access token is
# read from the HF_TOKEN environment variable instead of being hard-coded.
# When the variable is unset this is None and the library is left to use
# whatever credentials it finds on its own (e.g. a cached login) -- TODO
# confirm that fallback for the installed huggingface_hub version.
access_token = os.environ.get("HF_TOKEN")

# Sentence-embedding model used for both the movie overviews and the queries.
embedding_model = SentenceTransformer("thenlper/gte-large", token=access_token)

modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [None]:
# Keep only the rows labelled 0 through 9 (`.loc` slicing includes the end
# label), presumably so the demo embeddings are quick to compute.
# `.copy()` makes the subset an independent DataFrame: a column is assigned
# to it later, and doing that on a bare slice of the full dataset is what
# triggers pandas' SettingWithCopyWarning.
dataset = dataset.loc[:9].copy()
dataset.head(10)

Unnamed: 0,budget,overview,title
0,30000000,"Led by Woody, Andy's toys live happily in his ...",Toy Story
1,65000000,When siblings Judy and Peter discover an encha...,Jumanji
2,0,A family wedding reignites the ancient feud be...,Grumpier Old Men
3,16000000,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale
4,0,Just when George Banks has recovered from his ...,Father of the Bride Part II
5,60000000,"Obsessive master thief, Neil McCauley leads a ...",Heat
6,58000000,An ugly duckling having undergone a remarkable...,Sabrina
7,0,"A mischievous young boy, Tom Sawyer, witnesses...",Tom and Huck
8,35000000,International action superstar Jean Claude Van...,Sudden Death
9,58000000,James Bond must unmask the mysterious head of ...,GoldenEye


In [None]:
def get_embedding(text: str) -> List[float]:
    """Encode a piece of text with the module-level `embedding_model`.

    :param text: The text to embed.
    :return: The embedding, converted to a plain Python list via `tolist()`.
    """
    return embedding_model.encode(text).tolist()


def apply_embeddings(
    func: Callable[[str], List[float]], data: Iterator[str]
) -> List[List[float]]:
    """Compute embeddings concurrently using a thread pool.

    See https://docs.python.org/3/library/concurrent.futures.html for
    `ThreadPoolExecutor` and its `map` method.

    :param func: Function that turns one text into its embedding.
    :param data: The texts to embed. Any iterable works (a list, a pandas
        Series, a generator, ...); it is consumed once.
    :return: One embedding per input text, in input order (`Executor.map`
        yields results in the order of its inputs).
    """
    # Materialize first: the progress bar needs a total, and one-shot
    # iterators/generators do not support len().
    texts = list(data)
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(func, texts), total=len(texts)))
    return results


def write_dataset_embeddings(filename: str, dataset_embeddings: List[List[float]]) -> None:
    """Persist the embeddings to disk with the standard-library `pickle` module.

    Embeddings take a long time to generate, so this lets them be saved once
    and reused later.

    :param filename: Destination path; the file is opened in binary write mode.
    :param dataset_embeddings: The embeddings to serialize.
    """
    payload = pickle.dumps(dataset_embeddings)
    with open(filename, "wb") as handle:
        handle.write(payload)


def load_dataset_embeddings(filename: str) -> List[List[float]]:
    """Load embeddings that were previously saved with `pickle`.

    :param filename: Path to the pickle file; opened in binary read mode.
    :return: The unpickled object stored in the file.
    """
    # NOTE: unpickling can execute arbitrary code. Only load files you
    # created yourself or otherwise trust.
    with open(filename, "rb") as f:
        # The loaded object must be returned; without this the function
        # always yields None.
        return pickle.load(f)


# Embed every movie overview through the thread-pool helper, then attach the
# resulting list to the DataFrame as a new "embedding" column.
dataset_embeddings = apply_embeddings(func=get_embedding, data=dataset["overview"])
dataset["embedding"] = dataset_embeddings

100%|██████████| 10/10 [00:04<00:00,  2.43it/s]


In [None]:
def get_relevant_dataset(
    dataset: pd.DataFrame,
    query_embedding: List[float],
    k: int = 3,
) -> pd.DataFrame:
    """Select the rows of `dataset` most relevant to the user's query.

    Relevance is the dot product between `query_embedding` and each value in
    the `embedding` column.

    :param dataset: DataFrame that has an `embedding` column.
    :param query_embedding: Embedding of the user's query.
    :param k: Maximum number of rows to return.
    :return: The `k` best-scoring rows, highest score first.
    """
    # Score every row, keyed by its index label.
    scores = {
        label: np.dot(query_embedding, row_embedding)
        for label, row_embedding in dataset["embedding"].items()
    }
    # Stable descending sort: rows with equal scores keep their original order.
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    best_labels = [label for label, _ in ranked[:k]]
    return dataset.loc[best_labels]


def get_search_result(query: str, dataset: pd.DataFrame, k: int = 3) -> str:
    """Build the search-result text used to augment the user's query.

    The query is embedded with `get_embedding`, the most similar rows are
    picked with `get_relevant_dataset`, and each picked row is rendered on
    its own line.

    :param query: The user's query.
    :param dataset: DataFrame with `title`, `budget`, `overview` and
        `embedding` columns.
    :param k: Maximum number of rows to include; forwarded to
        `get_relevant_dataset`.
    :return: One newline-terminated line per selected row, in this format
        (`...` is used here for brevity):
            Title: Toy Story, Budget: 30000000, Overview: Led by Woody, Andy's toys...
            Title: Jumanji, Budget: 65000000, Overview: When siblings Judy and Peter...
            Title: Grumpier Old Men, Budget: 0, Overview: A family wedding reignites...
        An empty string is returned when no rows are selected.
    """
    query_embedding = get_embedding(query)
    relevant_dataset = get_relevant_dataset(dataset, query_embedding, k=k)
    # join() assembles the text in one pass instead of repeated `+=`.
    return "".join(
        f"Title: {row['title']}, Budget: {row['budget']}, Overview: "
        f"{row['overview']}\n"
        for _, row in relevant_dataset.iterrows()
    )

In [None]:
# Hacemos un query al modelo y vemos su versión aumentada.
# Factual question about one specific movie in the dataset.
query = "What was the budget of the movie Father of the Bride Part II?"
# Retrieve the most similar movies and render them as text.
search_result = get_search_result(query=query, dataset=dataset)
# Augmented prompt: the original query followed by the retrieved context.
augmented_query = (
    f"Query: {query}\nContinue to answer the query by using the Search "
    f"Results:\n{search_result}."
)

print(augmented_query)

Query: What was the budget of the movie Father of the Bride Part II?
Continue to answer the query by using the Search Results:
Title: Father of the Bride Part II, Budget: 0, Overview: Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is expecting too. He was planning on selling their home, but that's a plan that -- like George -- will have to change with the arrival of both a grandchild and a kid of his own.
Title: Grumpier Old Men, Budget: 0, Overview: A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is in cooking up a hot time with Max.
Title: Sudden Death, Budget: 35000000, Overview: International action superstar Jean Claude Van Damme teams with Powers Boo

In [None]:
# Hacemos un query al modelo y vemos su versión aumentada.
# Open-ended recommendation question (children's movies).
query = "What is the best movie for kids and why?"
# Retrieve the most similar movies and render them as text.
search_result = get_search_result(query=query, dataset=dataset)
# Augmented prompt: the original query followed by the retrieved context.
augmented_query = (
    f"Query: {query}\nContinue to answer the query by using the Search "
    f"Results:\n{search_result}."
)

print(augmented_query)

Query: What is the best movie for kids and why?
Continue to answer the query by using the Search Results:
Title: Jumanji, Budget: 65000000, Overview: When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures.
Title: Toy Story, Budget: 30000000, Overview: Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.
Title: Tom and Huck, Budget: 0, Overview: A mischievous young boy, Tom Sawyer, witnesses a murder by the

In [None]:
# Hacemos un query al modelo y vemos su versión aumentada.
# Open-ended recommendation question (romantic movies).
query = "What is the best romantic movie to watch and why?"
# Retrieve the most similar movies and render them as text.
search_result = get_search_result(query=query, dataset=dataset)
# Augmented prompt: the original query followed by the retrieved context.
augmented_query = (
    f"Query: {query}\nContinue to answer the query by using the Search "
    f"Results:\n{search_result}."
)

print(augmented_query)

Query: What is the best romantic movie to watch and why?
Continue to answer the query by using the Search Results:
Title: Waiting to Exhale, Budget: 16000000, Overview: Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive "good man" to break a string of less-than-stellar lovers. Friends and confidants Vannah, Bernie, Glo and Robin talk it all out, determined to find a better way to breathe.
Title: Grumpier Old Men, Budget: 0, Overview: A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is in cooking up a hot time with Max.
Title: Sabrina, Budget: 58000000, Overview: An ugly duckling having undergone a remarkable change, still harbors feelings for her crush: a carefree playboy, but not before his business-f

# 4. Integración de Sistemas

In [None]:
# The tokenizer and the causal language model are loaded from the same
# checkpoint, using the same access token.
GEMMA_CHECKPOINT = "google/gemma-2b-it"
tokenizer = AutoTokenizer.from_pretrained(GEMMA_CHECKPOINT, token=access_token)
model = AutoModelForCausalLM.from_pretrained(GEMMA_CHECKPOINT, token=access_token)

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [None]:
# Uses whatever `augmented_query` currently holds, i.e. the value set by the
# most recently executed query cell.
input_ids = tokenizer(augmented_query, return_tensors="pt")
# Generate at most 500 new tokens after the prompt.
response = model.generate(**input_ids, max_new_tokens=500)
# Decode the first (only) returned sequence and show it.
generated_text = tokenizer.decode(response[0])
print(generated_text)

<bos>Query: What is the best romantic movie to watch and why?
Continue to answer the query by using the Search Results:
Title: Waiting to Exhale, Budget: 16000000, Overview: Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive "good man" to break a string of less-than-stellar lovers. Friends and confidants Vannah, Bernie, Glo and Robin talk it all out, determined to find a better way to breathe.
Title: Grumpier Old Men, Budget: 0, Overview: A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is in cooking up a hot time with Max.
Title: Sabrina, Budget: 58000000, Overview: An ugly duckling having undergone a remarkable change, still harbors feelings for her crush: a carefree playboy, but not before his busin

In [None]:
# Uses whatever `augmented_query` currently holds, i.e. the value set by the
# most recently executed query cell.
input_ids = tokenizer(augmented_query, return_tensors="pt")
# Generate at most 500 new tokens after the prompt.
response = model.generate(**input_ids, max_new_tokens=500)
# Decode the first (only) returned sequence and show it.
generated_text = tokenizer.decode(response[0])
print(generated_text)

<bos>Query: What is the best movie for kids and why?
Continue to answer the query by using the Search Results:
Title: Jumanji, Budget: 65000000, Overview: When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures.
Title: Toy Story, Budget: 30000000, Overview: Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.
Title: Tom and Huck, Budget: 0, Overview: A mischievous young boy, Tom Sawyer, witnesses a murder b

In [None]:
# Uses whatever `augmented_query` currently holds, i.e. the value set by the
# most recently executed query cell.
input_ids = tokenizer(augmented_query, return_tensors="pt")
# Generate at most 500 new tokens after the prompt.
response = model.generate(**input_ids, max_new_tokens=500)
# Decode the first (only) returned sequence and show it.
generated_text = tokenizer.decode(response[0])
print(generated_text)

<bos>Query: What was the budget of the movie Father of the Bride Part II?
Continue to answer the query by using the Search Results:
Title: Father of the Bride Part II, Budget: 0, Overview: Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is expecting too. He was planning on selling their home, but that's a plan that -- like George -- will have to change with the arrival of both a grandchild and a kid of his own.
Title: Grumpier Old Men, Budget: 0, Overview: A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is in cooking up a hot time with Max.
Title: Sudden Death, Budget: 35000000, Overview: International action superstar Jean Claude Van Damme teams with Power