<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/milmor/NLP/blob/main/Notebooks/21_FAISS_hf.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />
    Run in Google Colab</a>
  </td>
</table>

# FAISS  
- Dataset: *Game of Thrones* books (Kaggle): https://www.kaggle.com/datasets/khulasasndh/game-of-thrones-books

In [1]:
import torch
import pandas as pd
from datasets import Dataset

# Display the installed PyTorch version.
torch.__version__

'2.5.1+cu124'

## 1.- Dataset

In [2]:
path = './001ssb.txt'

# Read the raw bytes, decode them explicitly as UTF-8 and lowercase the whole
# text. The context manager closes the file handle as soon as it has been
# read instead of leaving it open until garbage collection.
with open(path, 'rb') as book_file:
    book = book_file.read().decode(encoding='utf-8').lower()

# NOTE: len(book) is the number of characters in the text, not the number of
# words, despite the label printed below.
print(f'Words: {len(book)}')

Words: 1628063


In [3]:
import re

# A token is either a run of word characters or one of a few punctuation marks.
token_pattern = re.compile(r'\b\w+\b|[\.,;!?()"\']')
words = token_pattern.findall(book)

maxlen = 50
# Split the token stream into consecutive chunks of `maxlen` tokens
# (the final chunk may be shorter).
chunk_starts = range(0, len(words), maxlen)
sentences = [words[start:start + maxlen] for start in chunk_starts]

In [4]:
# Peek at the first 20 tokens of the first chunk.
sentences[0][:20]

['a',
 'game',
 'of',
 'thrones',
 'book',
 'one',
 'of',
 'a',
 'song',
 'of',
 'ice',
 'and',
 'fire',
 'by',
 'george',
 'r',
 '.',
 'r',
 '.',
 'martin']

In [5]:
# Number of chunks in `sentences`.
len(sentences)

7367

In [6]:
# One record per chunk: a sequential integer id plus the chunk's tokens
# joined back into a single space-separated string.
chunk_texts = list(map(" ".join, sentences))
chunk_ids = list(range(len(sentences)))
my_dict = dict(id=chunk_ids, text=chunk_texts)

# Wrap the columns in a `Dataset` and display its summary.
dataset = Dataset.from_dict(my_dict)
dataset

Dataset({
    features: ['id', 'text'],
    num_rows: 7367
})

In [7]:
# Inspect the first row of the dataset.
dataset[0]

{'id': 0,
 'text': 'a game of thrones book one of a song of ice and fire by george r . r . martin prologue " we should start back , " gared urged as the woods began to grow dark around them . " the wildlings are dead . " " do the'}

## 2.- Model

In [8]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model identified by the name below.
# NOTE(review): trust_remote_code=True permits running Python code shipped
# with the model repository; keep it only for sources you trust and consider
# pinning a specific revision.
model = SentenceTransformer("jinaai/jina-embeddings-v2-base-en", 
                            trust_remote_code=True)

In [9]:
def embed(batch):
    """Compute embeddings for one batch of rows.

    Args:
        batch: Mapping with a "text" entry holding the texts to encode.

    Returns:
        A dict with the single key "embeddings" whose value is whatever
        the module-level `model.encode` returns for those texts.
    """
    texts = batch["text"]
    vectors = model.encode(texts)
    return {"embeddings": vectors}

# Run `embed` over the dataset in batches of 16 rows; the mapped dataset
# gains the "embeddings" column that `embed` returns.
dataset = dataset.map(
    embed,
    batched=True,
    batch_size=16,
)

Map:   0%|          | 0/7367 [00:00<?, ? examples/s]

In [10]:
# Display the dataset summary (features and row count).
dataset

Dataset({
    features: ['id', 'text', 'embeddings'],
    num_rows: 7367
})

In [11]:
# Length of the embedding stored for the first row.
len(dataset[0]['embeddings'])

768

In [12]:
# Add a FAISS index over the "embeddings" column. The returned object is
# bound to `data`, which is what `search` queries.
data = dataset.add_faiss_index("embeddings")

  0%|          | 0/8 [00:00<?, ?it/s]

## 3.- Search

In [13]:
def search(query: str, k: int = 3):
    """Find the dataset rows nearest to a free-text query.

    The query is encoded with the module-level `model` and looked up in the
    "embeddings" index of the module-level `data`.

    Args:
        query: Text to search for.
        k: Number of results to retrieve.

    Returns:
        A `(scores, retrieved_examples)` tuple, as produced by
        `data.get_nearest_examples`.
    """
    # Encode the query with the same model that produced the indexed vectors.
    query_vector = model.encode(query)
    # Ask the index for the top-k entries for that vector.
    hit_scores, hits = data.get_nearest_examples("embeddings", query_vector, k=k)
    return hit_scores, hits

In [14]:
# Retrieve the five nearest passages for the query and show only their text.
scores, result = search("tyrion loves", k=5)
result['text']

["or his hand ? tyrion had no illusions about the king ' s love for his sister . if cersei kept her wits about her , she would insist the king sit in judgment of tyrion himself . even ned stark could scarcely object to that , not without impugning",
 "t love you for it . and his ascent will mean war . stannis cannot rest easy on the throne until cersei and her bastards are dead . do you think lord tywin will sit idly while his daughter ' s head is measured for a spike ? casterly rock",
 'affection or respect , and for that tyrion was willing to forgive him most anything . a servant approached . " bread , " tyrion told him , " and two of those little fish , and a mug of that good dark beer to wash them down . oh',
 'love him , father , i truly truly do , i love him as much as queen naerys loved prince aemon the dragonknight , as much as jonquil loved ser florian . i want to be his queen and have his babies . " " sweet one , " her',
 ". an archer so armed can outrange any wooden bow . ty

In [15]:
# Retrieve the five nearest passages for the query and show only their text.
scores, result = search("stark loves", k=5)
result['text']

['lady stark . littlefinger has never loved anyone but littlefinger , and i promise you that it is not your hand that he boasts of , it \' s those ripe breasts of yours , and that sweet mouth , and the heat between your legs . " kurleket grabbed',
 '. ser alliser \' s onyx eyes were fixed on jon snow . " it would seem our bastard is in love , " he said as jon helped the fat boy to his feet . " show me your steel , lord snow . " jon drew his longsword',
 '. winterfell was such a vast place . " keep him off the walls , then , " she said bravely . " you know how bran loves to climb . " ned kissed the tears from her eyes before they could fall . " thank you , my lady',
 'stark . too long in the north , all the juices have frozen inside you . well , mine are still running . " he slapped his chest to prove it . " you are the king , " ned reminded him . " i sit on the damn iron',
 "robert dies and joff takes the throne ? and the sooner that comes to pass , the safer we ' ll all be . my husban