<img src="./figs/stages.png" height="100"/>

In [1]:
import warnings

# Suppress every Python warning from here on. The filter is installed before
# the llama_index import, so warnings raised at import time are hidden as well.
warnings.filterwarnings("ignore")

from llama_index.core import SimpleDirectoryReader

# Read the files found in ./data (relative to the notebook's working directory)
# into a list of documents; the first one is inspected in the next cell.
documents = SimpleDirectoryReader("./data").load_data()

In [2]:
# print() renders the document's short text form: its ID and the beginning of its text.
print(documents[0])

Doc ID: 538bf634-b23e-4440-a2b0-d75101dfad90
Text: History of the University of Aveiro  The University of Aveiro
will be built to uphold the serene construction of the future and to
defend sacred and enduring values.  This is how the then Minister of
Education, José Veiga Simão began his inauguration speech of the first
Foundation Commission of the University of Aveiro (UA) on December 15,
1973,...


## 2. Index Stage

### 2.1 Chunking

In [3]:
from llama_index.core.node_parser import SentenceSplitter

# Sentence-based splitter with a very small chunk_size and no overlap between
# chunks, so that even a two-sentence string gets split in the next cell.
sp = SentenceSplitter(chunk_size=5, chunk_overlap=0)
# Bare last expression: display the splitter's full configuration.
sp

SentenceSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7f96a800d310>, id_func=<function default_id_func at 0x7f962d7e5d00>, chunk_size=5, chunk_overlap=0, separator=' ', paragraph_separator='\n\n\n', secondary_chunking_regex='[^,.;。？！]+[,.;。？！]?')

In [4]:
# Split a two-sentence string; with chunk_size=5 each sentence ends up in its own chunk.
sp.split_text("Hello there! How are you?")

['Hello there!', 'How are you?']

### 2.2 Understanding embeddings

In [5]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Alternative (kept for reference): calling the constructor without arguments
# loads BAAI/bge-small-en.
# embed_model = HuggingFaceEmbedding()

# Explicitly load BAAI/bge-small-en-v1.5. The same model name is used by the
# ingestion pipeline in section 2.3.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [6]:
# Embed a short sample string with the model loaded above.
embeddings = embed_model.get_text_embedding("Hello World!")
# Show the vector's dimensionality first, then a preview of its first five
# components, each on its own line.
print(len(embeddings), embeddings[:5], sep="\n")

384
[-0.0032757148146629333, -0.011690735816955566, 0.041559211909770966, -0.03814808651804924, 0.024183064699172974]


### 2.3 Nodes and vector stores

<img src="./figs/vector_store.png" height="640"/>

In [7]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import VectorStoreIndex

# Build the ingestion pipeline. Its transformations are applied in order:
#   1. SentenceSplitter cuts each document into chunks (chunk_size=128, no overlap).
#   2. The embedding model attaches an embedding vector to every chunk.
# The embedding step reuses `embed_model` (BAAI/bge-small-en-v1.5, created in
# section 2.2) rather than constructing a second instance of the same model,
# so the model is only loaded once per kernel session.
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=128, chunk_overlap=0),
        embed_model,
    ]
)

In [8]:
# Run the loaded documents through the pipeline's transformations (splitting,
# then embedding) and keep the resulting nodes.
nodes = pipeline.run(documents=documents)

In [9]:
# Number of nodes (chunks) the pipeline produced from the loaded documents.
len(nodes)

245

In [10]:
# Inspect which fields a node carries by converting it to a dict and listing its keys.
dict(nodes[0]).keys()

dict_keys(['id_', 'embedding', 'metadata', 'excluded_embed_metadata_keys', 'excluded_llm_metadata_keys', 'relationships', 'text', 'start_char_idx', 'end_char_idx', 'text_template', 'metadata_template', 'metadata_seperator'])

In [11]:
# Short text form of the first node: its ID and the beginning of its text.
print(nodes[0])

Node ID: 56311f29-8373-496f-bbab-8a30a92536ad
Text: History of the University of Aveiro  The University of Aveiro
will be built to uphold the serene construction of the future and to
defend sacred and enduring values.  This is how the then Minister of
Education, José Veiga Simão began his inauguration speech of the first
Foundation Commission of the University of Aveiro (UA) on December 15,
1973,...


In [12]:
# Relationships of the first node. In this run it has a SOURCE link back to the
# document it was cut from and a NEXT link to the following node.
nodes[0].relationships

{<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='538bf634-b23e-4440-a2b0-d75101dfad90', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '/var/home/luminoso/clouds/github/dspt-handson-llamaindex/data/about_ua.txt', 'file_name': 'about_ua.txt', 'file_type': 'text/plain', 'file_size': 6110, 'creation_date': '2024-05-27', 'last_modified_date': '2024-05-27'}, hash='2213c56a82fd921e87701ca0b530c4831e08497aff7b31121c11970fa0f76dfc'),
 <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='f01030a9-3323-4d28-b3fb-3239b7d9c765', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='7f259bd03cd25e9009c132f25759a84f8e39cdfcb790552aa2f6f7e1a3f1cd72')}

In [13]:
# Metadata attached to the first node (file path, name, type, size and dates in this run).
# NOTE(review): the displayed output includes an absolute local file path;
# consider clearing it before sharing the notebook.
nodes[0].metadata

{'file_path': '/var/home/luminoso/clouds/github/dspt-handson-llamaindex/data/about_ua.txt',
 'file_name': 'about_ua.txt',
 'file_type': 'text/plain',
 'file_size': 6110,
 'creation_date': '2024-05-27',
 'last_modified_date': '2024-05-27'}

In [14]:
# Embedding vector stored on the first node by the pipeline's embedding step.
# NOTE(review): this displays every component of the vector; consider
# nodes[0].embedding[:5] (as done for "Hello World!" in section 2.2) to keep
# the output short.
nodes[0].embedding

[-0.060819316655397415,
 0.059816546738147736,
 0.04734884575009346,
 0.034013740718364716,
 0.012039312161505222,
 0.01607067883014679,
 -0.02744111977517605,
 0.003875291207805276,
 0.011437137611210346,
 0.02849864773452282,
 -0.01023443229496479,
 -0.050660498440265656,
 0.018774621188640594,
 0.012351024895906448,
 0.04082474485039711,
 0.05222167447209358,
 -0.11361880600452423,
 0.0319649837911129,
 0.025724416598677635,
 0.016889294609427452,
 0.0848429873585701,
 0.004742583259940147,
 0.010617236606776714,
 -0.0844719335436821,
 0.051317762583494186,
 0.0585886649787426,
 0.0225776806473732,
 -0.006246177479624748,
 0.03995770215988159,
 -0.12743878364562988,
 -0.012048729695379734,
 -0.021533651277422905,
 0.03701239079236984,
 -0.02206040360033512,
 0.02041030488908291,
 0.009294651448726654,
 0.03172161802649498,
 -0.015590917319059372,
 -0.041141875088214874,
 0.011595763266086578,
 0.005205161869525909,
 0.042350299656391144,
 -0.011899564415216446,
 -0.03762628883123398