## Bibliotecas

In [1]:
import pathlib, glob

import torch
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import PDFToTextConverter, PreProcessor,EmbeddingRetriever
from haystack.nodes.base import BaseComponent
from haystack.pipelines import Pipeline
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


## Pré-Processamento

In [3]:
# The FAISS index dimension has to match the retriever's embedding size.
# distiluse-base-multilingual-cased-v1 produces 512-dimensional vectors, whereas
# FAISSDocumentStore defaults to embedding_dim=768, which makes
# update_embeddings() fail with a dimension mismatch.
document_store = FAISSDocumentStore(similarity="cosine", embedding_dim=512)

# PDF -> text converter configured with "pt" as the only valid language.
converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["pt"])

# Cleans whitespace/empty lines and splits the text by sentence.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="sentence",
    split_respect_sentence_boundary=False)

# Dense retriever backed by a multilingual sentence-transformers checkpoint.
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/distiluse-base-multilingual-cased-v1")

Downloading (…)ce_transformers.json: 100%|██████████| 122/122 [00:00<00:00, 188kB/s]
Downloading (…)5f450/.gitattributes: 100%|██████████| 690/690 [00:00<00:00, 2.77MB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 205kB/s]
Downloading (…)/2_Dense/config.json: 100%|██████████| 114/114 [00:00<00:00, 185kB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.58M/1.58M [00:00<00:00, 2.12MB/s]
Downloading (…)966465f450/README.md: 100%|██████████| 2.38k/2.38k [00:00<00:00, 3.83MB/s]
Downloading (…)6465f450/config.json: 100%|██████████| 556/556 [00:00<00:00, 916kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 122/122 [00:00<00:00, 201kB/s]
Downloading pytorch_model.bin: 100%|██████████| 539M/539M [01:09<00:00, 7.81MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 201kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 311kB/s]
Downloading (…)5f450/tokenizer.json: 100%|██████████| 1.96M/

In [4]:
# Indexing flow: PDF file -> extracted text -> cleaned/split documents -> document store.
indexing_pipeline = Pipeline()
indexing_pipeline.add_node(component=converter, name="TextConverter", inputs=["File"])
indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
# The store must consume the PreProcessor output; wiring it to "TextConverter"
# would bypass cleaning/splitting and index the raw converted text instead.
indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])

NameError: name 'Pipeline' is not defined

In [None]:
# Collect every file matching ./ETL/paper*.pdf (relative to the working directory)
# and send the whole batch through the indexing pipeline in one run.
files_to_index = list(pathlib.Path().glob("./ETL/paper*.pdf"))
indexing_pipeline.run(file_paths=files_to_index)

In [None]:
# Compute embeddings for the stored documents using the configured retriever.
document_store.update_embeddings(retriever=retriever)

In [None]:
# Stand-alone PDF converter with numeric-table removal enabled and "pt" as the
# only valid language (same settings as the converter used by the pipeline).
converter = PDFToTextConverter(valid_languages=["pt"], remove_numeric_tables=True)

In [None]:
from haystack.nodes import PDFToTextConverter
import langdetect

# Rebuild the converter and run it directly on a single paper, outside the pipeline.
converter = PDFToTextConverter(valid_languages=["pt"], remove_numeric_tables=True)
docs = converter.convert(meta=None, file_path="./ETL/paper1.pdf")

In [None]:
from haystack.nodes import PreProcessor

# Sentence-level preprocessing applied by hand to the single converted paper:
# whitespace and empty lines are cleaned, header/footer removal is disabled.
preprocessor = PreProcessor(
    split_by="sentence",
    split_respect_sentence_boundary=False,
    clean_header_footer=False,
    clean_whitespace=True,
    clean_empty_lines=True,
)
docs_default = preprocessor.process(docs)

In [None]:
# Preview only the first few preprocessed documents; displaying the whole list
# floods the cell output and bloats the saved notebook.
docs_default[:5]

In [None]:
# Build ONE flat list of sentence strings. Appending each document's split result
# produced a list of lists, while the later cells (model.encode, sentences[idx],
# zip(sentences, embeddings)) all treat `sentences` as a flat list of strings.
# NOTE(review): depending on the Haystack version the items are dicts or
# Document objects (the latter are not subscriptable), so both are accepted here.
sentences = [
    sentence
    for doc in docs_default
    for sentence in (doc.content if hasattr(doc, "content") else doc["content"]).split(". ")
]

In [None]:

# The input count must come from the converter output (`docs`), not from the
# preprocessor output; otherwise both figures are always identical.
# NOTE(review): if `docs` is a single dict (older converter API) it counts as one input.
print(f"n_files_input: {len(docs) if isinstance(docs, list) else 1}\nn_docs_output: {len(docs_default)}")

### Verificar disponibilidade de placa gráfica

In [None]:
# Bare expression: the cell displays whether PyTorch reports a usable CUDA device.
torch.cuda.is_available()

In [None]:
# Pick the compute device: CUDA when PyTorch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# The doubled newline reproduces the blank separator line after the device report.
print('Using device:', device, end='\n\n')

# CUDA-only diagnostics for device 0: its name, then allocated and reserved
# memory scaled by 2**30 bytes and rounded to one decimal.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0) / 2**30, 1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0) / 2**30, 1), 'GB')

## Modelagem

### Codificação

In [None]:
# NOTE(review): `model` is not defined anywhere in the visible notebook, so this
# cell fails on a fresh kernel. Assuming the same checkpoint as the
# EmbeddingRetriever was intended — confirm before relying on the results.
model = SentenceTransformer("sentence-transformers/distiluse-base-multilingual-cased-v1")

# Encode the sentences with the model, requesting a tensor result on `device`.
embeddings = model.encode(sentences, show_progress_bar=True, convert_to_tensor=True, device=device)

### Teste

In [None]:
# Query sentence to compare against the encoded corpus.
# NOTE(review): "amazôniza" looks like a typo for "amazônia" — confirm; left as-is
# because changing the text changes the resulting embedding.
new_sentence = "A amazôniza azul não possui extensão de 30000 km^2"
# Encode it as a one-element batch so the result has the same layout as `embeddings`.
new_embedding = model.encode(
    [new_sentence],
    device=device,
    convert_to_tensor=True,
)

In [None]:
# NumPy versions of both embedding tensors, moved to host memory first.
np_input = new_embedding.to("cpu").numpy()
np_embeddings = embeddings.to("cpu").numpy()

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Number of closest sentences to keep.
N = 5

# Cosine similarity between the new sentence and every existing sentence;
# row 0 belongs to the single query embedding.
similarities = util.pytorch_cos_sim(new_embedding, embeddings)[0]

# Indices of the N most similar sentences: an ascending sort of the negated
# scores puts the highest similarity first.
most_similar_indices = similarities.neg().argsort()[:N]

In [None]:
# Show the best-matching sentences, most similar first; the tensor of indices
# is converted to plain Python ints before indexing the list.
for idx in most_similar_indices.tolist():
    print(sentences[idx])

In [None]:
# Print each sentence next to its embedding, with a blank line after every pair.
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding, end="\n\n")