In [1]:
import iris
import os
import sentence_transformers
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
import numpy as np

import torch
from transformers import pipeline

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Connection settings are read from the environment so credentials do not have to be
# edited into the notebook. The fallbacks are the original demo values, so the cell
# behaves exactly as before when no environment variables are set.
connection_string = os.environ.get("IRIS_CONNECTION_STRING", "iris:1972/LLMRAG")
username = os.environ.get("IRIS_USERNAME", "superuser")
password = os.environ.get("IRIS_PASSWORD", "SYS")

# Open the connection and the cursor that the remaining cells share.
connectionIRIS = iris.connect(connection_string, username, password)
cursorIRIS = connectionIRIS.cursor()
print("Connected")


Connected


In [3]:
# Extractive question-answering pipeline used to pull the answer span out of the
# retrieved context.
qa_model = pipeline(
    task="question-answering",
    model="timpal0l/mdeberta-v3-base-squad2",
)

In [4]:
# First run only: fetch the embedding model and store it under /app/data/model/ so
# that it can be loaded from disk afterwards. Skipped when the directory already exists.
if not os.path.isdir('/app/data/model/'):
    modelEmbedding = sentence_transformers.SentenceTransformer(
        'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
    )
    modelEmbedding.save('/app/data/model/')

In [5]:
# Ingest every PDF found under `path`: split the pages into overlapping chunks, embed
# each chunk, and store (source file, chunk text, embedding) in LLMRAG.DOCUMENTCHUNK.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=700,
    chunk_overlap=50,
)
path = "/app/data"
loader = PyPDFDirectoryLoader(path)
docs_before_split = loader.load()
docs_after_split = text_splitter.split_documents(docs_before_split)
modelEmbedding = sentence_transformers.SentenceTransformer("/app/data/model/")

# Encode all chunks with a single call instead of one call per chunk; the encoder
# accepts a list of texts and returns one embedding per text, in the same order.
chunk_texts = [doc.page_content for doc in docs_after_split]
chunk_embeddings = (
    modelEmbedding.encode(chunk_texts, normalize_embeddings=True) if chunk_texts else []
)

insert_sql = "INSERT INTO LLMRAG.DOCUMENTCHUNK (Document, Phrase, VectorizedPhrase) VALUES (?, ?, TO_VECTOR(?,DECIMAL))"

# NOTE(review): this cell only ever INSERTs, so re-running it stores the same chunks
# again unless the table enforces uniqueness - confirm against the table definition.
try:
    for doc, embedding in zip(docs_after_split, chunk_embeddings):
        # TO_VECTOR receives the embedding as comma-separated fixed-point text.
        vector_literal = ','.join('{:.12f}'.format(float(value)) for value in embedding)
        cursorIRIS.execute(
            insert_sql,
            [doc.metadata['source'], doc.page_content, vector_literal],
        )
except Exception:
    # Discard the partially inserted batch so a later commit cannot persist half a load.
    connectionIRIS.rollback()
    raise
else:
    connectionIRIS.commit()


In [6]:
# Embed the question with the embedding model, normalized like the stored chunk vectors.
literalQuestion = "¿Qué medicamento puede tomar mi hijo de 2 años para bajar la fiebre?"
question = modelEmbedding.encode(literalQuestion, normalize_embeddings=True)

# Serialize the vector as comma-separated fixed-point text, the form passed to TO_VECTOR.
to_fixed_point = np.vectorize('{:.12f}'.format)
parameterQuery = [','.join(to_fixed_point(np.array(question)))]

# Fetch the distinct source documents that own at least one chunk whose dot product
# with the question vector exceeds 0.6.
similarity_sql = (
    "SELECT distinct(Document) FROM "
    "(SELECT VECTOR_DOT_PRODUCT(VectorizedPhrase, TO_VECTOR(?, DECIMAL)) AS similarity, Document "
    "FROM LLMRAG.DOCUMENTCHUNK) "
    "WHERE similarity > 0.6"
)
cursorIRIS.execute(similarity_sql, parameterQuery)
similarityRows = cursorIRIS.fetchall()

In [9]:
# Build the QA context from the full text of every document returned by the similarity
# search, then ask the extractive QA model for the answer span.

# Index page texts by upper-cased source path once, so each matched document is a
# dictionary lookup instead of a scan over every loaded page.
pages_by_source = {}
for doc in docs_before_split:
    pages_by_source.setdefault(doc.metadata['source'].upper(), []).append(doc.page_content)

# Keep the original ordering: documents in result-row order, pages in load order.
# Both sides of the comparison are upper-cased so the match does not depend on the
# case in which the database returns the Document column.
context = ''.join(
    page
    for similarityRow in similarityRows
    for page in pages_by_source.get((similarityRow[0] or '').upper(), [])
)

if context:
    answer = qa_model(question = "¿Cómo se llama el medicamento infantil para la fiebre?", context = context)
else:
    # The question-answering pipeline rejects an empty context, so report the miss
    # instead of letting the cell fail with an exception.
    answer = None
    print("No document passed the similarity threshold; there is no context to answer from.")

answer


Hay resultados


{'score': 0.1608656644821167,
 'start': 20154,
 'end': 20160,
 'answer': '\nDalsy'}

In [10]:
# Close the IRIS connection opened by the connection cell; cursorIRIS was created from
# this connection and should not be used after this point.
connectionIRIS.close()
