In [114]:
import cohere
import pinecone
import openai

In [None]:
class EmbeddingGenerator:
    def __init__(self):
        pass

class DocumentIndexer:
    def __init__(self):
        

In [42]:
from voyager import Index, Space

In [None]:
co = cohere.Client(COHERE_API)

In [364]:
pinecone.create_index('document-indexer', dimension=1024, metadata_config={"indexed": ["user_id", "document_id"]})

In [365]:
import tika
tika.initVM()
from tika import parser

In [380]:
from pypdf import PdfReader 
from nltk.tokenize import sent_tokenize

def format_line(line: str) -> str:
    return line.replace('\n', '')

def paragraph_splitter(lines: str, max_char: int = 4000) -> list[str]:
    paragraphs = []
    paragraph = ''
    for line in lines:
        paragraph += ' ' + line
        if len(paragraph) >= max_char:
            paragraphs.append(paragraph)
            paragraph = ''
    if len(paragraph) > 0:
        paragraphs.append(paragraph)
    return paragraphs
    

def get_text_from_pdf(filename: str) -> list[str]:
    lines = [format_line(line) for line in sent_tokenize(parser.from_file(filename)['content'])]
    return paragraph_splitter(lines)

def add_document(filename, user_id, document_id):
    texts = get_text_from_pdf(filename)
    print(len(texts))
    response = co.embed(
      texts=texts,
      model='embed-multilingual-v3.0',
      input_type='search_document'
    )
    index_map[(user_id, document_id)] = index = Index(Space.Cosine, num_dimensions=1024)
    upsert_response = index.upsert(
        vectors=[
            (
             f"{user_id}_{document_id}_{i}",
             e ,  # Dense vector values
            {"user_id": user_id, "document_id": document_id, "sentence_id": i, "text": text}
            )
            for i, (text, e) in enumerate(zip(texts, response.embeddings))
        ]
    )

In [405]:
def get_answer(question, user_id, document_id):
    index = pinecone.Index("document-indexer")
    embedding = co.embed(
          texts=[question],
          model='embed-multilingual-v3.0',
          input_type='search_query'
        ).embeddings[0]
    query_response = index.query(
        top_k=3,
        vector=embedding,
        filter={
            "user_id": {"$eq": user_id},
            "document_id": {"$eq": document_id},
        },
        include_metadata=True
    )
    answers = [q['metadata']['text'] for q in query_response['matches'] if q['score']]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k",
        messages=[
            {"role": "system", "content": 
                 f"""
    The answer to this question "{question}" are in the following sentences generate just the answer plus the reasoning of the answer in one line. Answer the question in the language of the question. If there are no answer just output the word "None"
    {str(answers)}
                 """.strip()}
        ],
        temperature = 0
    )
    answer = response.choices[0].message.content
    print(f"Price: {response.usage.total_tokens * 0.003 / 1000} $")
    reasoning = None
    if answer != 'None':
        reasoning = answers[0]
    return answer

In [383]:
add_document('Resume.pdf', 'hoang', 'resume')

1


In [384]:
add_document('transformer.pdf', 'hoang', 'transformer')

10


In [385]:
add_document('mlviet.pdf', 'hoang', 'vietnam')

5


In [386]:
add_document('giaithuat.pdf', 'hoang', 'vietgiaithuat')

129


In [411]:
get_answer('Why Transformer is good for translation', 'hoang', 'transformer')

Price: 0.008715 $


'The Transformer is good for translation because it replaces recurrent layers with multi-headed self-attention, allowing for faster training and achieving state-of-the-art results.'

In [318]:
len(get_text_from_pdf('transformer.pdf'))

15