# read docs

In [None]:
# 1. 문서 읽기

In [None]:
%pip install python-docx

In [None]:
from docx import Document

# Load the Word document and flatten it into plain text, one paragraph per line.
document = Document('./tax.docx')
print(f'document: {dir(document)}')
# Build the text with a single join instead of growing a string with += in a
# loop; every paragraph is still followed by a newline.
full_text = ''.join(f'{paragraph.text}\n' for paragraph in document.paragraphs)

In [None]:
# 2. 문서 쪼개기

In [None]:
%pip install tiktoken

In [None]:
import tiktoken

def split_text(full_text, chunk_size, model_name='gpt-4o'):
    """Split text into consecutive chunks of at most ``chunk_size`` tokens.

    Args:
        full_text: The text to split.
        chunk_size: Maximum number of tokens per chunk; must be at least 1.
        model_name: Model whose tiktoken encoding is used for tokenizing.

    Returns:
        A list of decoded text chunks in their original order. The list is
        empty when the text encodes to no tokens.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    # Validate up front: a negative step would silently yield no chunks (losing
    # the whole text) and a zero step fails with an unrelated range() message.
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be a positive integer, got {chunk_size!r}')

    encoder = tiktoken.encoding_for_model(model_name)
    tokens = encoder.encode(full_text)

    # NOTE(review): chunks are cut on token boundaries, which presumably need
    # not align with character boundaries for multi-byte text (e.g. Korean);
    # confirm how decode() handles a character split across two chunks.
    return [
        encoder.decode(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]
    

In [None]:
# Cut the document text into pieces of at most 1500 tokens each, then show
# how many pieces were produced.
chunk_list = split_text(full_text, chunk_size=1500)
len(chunk_list)

In [None]:
# 3. 임베딩

In [None]:
%pip install chromadb

In [None]:
import chromadb

# Create the Chroma client with its default settings.
# NOTE(review): presumably an in-memory (non-persistent) client — confirm.
chroma_client = chromadb.Client()

In [None]:
# Only the collection name is defined here. The collection itself is created
# further down by get_or_create_collection(), once the OpenAI embedding
# function exists, so it is never registered without that embedding function
# and this cell can be re-run without an "already exists" failure.
# NOTE(review): recent chromadb releases appear to store the embedding function
# with the collection and reject a different one on a later get — confirm
# against the installed version.
collection_name = 'tax_collection'

In [None]:
import os
from dotenv import load_dotenv
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

# Load environment variables via python-dotenv before reading the API key.
load_dotenv()

# Embedding function handed to the Chroma collection; the key comes from the
# OPENAI_API_KEY environment variable (None when it is not set).
openai_api_key = os.environ.get('OPENAI_API_KEY')
openai_embedding = OpenAIEmbeddingFunction(
    api_key=openai_api_key,
    model_name='text-embedding-3-large',
)

In [None]:
# Fetch the collection (creating it when missing) with the OpenAI embedding
# function attached.
tax_collection = chroma_client.get_or_create_collection(
    collection_name,
    embedding_function=openai_embedding,
)

In [None]:
# One string id per chunk, taken from the chunk's position in chunk_list.
ids = [str(position) for position in range(len(chunk_list))]

In [None]:
# Show how many ids were generated (one per entry in chunk_list).
len(ids)

In [None]:
# Store every chunk in the collection under its matching id.
tax_collection.add(
    ids=ids,
    documents=chunk_list,
)

In [None]:
# 4. 유사도 검색

In [None]:
# Ask the collection for the three stored chunks closest to the question.
query = '연봉 5천만원인 직장인의 소득세는 얼마인가요?'
retrieved_doc = tax_collection.query(
    query_texts=query,
    n_results=3,
)

In [None]:
# Display the documents returned by the similarity query above.
retrieved_doc['documents']

In [None]:
# 5. 질의 (https://platform.openai.com/docs/guides/chat-completions)

In [None]:
from openai import OpenAI

client = OpenAI()

# The system prompt tells the model to answer from "the content below", so the
# retrieved chunks must actually be included in it. query() returns one list of
# documents per query text; a single question was asked, so the first entry
# holds the retrieved chunks.
retrieved_chunks = retrieved_doc['documents'][0]
context = '\n\n'.join(retrieved_chunks)

system_prompt = (
    "당신은 한국의 소득세 전문가입니다. 아래 내용을 참고해서 질문에 답변해주세요\n\n"
    f"{context}"
)

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query},
    ],
)

In [None]:
# Display the text of the first choice in the chat completion response.
response.choices[0].message.content