### 01 기업 비즈니스 개요를 Vector Database에 저장한다.
- 기업 비즈니스 개요: yfinance에서 추출한다.
- Vector Database: ChromaDB를 사용한다. 
- Metadata: where문 사용을 위해 부모 ETF를 넣어주어야 한다.
- Database: 별도의 DB 서버를 두지 않고, 호스트의 로컬 저장장치에 저장한다.

In [1]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader


import os, json

# Read the local JSON config file and export its 'API_KEY' entry as the
# OPENAI_API_KEY environment variable for the rest of the notebook.
with open('conf.json', 'r') as conf_file:
    json_data = json.load(conf_file)

os.environ['OPENAI_API_KEY'] = json_data['API_KEY']

In [2]:
from data.fetch_data import fetch_data_from_db
import pandas as pd

# Select (ETF ticker, constituent stock ticker) pairs, keeping only the
# constituents that also have a row in os_stk_info.
query = """
    SELECT pdf.etf_tkr, pdf.child_stk_tkr
        FROM os_pdf_info pdf
        INNER JOIN os_stk_info stk
        ON pdf.child_stk_tkr=stk.stk_tkr
"""

rows = fetch_data_from_db(query=query)
ticker_df = pd.DataFrame(rows)

# A stock can appear under several ETFs, so de-duplicate the constituent
# tickers through a set; the resulting list order is therefore arbitrary.
ticker_list = list({*ticker_df['child_stk_tkr'].to_list()})

In [3]:
import yfinance as yf
from tqdm.auto import tqdm

# Directory used as an on-disk cache: one JSON file per stock ticker.
stk_info_dir = "./stk_infos"

# open(..., 'w') does not create missing parent directories, so make sure the
# cache directory exists before the first write.
os.makedirs(stk_info_dir, exist_ok=True)

yf_stk_info = yf.Tickers(" ".join(ticker_list))

for t in tqdm(ticker_list):
    info_path = f"{stk_info_dir}/{t}.json"

    # A file left by an earlier run counts as done; skip the ticker.
    if os.path.isfile(info_path):
        continue

    # Every ETF that lists this stock as a constituent.
    parent_etfs = ticker_df[ticker_df['child_stk_tkr'] == t]['etf_tkr'].to_list()

    # Read the info mapping once and reuse that object, so the added key is
    # guaranteed to be part of what gets serialized even if `.info` does not
    # return the same object on every access.
    stk_info = yf_stk_info.tickers[t].info
    stk_info['parent_etfs'] = parent_etfs

    # Serialize before opening the file: if serialization fails, no truncated
    # file is left behind for the existence check above to skip forever.
    serialized = json.dumps(stk_info, indent=4)

    with open(info_path, 'w') as f:
        f.write(serialized)

  0%|          | 0/712 [00:00<?, ?it/s]

In [7]:
from langchain.document_loaders import JSONLoader

def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy the scalar fields of a stock-info record into ``metadata``.

    Args:
        record: One parsed stock-info JSON object.
        metadata: Metadata dict to extend; it is mutated in place.

    Returns:
        The same ``metadata`` object, extended with:
        - every field of ``record`` whose value is exactly a ``str``,
          ``int`` or ``float`` (bools, lists, dicts and ``None`` are
          skipped), except ``longBusinessSummary``;
        - ``parent_etfs`` flattened into one comma-separated string.
    """
    scalar_types = (str, int, float)

    for key, value in record.items():
        # The business summary is excluded from the metadata.
        if key == 'longBusinessSummary':
            continue

        # The list of parent ETFs is stored as a single joined string.
        if key == 'parent_etfs':
            metadata[key] = ",".join(value)

        # Exact type match on purpose: bool is a subclass of int but must
        # not be copied, so isinstance() would change the result.
        if type(value) in scalar_types:
            metadata[key] = value

    return metadata

# Options forwarded to every JSONLoader: take the whole JSON object ('.'),
# use 'longBusinessSummary' as the document text, and build the metadata
# with metadata_func.
json_loader_kwargs = {
    'jq_schema': '.',
    'content_key': 'longBusinessSummary',
    'metadata_func': metadata_func,
}

# Load every *.json file under ./stk_infos.
loader = DirectoryLoader(
    './stk_infos',
    glob='*.json',
    loader_cls=JSONLoader,
    loader_kwargs=json_loader_kwargs,
)

documents = loader.load()

# Notebook display: number of loaded documents.
len(documents)

712

In [8]:
# Split the documents into chunks of 10000 characters, with consecutive
# chunks overlapping by 200 characters.
# NOTE(review): chunk_size is 10000 here; confirm that this, rather than
# 1000, is the intended value.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=10000,
)
texts = text_splitter.split_documents(documents)

# Notebook display: number of chunks after splitting.
display(len(texts))

712

In [11]:
# Local directory handed to Chroma as its persistence location.
persist_directory = 'db'

# Embedding model used to vectorize the chunks.
embedding = OpenAIEmbeddings(model='text-embedding-ada-002')

# Embed all chunks and build the Chroma vector store from them.
vectordb = Chroma.from_documents(
    persist_directory=persist_directory,
    embedding=embedding,
    documents=texts,
)

# vectordb.persist() # reset
# vectordb=None