### 02 Vector Database에 특정 키워드로 유사도 검색 후, 유사도 내림차순으로 종목명을 정렬합니다.
- 유사도: cosine similarity
- vector database(ChromaDB) API: https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.chroma.Chroma.html

#### 02.1 vectordb 불러오기

In [10]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

import os, json

def get_vectordb():
    with open('conf.json', 'r') as f:
        json_data = json.load(f)

    os.environ['OPENAI_API_KEY'] = json_data['API_KEY']

    persist_directory='db'

    embedding = OpenAIEmbeddings(
        model='text-embedding-ada-002'
    )

    vectordb = Chroma( # 기존 벡터 DB 로드
        persist_directory=persist_directory,
        embedding_function=embedding
    )
    
    return vectordb

vectordb = get_vectordb()

#### 02.2 특정 지수의 구성종목 불러오기
- 필터링 후 유사도 검색 하기 위함입니다.

In [11]:
from data.fetch_data import fetch_pdf_info

import pandas as pd

def get_filter(etf_tkr='AIQ'):

    pdf = fetch_pdf_info(etf_tkr=etf_tkr)

    pdf_df = pd.DataFrame(pdf)

    filter_list = []

    for i, row in pdf_df.iterrows():

        my_dict = {}

        symbol = row.child_stk_tkr

        my_dict['symbol'] = symbol

        filter_list.append(my_dict)
        
    return filter_list

filter_list = get_filter(etf_tkr="AIQ")

In [12]:
def get_similar_symbols(*args, **kwargs):
    
    vectordb = kwargs.pop('vectordb')
    keyword = kwargs.pop('keyword')
    filter_list = kwargs.pop('filter_list')
    k = kwargs.pop('k', 5)

    docs = vectordb.similarity_search_with_score(keyword, k=k, filter={
        '$or': filter_list
        })
    
    symbol_list = []

    for doc in docs:
        symbol_list.append(doc[0].metadata['symbol'])
        
    return symbol_list

keyword = "Platform as a Service (PaaS) Companies that are involved in providing a platform for creating software applications which are delivered over the internet."

get_similar_symbols(keyword=keyword, k=10)

['PEGA',
 'NOW',
 'ORCL',
 'IBM',
 'ZS',
 'MSFT',
 'META',
 'TWLO',
 'HPE',
 'ADBE',
 'GOOGL',
 'WDAY',
 'ACN',
 'AMZN',
 'CRM',
 'INFA',
 'SHOP',
 'WIX',
 'PATH',
 'DDOG']