In [None]:
# Install third-party dependencies into the local ./python directory
# (pip --target); a later cell appends ./python to sys.path.
# NOTE(review): only Pydantic is pinned to an exact version here; numpy,
# numexpr and nltk are installed with --upgrade and no version pin.
!pip install --upgrade numpy --target ./python
!pip install --upgrade numexpr --target ./python
!pip install --upgrade nltk --target ./python
!pip install --upgrade Pydantic==1.10.7 --target ./python

In [None]:
import sys
sys.path.append(r"./python")

import os
import json
import traceback
import urllib.parse
import boto3
from datetime import datetime
import time
from python.langchain.vectorstores import OpenSearchVectorSearch
from python.langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler
from python.langchain.embeddings.sagemaker_endpoint import SagemakerEndpointEmbeddings
from typing import Dict, List, Tuple

In [None]:
# Adjust `index` and `language` below to match your own deployment.

# NOTE(review): `index` is an empty placeholder that is later passed as the
# OpenSearch index name -- presumably it must be set before running.
index =  ""
language = "chinese"
endpoint_name = "huggingface-inference-eb"

port = 443
bulk_size = 10000000

# One Secrets Manager client is enough for both lookups below.
sm_client = boto3.client('secretsmanager')

# --- OpenSearch endpoint --------------------------------------------------
# The secret 'opensearch-host-url' is a JSON document with a 'host' entry.
host_secret = sm_client.get_secret_value(SecretId='opensearch-host-url')['SecretString']
es_host_name = json.loads(host_secret).get('host')
if not es_host_name:
    raise ValueError("secret 'opensearch-host-url' does not contain a non-empty 'host' value")

# Reduce the stored value to a bare host name, e.g.
#   "https://my-test-domain.us-east-1.es.amazonaws.com/" -> "my-test-domain.us-east-1.es.amazonaws.com"
# Any "scheme://" prefix is dropped (not just an 8-character "https://"), and a
# value stored without a scheme is left intact instead of losing its first
# 8 characters. Trailing slashes are removed.
host = es_host_name.split('://', 1)[-1].rstrip('/')
region = boto3.Session().region_name # e.g. cn-north-1
print('host:',host)
print('region:',region)

# --- OpenSearch credentials -----------------------------------------------
# The secret 'opensearch-master-user' is a JSON document with 'username' and
# 'password' entries.
master_user = sm_client.get_secret_value(SecretId='opensearch-master-user')['SecretString']
data = json.loads(master_user)
username = data.get('username')
password = data.get('password')

In [None]:

def init_vector_store(embeddings,
             index_name,
             opensearch_host,
             opensearch_port,
             opensearch_user_name,
             opensearch_user_password):
    """Construct the OpenSearch-backed vector store.

    Args:
        embeddings: Object passed through as the store's ``embedding_function``.
        index_name: Name of the index the store operates on.
        opensearch_host: Host name placed in the single ``hosts`` entry.
        opensearch_port: Port placed in the single ``hosts`` entry.
        opensearch_user_name: First element of the ``http_auth`` tuple.
        opensearch_user_password: Second element of the ``http_auth`` tuple.

    Returns:
        The newly created ``OpenSearchVectorSearch`` instance.
    """
    node = {'host': opensearch_host, 'port': opensearch_port}
    credentials = (opensearch_user_name, opensearch_user_password)

    # `opensearch_url` is a fixed placeholder string; the real address is
    # supplied through `hosts`.
    # NOTE(review): presumably the vendored langchain lets `hosts` override
    # the URL -- confirm against ./python/langchain.
    return OpenSearchVectorSearch(
        index_name=index_name,
        embedding_function=embeddings,
        opensearch_url="aws-opensearch-url",
        hosts=[node],
        http_auth=credentials,
    )

def init_embeddings(endpoint_name,region_name,language: str = "chinese"):
    """Create a SageMaker-endpoint embeddings client that speaks JSON.

    Args:
        endpoint_name: Forwarded as ``endpoint_name`` to
            ``SagemakerEndpointEmbeddings``.
        region_name: Forwarded as ``region_name``.
        language: Accepted for callers' convenience; not used by this
            function's body.

    Returns:
        The configured ``SagemakerEndpointEmbeddings`` instance.
    """

    class ContentHandler(EmbeddingsContentHandler):
        """Serialises requests to JSON and parses JSON responses."""

        content_type = "application/json"
        accepts = "application/json"

        def transform_input(self, inputs: List[str], model_kwargs: Dict) -> bytes:
            # Request body: {"inputs": [...], <model kwargs>} as UTF-8 JSON.
            payload = {"inputs": inputs}
            payload.update(model_kwargs)
            return json.dumps(payload).encode('utf-8')

        def transform_output(self, output: bytes) -> List[List[float]]:
            # `output` is consumed through .read(), i.e. it is treated as a
            # stream-like object; the parsed JSON is returned unchanged.
            raw = output.read()
            return json.loads(raw.decode("utf-8"))

    return SagemakerEndpointEmbeddings(
        endpoint_name=endpoint_name,
        region_name=region_name,
        content_handler=ContentHandler(),
    )

# Build the embedding client first, then the vector store that uses it.
embeddings = init_embeddings(
    endpoint_name=endpoint_name,
    region_name=region,
    language=language,
)
vector_store = init_vector_store(
    embeddings=embeddings,
    index_name=index,
    opensearch_host=host,
    opensearch_port=port,
    opensearch_user_name=username,
    opensearch_user_password=password,
)

In [None]:
import csv

local_file = "./hotel.csv"

# Rows are embedded and written to the vector store in groups of this size.
FLUSH_EVERY = 40

# (label used in the embedded text, CSV column it is read from), in output order.
# Column names are copied verbatim from the CSV header, including the padding
# spaces around the full-day price column.
PRODUCT_INFO_FIELDS = [
    ('城市', '城市'),
    ('酒店名', '酒店名'),
    ('星级', '星级'),
    ('地区', '地区'),
    ('地铁站', '地铁站'),
    ('会场名称', '会场名称'),
    ('最大容纳人数', '最大容纳人数'),
    ('会场全天价格', ' 会场全天价格(平均) '),
    ('会场半天价格', '会场半天价格(平均)'),
    ('围餐', '围餐'),
    ('自助', '自助'),
    ('常办活动类型', '常办活动类型'),
]


def _build_metadata(row):
    """Return the metadata dict stored alongside one CSV row.

    Text columns are copied as-is; capacity and the two price columns are
    converted with ``float`` (raises ``ValueError`` on non-numeric cells).
    """
    return {
        '城市': row['城市'],
        '地区': row['地区'],
        '酒店名': row['酒店名'],
        '星级': row['星级'],
        '地铁站': row['地铁站'],
        '会场名称': row['会场名称'],
        '最大容纳人数': float(row['最大容纳人数']),
        '会场全天价格': float(row[' 会场全天价格(平均) ']),
        '会场半天价格': float(row['会场半天价格(平均)']),
    }


def _build_product_info(row):
    """Return the 'label:value,label:value,...' text that gets embedded for one row."""
    return ','.join(label + ':' + row[column] for label, column in PRODUCT_INFO_FIELDS)


def _save_batch(texts, metas):
    """Write one batch to the vector store; an empty batch is skipped."""
    if not texts:
        # Nothing pending (empty file, or row count is an exact multiple of
        # FLUSH_EVERY) -- avoid calling add_texts with empty lists.
        return
    print('begin to save in vectore store')
    vector_store.add_texts(
        texts=texts,
        metadatas=metas,
        bulk_size=10000,
        batch_size=100
    )
    print('finish save in vectore store')


# 'utf-8-sig' strips a leading BOM if present, so the first header is plain
# '城市' whether or not the file was saved with a BOM. newline='' is what the
# csv module expects for file objects. The `with` block closes the file.
with open(local_file, mode='r', encoding='utf-8-sig', newline='') as csvfile:
    reader = list(csv.DictReader(csvfile, delimiter=','))

product_list = []
metadatas = []
i = 0
for i, line in enumerate(reader, start=1):
    # Metadata first: its float() conversions validate the numeric cells.
    metadatas.append(_build_metadata(line))
    product_list.append(_build_product_info(line))

    if i % FLUSH_EVERY == 0:
        _save_batch(product_list, metadatas)
        product_list = []
        metadatas = []

# Flush whatever is left over after the last full batch.
_save_batch(product_list, metadatas)