In [None]:
# Install/upgrade the notebook's dependencies into the local ./python directory
# (pip's --target option) instead of the kernel's default site-packages.
# The import cell that follows appends ./python to sys.path.
# Only Pydantic is pinned (1.10.7); numpy, numexpr and nltk resolve to their latest release.
!pip install --upgrade numpy --target ./python
!pip install --upgrade numexpr --target ./python
!pip install --upgrade nltk --target ./python
!pip install --upgrade Pydantic==1.10.7 --target ./python

In [None]:
import sys
sys.path.append(r"./python")

import os
import json
import traceback
import urllib.parse
import boto3
from datetime import datetime
import time
from python.langchain.vectorstores import OpenSearchVectorSearch
from python.langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler
from python.langchain.embeddings.sagemaker_endpoint import SagemakerEndpointEmbeddings
from typing import Dict, List, Tuple

In [None]:
# Adjust `index` and `language` below to match your deployment before running.

index =  ""  # passed as the vector store's index_name when the store is created
language = "english"
endpoint_name = "huggingface-inference-eb-minilm"  # endpoint handed to SagemakerEndpointEmbeddings

port = 443
bulk_size = 10000000  # NOTE(review): the ingest cell passes its own literal bulk_size to add_texts

# A single Secrets Manager client is reused for both secrets.
sm_client = boto3.client('secretsmanager')

# --- OpenSearch endpoint -----------------------------------------------------
# The 'opensearch-host-url' secret is a JSON document with a 'host' entry,
# e.g. "https://my-test-domain.us-east-1.es.amazonaws.com/".
host_secret = json.loads(
    sm_client.get_secret_value(SecretId='opensearch-host-url')['SecretString']
)
es_host_name = host_secret.get('host')
if not es_host_name or not es_host_name.strip():
    raise ValueError("secret 'opensearch-host-url' does not contain a usable 'host' value")

# Reduce the stored value to a bare hostname. Parsing the URL (instead of
# slicing a fixed number of characters) also handles "http://" URLs and values
# stored without any scheme or trailing slash.
_endpoint = es_host_name.strip()
if '://' not in _endpoint:
    _endpoint = 'https://' + _endpoint
host = urllib.parse.urlsplit(_endpoint).hostname
if not host:
    raise ValueError(
        "could not extract a hostname from secret 'opensearch-host-url': %r" % (es_host_name,)
    )

region = boto3.Session().region_name # e.g. cn-north-1
print('host:',host)
print('region:',region)

# --- OpenSearch credentials --------------------------------------------------
# The 'opensearch-master-user' secret is a JSON document with 'username' and
# 'password' entries. The password is deliberately never printed.
master_user = sm_client.get_secret_value(SecretId='opensearch-master-user')['SecretString']
data= json.loads(master_user)
username = data.get('username')
password = data.get('password')

In [None]:

def init_vector_store(embeddings,
             index_name,
             opensearch_host,
             opensearch_port,
             opensearch_user_name,
             opensearch_user_password):
    """Create an OpenSearchVectorSearch bound to one index.

    Args:
        embeddings: Object forwarded as ``embedding_function``.
        index_name: Name of the index the store operates on.
        opensearch_host: Host placed in the single ``hosts`` entry.
        opensearch_port: Port placed in the single ``hosts`` entry.
        opensearch_user_name: First element of the ``http_auth`` tuple.
        opensearch_user_password: Second element of the ``http_auth`` tuple.

    Returns:
        The newly constructed OpenSearchVectorSearch instance.
    """
    endpoint = {'host': opensearch_host, 'port': opensearch_port}
    credentials = (opensearch_user_name, opensearch_user_password)

    # NOTE(review): opensearch_url is a fixed placeholder string; presumably the
    # `hosts` entry is what actually selects the cluster -- confirm against the
    # bundled OpenSearchVectorSearch implementation.
    return OpenSearchVectorSearch(
        opensearch_url="aws-opensearch-url",
        index_name=index_name,
        embedding_function=embeddings,
        hosts=[endpoint],
        http_auth=credentials,
    )

def init_embeddings(endpoint_name,region_name,language: str = "chinese"):
    """Create a SagemakerEndpointEmbeddings client that exchanges JSON with the endpoint.

    Args:
        endpoint_name: Forwarded as ``endpoint_name``.
        region_name: Forwarded as ``region_name``.
        language: Accepted for interface compatibility; the body does not
            reference it.

    Returns:
        A SagemakerEndpointEmbeddings configured with a JSON content handler.
    """

    class ContentHandler(EmbeddingsContentHandler):
        # Both the request body and the expected response are JSON.
        content_type = "application/json"
        accepts = "application/json"

        def transform_input(self, inputs: List[str], model_kwargs: Dict) -> bytes:
            # Request body: {"inputs": [...]} merged with any extra model kwargs
            # (a kwarg named "inputs" would override the texts).
            payload = {"inputs": inputs, **model_kwargs}
            return json.dumps(payload).encode('utf-8')

        def transform_output(self, output: bytes) -> List[List[float]]:
            # `output` is read via .read(), i.e. it is treated as a file-like
            # object; the decoded JSON document is returned unchanged.
            raw_text = output.read().decode("utf-8")
            return json.loads(raw_text)

    return SagemakerEndpointEmbeddings(
        endpoint_name=endpoint_name,
        region_name=region_name,
        content_handler=ContentHandler(),
    )

# Build the embedding client and the vector store from the configuration values.
# `language` comes from the configuration cell rather than a repeated literal,
# so changing it there is what the call actually receives.
embeddings = init_embeddings(endpoint_name, region, language=language)
vector_store = init_vector_store(embeddings, index, host, port, username, password)

In [None]:
import yaml

# Product fields rendered into the text that gets embedded, in output order:
# (product key, prefix written in front of the value).
_TEXT_FIELD_PREFIXES = (
    ('name', 'name:'),
    ('description', '; description:'),
    ('category', '; category:'),
    ('style', '; style:'),
    ('price', '; price:'),
    ('gender_affinity', '; gender affinity:'),
)

# Every metadata key stored alongside a document, in insertion order, mapped to
# the placeholder used when the product does not define that key.
_METADATA_DEFAULTS = {
    'id': 'None',
    'name': 'None',
    'description': 'None',
    'category': 'None',
    'style': 'None',
    'price': -1,
    'gender_affinity': 'None',
    'current_stock': -1,
    'image': 'None',
}


def _build_product_record(product):
    """Turn one product mapping into ``(product_info, metadata)``.

    ``product_info`` is the text to embed: ``'product '`` followed by every
    field of ``_TEXT_FIELD_PREFIXES`` that the product defines.
    ``metadata`` holds every key of ``_METADATA_DEFAULTS``, using the raw
    product value when the key is present and the placeholder otherwise.
    """
    metadata = {
        key: product.get(key, default)
        for key, default in _METADATA_DEFAULTS.items()
    }

    product_info = 'product '
    for key, prefix in _TEXT_FIELD_PREFIXES:
        if key not in product:
            continue
        value = product[key]
        if key == 'gender_affinity':
            # NOTE(review): every value other than 'F' is rendered as 'male'.
            value = 'female' if value == 'F' else 'male'
        # str() keeps string values unchanged and avoids a TypeError when YAML
        # parsed an unquoted scalar (e.g. a numeric name or style) as non-text.
        product_info += prefix + str(value)
    return product_info, metadata


# Parse the catalogue first so the file handle is released before the upload
# starts. The encoding is explicit so the result does not depend on the locale.
# An empty YAML document parses to None, which is treated as "no products".
with open('./products.yaml', encoding='utf-8') as file:
    products_list = yaml.safe_load(file) or []

product_num = 0
product_info_list = []
metadatas = []
for product in products_list:
    product_info, metadata = _build_product_record(product)
    metadatas.append(metadata)
    product_info_list.append(product_info)
    product_num += 1
    print('product_num:',product_num)
    print('product_info:',product_info)

if product_info_list:
    print('begin to save in vectore store')
    vector_store.add_texts(
        texts=product_info_list,
        metadatas=metadatas,
        bulk_size=10000,
        batch_size=100
    )
    print('finish save in vectore store')
else:
    # Nothing to embed: skip the upload instead of sending an empty batch.
    print('no products found in ./products.yaml; nothing to save')