# Step 0: Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# suppress warnings so opensearch-py-ml does not emit one every time a DataFrame connection is made
import warnings
warnings.filterwarnings('ignore')

In [3]:
# imports to demonstrate DataFrame support
import pandas as pd
import numpy as np
import opensearch_py_ml as oml
from opensearchpy import OpenSearch

In [4]:
# imports to demonstrate SageMaker integration
import sagemaker
from sagemaker import RealTimePredictor
from opensearchpy import helpers
import boto3
import json

In [5]:
# imports to demonstrate ml-commons integration
from opensearch_py_ml.ml_commons_integration import MLCommonClient

# Step 1: Set up clients

In [6]:
CLUSTER_URL = 'http://localhost:9200'

In [7]:
def get_os_client(cluster_url = CLUSTER_URL,
                  username='admin',
                  password='admin'):
    """
    Build an OpenSearch client for a single cluster endpoint.

    The client authenticates with HTTP basic auth and is created with
    ``verify_certs=False``, i.e. TLS certificate verification is turned off.

    :param cluster_url: URL of the cluster, e.g. ``http://localhost:9200``;
        defaults to the notebook-level ``CLUSTER_URL``
    :param username: user name for HTTP basic authentication
    :param password: password for HTTP basic authentication
    :return: the configured ``OpenSearch`` client
    """
    credentials = (username, password)
    return OpenSearch(hosts=[cluster_url], http_auth=credentials, verify_certs=False)

In [8]:
# Build the OpenSearch client with get_os_client's defaults (CLUSTER_URL, admin/admin),
# then wrap it in the ml-commons client that exposes the ML-related calls.
client = get_os_client()
ml_client = MLCommonClient(client)

To use the SageMaker integration, make sure your boto3 `Session` has the appropriate credentials.

In [9]:
# Session() is called without arguments, so credentials and region are presumably
# resolved from the environment / local AWS configuration — confirm for your setup.
sess = boto3.Session()
# Client for the 'sagemaker' service; used in the next cell to list endpoints.
smclient = sess.client('sagemaker')

In [10]:
smclient.list_endpoints()

{'Endpoints': [{'EndpointName': 'jumpstart-dft-lgb-classification-model',
   'EndpointArn': 'arn:aws:sagemaker:us-west-2:793040377150:endpoint/jumpstart-dft-lgb-classification-model',
   'CreationTime': datetime.datetime(2022, 7, 15, 11, 43, 50, 310000, tzinfo=tzlocal()),
   'LastModifiedTime': datetime.datetime(2022, 7, 15, 11, 46, 31, 115000, tzinfo=tzlocal()),
   'EndpointStatus': 'InService'}],
 'ResponseMetadata': {'RequestId': '79410960-d4a3-45e6-b00b-5d147511131a',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '79410960-d4a3-45e6-b00b-5d147511131a',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '274',
   'date': 'Fri, 09 Sep 2022 22:17:31 GMT'},
  'RetryAttempts': 0}}

# Step 2: Showcase DataFrame support

opensearch-py-ml DataFrames behave much like pandas DataFrames, but the data resides entirely in the OpenSearch cluster and is refreshed with each call.

In [11]:
ecommerce_oml = oml.DataFrame(client, 'opensearch_dashboards_sample_data_ecommerce')

In [12]:
ecommerce_oml.head()

Unnamed: 0,category,currency,customer_birth_date,customer_first_name,customer_full_name,customer_gender,customer_id,customer_last_name,customer_phone,day_of_week,...,products.taxful_price,products.taxless_price,products.unit_discount_amount,sku,taxful_total_price,taxless_total_price,total_quantity,total_unique_products,type,user
0,[Men's Clothing],EUR,NaT,Eddie,Eddie Underwood,MALE,38,Underwood,,Monday,...,"[11.99, 24.99]","[11.99, 24.99]","[0, 0]","[ZO0549605496, ZO0299602996]",36.98,36.98,2,2,order,eddie
1,[Women's Clothing],EUR,NaT,Mary,Mary Bailey,FEMALE,20,Bailey,,Sunday,...,"[24.99, 28.99]","[24.99, 28.99]","[0, 0]","[ZO0489604896, ZO0185501855]",53.98,53.98,2,2,order,mary
2,"[Women's Shoes, Women's Clothing]",EUR,NaT,Gwen,Gwen Butler,FEMALE,26,Butler,,Sunday,...,"[99.99, 99.99]","[99.99, 99.99]","[0, 0]","[ZO0374603746, ZO0272202722]",199.98,199.98,2,2,order,gwen
3,"[Women's Shoes, Women's Clothing]",EUR,NaT,Diane,Diane Chandler,FEMALE,22,Chandler,,Sunday,...,"[74.99, 99.99]","[74.99, 99.99]","[0, 0]","[ZO0360303603, ZO0272002720]",174.98,174.98,2,2,order,diane
4,"[Men's Clothing, Men's Accessories]",EUR,NaT,Eddie,Eddie Weber,MALE,38,Weber,,Monday,...,"[59.99, 20.99]","[59.99, 20.99]","[0, 0]","[ZO0542505425, ZO0601306013]",80.98,80.98,2,2,order,eddie


Just like in pandas, the output of a DataFrame method is another DataFrame, allowing for methods to be chained:

In [13]:
ecommerce_oml.filter(regex='produ.', axis=1)[['products.base_price', 'products.category']].head()

Unnamed: 0,products.base_price,products.category
0,"[11.99, 24.99]","[Men's Clothing, Men's Clothing]"
1,"[24.99, 28.99]","[Women's Clothing, Women's Clothing]"
2,"[99.99, 99.99]","[Women's Shoes, Women's Clothing]"
3,"[74.99, 99.99]","[Women's Shoes, Women's Clothing]"
4,"[59.99, 20.99]","[Men's Clothing, Men's Accessories]"


In [14]:
ecommerce_oml.groupby(['day_of_week', 'day_of_week_i', 'type']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,products.base_price,products.base_unit_price,products.discount_amount,products.discount_percentage,products.min_price,products.price,products.product_id,products.quantity,products.tax_amount,products.taxful_price,products.taxless_price,products.unit_discount_amount,taxful_total_price,taxless_total_price,total_quantity,total_unique_products
day_of_week,day_of_week_i,type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Friday,4,order,34.665464,34.665464,0.074866,0.118835,17.296444,34.590598,16526.074272,1.0,0.0,34.590598,34.590598,0.074866,75.600248,75.600248,2.185714,2.185714
Monday,0,order,35.963039,35.073007,0.037975,0.063291,17.419775,35.925064,16824.3125,1.003165,0.0,35.925064,35.925064,0.037975,78.421901,78.421901,2.189983,2.183074
Saturday,5,order,34.357962,34.357962,0.064331,0.101911,17.137746,34.293631,16746.33949,1.0,0.0,34.293631,34.293631,0.064331,73.148506,73.148506,2.133152,2.133152
Sunday,6,order,35.278721,35.278721,0.144828,0.229885,17.529815,35.133893,16823.595402,1.0,0.0,35.133893,35.133893,0.144828,74.669203,74.669203,2.125407,2.125407
Thursday,3,order,34.680409,34.680409,0.127316,0.203228,17.300117,34.553093,16895.474597,1.0,0.0,34.553093,34.553093,0.127316,74.584878,74.584878,2.15871,2.15871
Tuesday,1,order,34.335716,34.335716,0.151492,0.244836,17.174665,34.184224,16484.140015,1.0,0.0,34.184224,34.184224,0.151492,73.359492,73.359492,2.146141,2.146141
Wednesday,2,order,35.193203,35.193203,0.111284,0.171206,17.576341,35.081919,16470.031907,1.0,0.0,35.081919,35.081919,0.111284,76.143869,76.143869,2.170608,2.170608


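Unlike pandas DataFrames, opensearch-py-ml DataFrames do not support item assignment (since the data itself is being pulled from OpenSearch). To modify values, first convert the data to a pandas DataFrame with `to_pandas()`, as shown in the cells that follow.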

In [15]:
# Item assignment is attempted on purpose to demonstrate that it is unsupported.
# The resulting TypeError is caught and printed so that the demonstration is still
# visible, but the notebook keeps running under "Restart Kernel -> Run All".
try:
    ecommerce_oml['taxless_total_price'] = ecommerce_oml['taxless_total_price'] * 1.15
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: 'DataFrame' object does not support item assignment

In [16]:
ecommerce_pd = ecommerce_oml.head().to_pandas()

In [17]:
ecommerce_pd['taxless_total_price'] = ecommerce_pd['taxless_total_price'] * 1.15

In [18]:
ecommerce_pd['taxless_total_price']

0     42.527
1     62.077
2    229.977
3    201.227
4     93.127
Name: taxless_total_price, dtype: float64

# Step 3: Showcase SageMaker Support

We'll showcase the SageMaker integration by sending the contents of an `opensearch-py-ml` DataFrame to a deployed SageMaker endpoint. The problem we'll solve is the classic MNIST digit classification task: a 28x28 grayscale image is sent to our ML algorithm, which returns a 10-element vector whose entries are the probabilities that the image shows each digit (0-9).

![title](mnist-3.0.1.png)

The model we've deployed is a LightGBM model for digit classification: https://lightgbm.readthedocs.io/en/v3.3.2/

In [19]:
# Step 1: prepare, in pandas, the data that will be sent to the endpoint.

# Printing helpers: a newline plus the ANSI escape codes that switch bold on and off.
newline = '\n'
bold = '\033[1m'
unbold = '\033[0m'

# Load the CSV, using its 'Unnamed: 0' column as the row index.
test_data = pd.read_csv('truncated_data.csv', index_col='Unnamed: 0')

# Rename the columns by position: the first becomes the label,
# the remaining ones become Feature_1 ... Feature_k.
test_data.columns = ['Target', *(f"Feature_{i}" for i in range(1, test_data.shape[1]))]

num_examples = test_data.shape[0]
num_columns = test_data.shape[1]
print(f"{bold}The test dataset contains {num_examples} examples and {num_columns} columns.{unbold}\n")

# Split into the ground-truth label (first column) and the predicting features (all others).
ground_truth_label = test_data.iloc[:, :1]
features = test_data.iloc[:, 1:]

print(f"{bold}The first 5 observations of the data: {unbold} \n")
test_data.head(5)

[1mThe test dataset contains 100 examples and 785 columns.[0m

[1mThe first 5 observations of the data: [0m 



Unnamed: 0,Target,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,...,Feature_775,Feature_776,Feature_777,Feature_778,Feature_779,Feature_780,Feature_781,Feature_782,Feature_783,Feature_784
0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Uploading a pandas DataFrame as an OpenSearch index is as simple as a one-line call:

In [20]:
# Upload the pandas frame as the 'sagemaker_demo_data' index.
# NOTE(review): per the opensearch-py-ml API reference the default is
# os_if_exists='fail', which would make this cell raise on any re-run once the
# index exists — confirm the keyword names against the installed version.
oml.pandas_to_opensearch(
    test_data,
    client,
    'sagemaker_demo_data',
    os_if_exists='replace',  # drop and re-create the demo index so the cell can be re-run
    os_refresh=True,         # refresh after upload so the next cell sees the documents
)

Unnamed: 0,Feature_1,Feature_10,Feature_100,Feature_101,Feature_102,Feature_103,Feature_104,Feature_105,Feature_106,Feature_107,...,Feature_91,Feature_92,Feature_93,Feature_94,Feature_95,Feature_96,Feature_97,Feature_98,Feature_99,Target
0,0.0,0.0,0.585938,0.363281,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.453125,0.488281,0.667969,0.996094,0.996094,2.0
1,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,1.0
2,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
3,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,4.0
4,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,1.0
96,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,7.0
97,0.0,0.0,0.082031,0.613281,0.941406,0.964844,0.507812,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,6.0
98,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,9.0


In [21]:
oml_data = oml.DataFrame(client, 'sagemaker_demo_data')

In [22]:
oml_data.head()

Unnamed: 0,Feature_1,Feature_10,Feature_100,Feature_101,Feature_102,Feature_103,Feature_104,Feature_105,Feature_106,Feature_107,...,Feature_91,Feature_92,Feature_93,Feature_94,Feature_95,Feature_96,Feature_97,Feature_98,Feature_99,Target
0,0.0,0.0,0.585938,0.363281,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.453125,0.488281,0.667969,0.996094,0.996094,2.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [29]:
oml_data.columns

Index(['Feature_1', 'Feature_10', 'Feature_100', 'Feature_101', 'Feature_102',
       'Feature_103', 'Feature_104', 'Feature_105', 'Feature_106',
       'Feature_107',
       ...
       'Feature_91', 'Feature_92', 'Feature_93', 'Feature_94', 'Feature_95',
       'Feature_96', 'Feature_97', 'Feature_98', 'Feature_99', 'Target'],
      dtype='object', length=785)

Note that because individual fields in OpenSearch are saved as mappings, which have no explicit order, we need to specify a column order when uploading data to our SageMaker endpoint.

In [23]:
# Send the OpenSearch-backed frame to the deployed endpoint.
# The column order is passed explicitly (taken from the original pandas frame,
# minus 'Target') because the OpenSearch mapping does not keep column order.
# NOTE(review): 'Target' is presumably the label column to leave out of the
# payload — confirm against make_sagemaker_prediction's signature.
indices, preds = oml.make_sagemaker_prediction(
    'jumpstart-dft-lgb-classification-model',
    oml_data,
    'Target',
    column_order=test_data.drop(columns='Target').columns.tolist(),
    chunksize=1000,
)

In [24]:
# Each raw response is parsed as JSON; its 'probabilities' entry is turned into
# an array and the per-response arrays are joined along the first axis.
preds = np.concatenate(
    [np.array(json.loads(raw_response)['probabilities']) for raw_response in preds]
)

In [25]:
preds

array([[2.23936363e-06, 2.72204307e-06, 9.99956481e-01, 2.12872813e-05,
        8.25802174e-08, 1.36823121e-06, 1.22973048e-05, 2.43437753e-08,
        3.37040991e-06, 1.27667433e-07],
       [5.41093269e-09, 9.99998665e-01, 3.88315741e-07, 1.67135588e-07,
        1.75890656e-07, 7.01735013e-08, 2.66019821e-08, 2.16850322e-07,
        2.30254270e-07, 5.38853538e-08],
       [9.99997541e-01, 2.06131264e-08, 6.19858350e-07, 1.77407655e-07,
        7.43340552e-08, 5.61539208e-07, 3.28389631e-07, 2.29036267e-07,
        5.61493849e-08, 3.91313511e-07],
       [1.45008252e-07, 2.87245638e-08, 6.37037452e-07, 3.12080418e-07,
        9.99978222e-01, 1.74717099e-07, 1.20819731e-07, 9.76280506e-07,
        8.48253532e-07, 1.85346685e-05],
       [1.99674188e-09, 9.99998727e-01, 5.45260511e-08, 2.65439091e-08,
        5.78441902e-08, 7.42205684e-09, 3.07262386e-09, 1.00344465e-06,
        9.54110614e-08, 2.24618835e-08],
       [9.33716612e-08, 5.19932107e-07, 1.41894822e-06, 4.58352665e-06,
   

In [26]:
# take most likely digit for each test input
preds = np.argmax(preds, axis=1)

# return in order of the indices
s = pd.Series(preds, indices)
s.index = [int(item) for item in s.index]
np.array(s.sort_index()[:100].values)

array([2, 1, 0, 4, 1, 4, 9, 5, 9, 0, 6, 9, 0, 1, 5, 9, 7, 3, 4, 9, 6, 6,
       5, 4, 0, 7, 4, 0, 1, 3, 1, 3, 4, 7, 2, 7, 1, 2, 1, 1, 7, 4, 2, 3,
       5, 1, 2, 4, 4, 6, 3, 5, 5, 6, 0, 4, 1, 9, 5, 7, 8, 9, 3, 7, 4, 6,
       4, 3, 0, 7, 0, 2, 9, 1, 7, 3, 2, 9, 7, 7, 6, 2, 7, 8, 4, 7, 3, 6,
       1, 3, 6, 9, 3, 1, 4, 1, 7, 6, 9, 6])

# Step 4: Showcase ml-commons Support

All ML-related functionality in `opensearch-py-ml` is stored in a separate client:

In [27]:
ml_client = MLCommonClient(client)

In [28]:
ml_client.put_model("sentence_transformer3.zip", "sentence_transformer", 1, verbose=True)

uploading chunk 1 of 47
uploading chunk 2 of 47
uploading chunk 3 of 47
uploading chunk 4 of 47
uploading chunk 5 of 47
uploading chunk 6 of 47
uploading chunk 7 of 47
uploading chunk 8 of 47
uploading chunk 9 of 47
uploading chunk 10 of 47
uploading chunk 11 of 47
uploading chunk 12 of 47
uploading chunk 13 of 47
uploading chunk 14 of 47
uploading chunk 15 of 47
uploading chunk 16 of 47
uploading chunk 17 of 47
uploading chunk 18 of 47
uploading chunk 19 of 47
uploading chunk 20 of 47
uploading chunk 21 of 47
uploading chunk 22 of 47
uploading chunk 23 of 47
uploading chunk 24 of 47
uploading chunk 25 of 47
uploading chunk 26 of 47
uploading chunk 27 of 47
uploading chunk 28 of 47
uploading chunk 29 of 47
uploading chunk 30 of 47
uploading chunk 31 of 47
uploading chunk 32 of 47
uploading chunk 33 of 47
uploading chunk 34 of 47
uploading chunk 35 of 47
uploading chunk 36 of 47
uploading chunk 37 of 47
uploading chunk 38 of 47
uploading chunk 39 of 47
uploading chunk 40 of 47
uploading