# Step 0: Imports

In [16]:
# %load_ext autoreload
# %autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
# silence warnings so opensearch-py-ml does not emit one every time a DataFrame connection is made
import warnings
warnings.filterwarnings('ignore')  # NOTE: with no category given, this suppresses every warning, not only opensearch-py-ml's

In [2]:
# imports to demonstrate DataFrame support
import pandas as pd
import numpy as np
import opensearch_py_ml as oml
from opensearchpy import OpenSearch

In [30]:
# imports to demonstrate SageMaker integration
import sagemaker
from sagemaker import RealTimePredictor
from opensearchpy import helpers
import boto3
import json

from tests import (
    ECOMMERCE_FILE_NAME,
    ECOMMERCE_INDEX_NAME,
    ECOMMERCE_MAPPING,
    OPENSEARCH_HOST,
    OPENSEARCH_TEST_CLIENT,
    FLIGHTS_FILE_NAME,
    FLIGHTS_INDEX_NAME,
    FLIGHTS_MAPPING,
    FLIGHTS_SMALL_FILE_NAME,
    FLIGHTS_SMALL_INDEX_NAME,
    TEST_MAPPING1,
    TEST_MAPPING1_INDEX_NAME,
    TEST_NESTED_USER_GROUP_DOCS,
    TEST_NESTED_USER_GROUP_INDEX_NAME,
    TEST_NESTED_USER_GROUP_MAPPING,
)



In [4]:
# imports to demonstrate ml-commons integration
from opensearch_py_ml.ml_commons_integration import MLCommonClient

# Step 1: Setup clients 

In [15]:
# URL of the OpenSearch cluster; get_os_client uses it as the default for cluster_url
CLUSTER_URL = 'https://localhost:9200'

In [16]:
def get_os_client(cluster_url = CLUSTER_URL,
                  username='admin',
                  password='admin',
                  verify_certs=False):
    '''
    Get OpenSearch client
    :param cluster_url: cluster URL including scheme and port, e.g. https://localhost:9200
    :param username: user name passed to the client as the first element of http_auth
    :param password: password passed to the client as the second element of http_auth
    :param verify_certs: forwarded to the client's verify_certs option; defaults to
        False, matching the previous hard-coded behaviour
    :return: OpenSearch client
    '''
    # NOTE(review): the 'admin'/'admin' defaults are presumably the demo cluster's
    # stock credentials -- pass real credentials explicitly for any other cluster.
    client = OpenSearch(
        hosts=[cluster_url],
        http_auth=(username, password),
        verify_certs=verify_certs
    )
    return client

In [17]:
# build the OpenSearch client with get_os_client's default URL and credentials,
# then wrap it in the ml-commons client
client = get_os_client()
ml_client = MLCommonClient(client)

To use the SageMaker integration, make sure your boto3 Session has the appropriate credentials.

In [18]:
# Session() is created without arguments, so no credentials are passed explicitly here
sess = boto3.Session()
# SageMaker service client built from that session
smclient = sess.client('sagemaker')

In [19]:
# quick credentials check: this call raises a ClientError (e.g. ExpiredTokenException) when the session token is not valid
smclient.list_endpoints()

ClientError: An error occurred (ExpiredTokenException) when calling the ListEndpoints operation: The security token included in the request is expired

# Step 2: Showcase DataFrame support

opensearch-py-ml DataFrames function similarly to pandas DataFrames, but reside entirely in the OpenSearch cluster and are refreshed with each call.

In [28]:
# bind an opensearch-py-ml DataFrame to the 'opensearch_dashboards_sample_data_ecommerce' index through the existing client
ecommerce_oml = oml.DataFrame(client, 'opensearch_dashboards_sample_data_ecommerce')

In [21]:
# preview the first rows of the DataFrame
ecommerce_oml.head()

Unnamed: 0,category,currency,customer_birth_date,customer_first_name,customer_full_name,customer_gender,customer_id,customer_last_name,customer_phone,day_of_week,...,products.taxful_price,products.taxless_price,products.unit_discount_amount,sku,taxful_total_price,taxless_total_price,total_quantity,total_unique_products,type,user
N9om64IBP2tbyBgXA2Qu,[Men's Clothing],EUR,NaT,Eddie,Eddie Underwood,MALE,38,Underwood,,Monday,...,"[11.99, 24.99]","[11.99, 24.99]","[0, 0]","[ZO0549605496, ZO0299602996]",36.98,36.98,2,2,order,eddie
ONom64IBP2tbyBgXA2Qu,[Women's Clothing],EUR,NaT,Mary,Mary Bailey,FEMALE,20,Bailey,,Sunday,...,"[24.99, 28.99]","[24.99, 28.99]","[0, 0]","[ZO0489604896, ZO0185501855]",53.98,53.98,2,2,order,mary
Odom64IBP2tbyBgXA2Qv,"[Women's Shoes, Women's Clothing]",EUR,NaT,Gwen,Gwen Butler,FEMALE,26,Butler,,Sunday,...,"[99.99, 99.99]","[99.99, 99.99]","[0, 0]","[ZO0374603746, ZO0272202722]",199.98,199.98,2,2,order,gwen
Otom64IBP2tbyBgXA2Qv,"[Women's Shoes, Women's Clothing]",EUR,NaT,Diane,Diane Chandler,FEMALE,22,Chandler,,Sunday,...,"[74.99, 99.99]","[74.99, 99.99]","[0, 0]","[ZO0360303603, ZO0272002720]",174.98,174.98,2,2,order,diane
O9om64IBP2tbyBgXA2Qv,"[Men's Clothing, Men's Accessories]",EUR,NaT,Eddie,Eddie Weber,MALE,38,Weber,,Monday,...,"[59.99, 20.99]","[59.99, 20.99]","[0, 0]","[ZO0542505425, ZO0601306013]",80.98,80.98,2,2,order,eddie


Just like in pandas, the output of a DataFrame method is another DataFrame, allowing for methods to be chained:

In [22]:
# chain three steps: keep columns matching the regex, select two of them, preview the result
(
    ecommerce_oml
    .filter(regex='produ.', axis=1)
    [['products.base_price', 'products.category']]
    .head()
)

Unnamed: 0,products.base_price,products.category
N9om64IBP2tbyBgXA2Qu,"[11.99, 24.99]","[Men's Clothing, Men's Clothing]"
ONom64IBP2tbyBgXA2Qu,"[24.99, 28.99]","[Women's Clothing, Women's Clothing]"
Odom64IBP2tbyBgXA2Qv,"[99.99, 99.99]","[Women's Shoes, Women's Clothing]"
Otom64IBP2tbyBgXA2Qv,"[74.99, 99.99]","[Women's Shoes, Women's Clothing]"
O9om64IBP2tbyBgXA2Qv,"[59.99, 20.99]","[Men's Clothing, Men's Accessories]"


In [23]:
# per-group mean for each (day_of_week, day_of_week_i, type) combination
ecommerce_oml.groupby(['day_of_week', 'day_of_week_i', 'type']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,products.base_price,products.base_unit_price,products.discount_amount,products.discount_percentage,products.min_price,products.price,products.product_id,products.quantity,products.tax_amount,products.taxful_price,products.taxless_price,products.unit_discount_amount,taxful_total_price,taxless_total_price,total_quantity,total_unique_products
day_of_week,day_of_week_i,type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Friday,4,order,34.665464,34.665464,0.074866,0.118835,17.296444,34.590598,16526.074272,1.0,0.0,34.590598,34.590598,0.074866,75.604662,75.604662,2.185714,2.185714
Monday,0,order,35.963039,35.073007,0.037975,0.063291,17.419775,35.925064,16824.3125,1.003165,0.0,35.925064,35.925064,0.037975,78.428824,78.428824,2.189983,2.183074
Saturday,5,order,34.357962,34.357962,0.064331,0.101911,17.137746,34.293631,16746.33949,1.0,0.0,34.293631,34.293631,0.064331,73.15358,73.15358,2.133152,2.133152
Sunday,6,order,35.278721,35.278721,0.144828,0.229885,17.529815,35.133893,16823.595402,1.0,0.0,35.133893,35.133893,0.144828,74.674356,74.674356,2.125407,2.125407
Thursday,3,order,34.680409,34.680409,0.127316,0.203228,17.300117,34.553093,16895.474597,1.0,0.0,34.553093,34.553093,0.127316,74.590161,74.590161,2.15871,2.15871
Tuesday,1,order,34.335716,34.335716,0.151492,0.244836,17.174665,34.184224,16484.140015,1.0,0.0,34.184224,34.184224,0.151492,73.364333,73.364333,2.146141,2.146141
Wednesday,2,order,35.193203,35.193203,0.111284,0.171206,17.576341,35.081919,16470.031907,1.0,0.0,35.081919,35.081919,0.111284,76.150179,76.150179,2.170608,2.170608


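Unlike pandas DataFrames, opensearch-py-ml DataFrames do not allow item assignment (since the data itself is being pulled from OpenSearch). To modify values, first convert the rows you need into a pandas DataFrame with `to_pandas()`, as shown below.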

In [24]:
# Item assignment is expected to fail on an opensearch-py-ml DataFrame. The error is
# caught and displayed so that the demonstration does not abort a Restart & Run All.
try:
    ecommerce_oml['taxless_total_price'] = ecommerce_oml['taxless_total_price'] * 1.15
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: 'DataFrame' object does not support item assignment

In [25]:
# convert only the head() rows into a regular pandas DataFrame
ecommerce_pd = ecommerce_oml.head().to_pandas()

In [26]:
# the same assignment succeeds here because ecommerce_pd is a pandas DataFrame
# NOTE: the column is overwritten in place, so re-running this cell multiplies by 1.15 again
ecommerce_pd['taxless_total_price'] = ecommerce_pd['taxless_total_price'] * 1.15

In [27]:
# display the rescaled column
ecommerce_pd['taxless_total_price']

N9om64IBP2tbyBgXA2Qu     42.527
ONom64IBP2tbyBgXA2Qu     62.077
Odom64IBP2tbyBgXA2Qv    229.977
Otom64IBP2tbyBgXA2Qv    201.227
O9om64IBP2tbyBgXA2Qv     93.127
Name: taxless_total_price, dtype: float64

# Step 3: Showcase SageMaker Support

We'll showcase SageMaker integration by sending an `opensearch-py-ml` DataFrame to a deployed SageMaker endpoint. The particular problem we'll try to solve is the classic MNIST digit classification problem, where a 28x28 grayscale image is sent to our ML algorithm, and the algorithm returns a length-10 vector whose entries are the probabilities that the image shows each digit (0-9).

![title](mnist-3.0.1.png)

The model we've deployed is a LightGBM model for digit classification: https://lightgbm.readthedocs.io/en/v3.3.2/

In [None]:
# step 1: preprocess the data we'd like to send to endpoint in pandas
# ANSI escape codes switch bold text on and off in the printed messages
newline = '\n'
bold = '\033[1m'
unbold = '\033[0m'

# load the CSV, using its 'Unnamed: 0' column as the index
test_data = pd.read_csv('truncated_data.csv', index_col='Unnamed: 0')
# first column becomes 'Target'; the remaining ones are numbered features starting at 1
test_data.columns = ['Target', *(f"Feature_{i}" for i in range(1, test_data.shape[1]))]

num_examples = test_data.shape[0]
num_columns = test_data.shape[1]
print(f"{bold}The test dataset contains {num_examples} examples and {num_columns} columns.{unbold}\n")

# split the ground truth target (first column) from the predicting features (all other columns)
ground_truth_label = test_data.iloc[:, :1]
features = test_data.iloc[:, 1:]

print(f"{bold}The first 5 observations of the data: {unbold} \n")
test_data.head(5)

Uploading a pandas DataFrame as an OpenSearch index is as simple as a one-line call:

In [None]:
# os_if_exists='replace' keeps this cell re-runnable once the index already exists
# (the default mode fails on an existing index); os_refresh=True refreshes the index
# after the upload so the following cells see the documents straight away.
oml.pandas_to_opensearch(
    test_data,
    client,
    'sagemaker_demo_data',
    os_if_exists='replace',
    os_refresh=True,
)

In [None]:
# bind an opensearch-py-ml DataFrame to the 'sagemaker_demo_data' index
oml_data = oml.DataFrame(client, 'sagemaker_demo_data')

In [None]:
# preview the first rows of the DataFrame
oml_data.head()

In [None]:
# show the DataFrame's column labels
oml_data.columns

Note that because individual fields in OpenSearch are saved as mappings, which have no explicit order, we need to specify a column order when uploading data to our SageMaker endpoint.

In [None]:
# column_order lists every column of test_data except 'Target', in test_data's own order
indices, preds = oml.make_sagemaker_prediction(
    'jumpstart-dft-lgb-classification-model',
    oml_data,
    'Target',
    column_order=list(test_data.drop(columns='Target').columns),
    chunksize=1000,
)

In [None]:
# decode each JSON response, take its 'probabilities' entry as an array,
# and stack the arrays along the first axis
preds = np.concatenate(
    [np.array(json.loads(item)['probabilities']) for item in preds]
)

In [None]:
# inspect the concatenated probabilities array
preds

In [None]:
# reduce each probability row to the position of its largest entry (the most likely digit)
preds = np.argmax(preds, axis=1)

# label every prediction with its entry in `indices`, cast the labels to int,
# then show the first 100 predictions in ascending label order
s = pd.Series(data=preds, index=indices)
s.index = list(map(int, s.index))
first_hundred = s.sort_index()[:100]
np.array(first_hundred.values)

# Step 4: Showcase ml-commons Support

All ML-related functionality in `opensearch-py-ml` is exposed through a separate client:

In [None]:
# wrap the existing OpenSearch client; ml_client was already created the same way in Step 1
ml_client = MLCommonClient(client)

In [None]:
# register the zipped model through the ml-commons client
# NOTE(review): positional arguments are presumably (archive path, model name, version) -- confirm against put_model's signature
ml_client.put_model("sentence_transformer3.zip", "sentence_transformer", 1, verbose=True)