In [2]:
from elasticsearch import Elasticsearch
from elasticsearch_llm_cache import ElasticsearchLLMCache, ElasticsearchLLMFilter
from elasticsearch.exceptions import NotFoundError

# common libraries
from dotenv import load_dotenv
import os
from os import environ
import openai
from icecream import ic

# load our environment file
load_dotenv()

es_url = f"https://{os.environ['elasticsearch_user']}:{os.environ['elasticsearch_pw']}@{os.environ['elasticsearch_host']}:{os.environ['elasticsearch_port']}"
# es_index= os.environ['elasticsearch_index']
os.environ['OPENAI_API_KEY'] = os.environ['openai_api_key']
open_api_key=os.environ['OPENAI_API_KEY']

# define our API Key
openai.api_key = os.getenv("openai_api_key")


In [7]:
es = Elasticsearch([es_url])

cache_index_name = 'llm_cache_test'
filter_index_name = 'llm_filter_test'
model_id = 'sentence-transformers__msmarco-minilm-l-12-v3'

In [8]:
import os
import time
#print(os.environ['ELASTIC_CLOUD_ID'])
#time.sleep(10)
from elasticsearch import Elasticsearch

from elasticsearch_llm_cache import (
    ElasticsearchLLMCache,  
    ElasticsearchLLMFilter,
)

from pprint import pprint
import time

es_client = es

if es_client.indices.exists(index=cache_index_name):
    es_client.indices.delete(index=cache_index_name)
    ic(f'{cache_index_name} exists, deleting.')

if es_client.indices.exists(index=filter_index_name):
    es_client.indices.delete(index=filter_index_name)
    ic(f'{filter_index_name} exists, deleting.')

# es_client.indices.create(index=cache_index_name)
# es_client.indices.create(index=filter_index_name)


# Initialize your caching class
cache = ElasticsearchLLMCache(es_client=es_client, index_name=cache_index_name, es_model_id=model_id, create_index=False)
cache.create_index(dims=384)

filter = ElasticsearchLLMFilter(es_client=es_client, index_name=filter_index_name, es_model_id=model_id, create_index=False)
filter.create_index(dims=384)


INFO:elastic_transport.transport:HEAD https://demo-defc18.es.us-central1.gcp.cloud.es.io:9243/llm_cache_test [status:200 duration:0.153s]
INFO:elastic_transport.transport:DELETE https://demo-defc18.es.us-central1.gcp.cloud.es.io:9243/llm_cache_test [status:200 duration:0.093s]
ic| f'{cache_index_name} exists, deleting.': 'llm_cache_test exists, deleting.'
INFO:elastic_transport.transport:HEAD https://demo-defc18.es.us-central1.gcp.cloud.es.io:9243/llm_filter_test [status:200 duration:0.033s]
INFO:elastic_transport.transport:DELETE https://demo-defc18.es.us-central1.gcp.cloud.es.io:9243/llm_filter_test [status:200 duration:0.092s]
ic| f'{filter_index_name} exists, deleting.': 'llm_filter_test exists, deleting.'
INFO:elastic_transport.transport:HEAD https://demo-defc18.es.us-central1.gcp.cloud.es.io:9243/llm_cache_test [status:404 duration:0.033s]
INFO:elastic_transport.transport:PUT https://demo-defc18.es.us-central1.gcp.cloud.es.io:9243/llm_cache_test [status:200 duration:0.150s]
ic| f

{'cache_index': 'llm_filter_test', 'created_new': True}

In [None]:
!eland_import_hub_model --url "$es_url" \
      --hub-model-id "sentence-transformers/msmarco-MiniLM-L-12-v3" \
      --task-type "text_embedding"

In [48]:
from elasticsearch.client import MlClient

def list_models(es):
    
    models = MlClient.get_trained_models(es)
    ids = [item['model_id'] for item in models['trained_model_configs']]
    return ids

list_models(es_client)

def is_model_valid(es, model_name = ''):
    return  model_name in list_models(es)

is_model_valid(es_client, 'lang_ident_model_11')


INFO:elastic_transport.transport:GET https://demo-defc18.es.us-central1.gcp.cloud.es.io:9243/_ml/trained_models [status:200 duration:0.519s]
INFO:elastic_transport.transport:GET https://demo-defc18.es.us-central1.gcp.cloud.es.io:9243/_ml/trained_models [status:200 duration:0.049s]


False

# Third Party fill-mask


# Question and Answer

# Text Embedding



In [62]:
from elasticsearch.client import MlClient

def text_embedding(text, es, model_id="sentence-transformers__all-distilroberta-v1"):

    if not is_model_valid(es, model_id):
        raise ValueError(f"{model_id} is not a valid model.")

    doc_test = {"text_field": text}

    result = MlClient.infer_trained_model(es, model_id=model_id, docs=doc_test)

    return result["inference_results"]

res = text_embedding(es=es, text="The markets rallied today on news of lower inflation")
res

INFO:elastic_transport.node_pool:Resurrected node <Urllib3HttpNode(https://demo-defc18.es.us-central1.gcp.cloud.es.io:9243)> (force=False)
INFO:elastic_transport.transport:GET https://demo-defc18.es.us-central1.gcp.cloud.es.io:9243/_ml/trained_models [status:200 duration:0.052s]
INFO:elastic_transport.transport:POST https://demo-defc18.es.us-central1.gcp.cloud.es.io:9243/_ml/trained_models/sentence-transformers__all-distilroberta-v1/_infer [status:200 duration:0.871s]


[{'predicted_value': [0.0013899048790335655,
   -0.017232872545719147,
   0.019243240356445312,
   0.05122537910938263,
   -0.023524880409240723,
   0.007755241356790066,
   -0.004405466839671135,
   -0.05961545184254646,
   0.016164101660251617,
   -0.0017298588063567877,
   -0.022026458755135536,
   0.060728203505277634,
   -0.04022039473056793,
   -0.07409512251615524,
   -0.04334405064582825,
   0.05003625154495239,
   -0.014044659212231636,
   0.041540276259183884,
   -0.011980040930211544,
   0.019256336614489555,
   0.004635554272681475,
   0.032987236976623535,
   -0.010751636698842049,
   -0.006894468795508146,
   0.011070616543293,
   -0.04124922677874565,
   -0.03224963694810867,
   0.012982227839529514,
   -0.012744796462357044,
   -0.038234807550907135,
   0.027189340442419052,
   -0.012296980246901512,
   0.008654959499835968,
   0.00820642989128828,
   -0.014097893610596657,
   0.01924149878323078,
   0.043992530554533005,
   0.03691142052412033,
   -0.02880031056702137,

# Text Classification

## 1. Financial Sentiment Analysis

You provide some text and get back "Positive", "Negative", or "Neutra" and a probability of a match.

In [57]:
!eland_import_hub_model --url "$es_url" \
      --hub-model-id "ProsusAI/finbert" \
      --task-type "text_classification" \
      --start

2023-12-01 16:05:01,103 INFO : Establishing connection to Elasticsearch
2023-12-01 16:05:01,465 INFO : Connected to cluster named '4dadf200942c4f3fb6113618e49a559c' (version: 8.11.0)
2023-12-01 16:05:01,467 INFO : Loading HuggingFace transformer tokenizer and model 'ProsusAI/finbert'
Downloading tokenizer_config.json: 100%|████████| 252/252 [00:00<00:00, 486kB/s]
Downloading config.json: 100%|█████████████████| 758/758 [00:00<00:00, 10.6MB/s]
Downloading vocab.txt: 100%|█████████████████| 232k/232k [00:00<00:00, 3.46MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████| 112/112 [00:00<00:00, 166kB/s]
Downloading pytorch_model.bin: 100%|█████████| 438M/438M [00:05<00:00, 81.1MB/s]
2023-12-01 16:05:12,923 INFO : Creating model with id 'prosusai__finbert'
2023-12-01 16:05:12,977 INFO : Uploading model definition
100%|█████████████████████████████████████| 418/418 [01:33<00:00,  4.47 parts/s]
2023-12-01 16:06:46,495 INFO : Uploading model vocabulary
2023-12-01 16:06:46,715 INFO : Starting

In [58]:
from elasticsearch import Elasticsearch
from elasticsearch.client import MlClient


def single_label_classify(text, es, model_id="prosusai__finbert"):

    if not is_model_valid(es, model_id):
        raise ValueError(f"{model_id} is not a valid model.")

    doc_test = {"text_field": text}

    result = MlClient.infer_trained_model(es, model_id=model_id, docs=doc_test)

    return result["inference_results"]

res = single_label_classify(es=es, text="The markets rallied today on news of lower inflation")
res

INFO:elastic_transport.transport:GET https://demo-defc18.es.us-central1.gcp.cloud.es.io:9243/_ml/trained_models [status:200 duration:0.412s]
INFO:elastic_transport.transport:POST https://demo-defc18.es.us-central1.gcp.cloud.es.io:9243/_ml/trained_models/prosusai__finbert/_infer [status:200 duration:0.800s]
ic| result: ObjectApiResponse({'inference_results': [{'predicted_value': 'positive', 'prediction_probability': 0.9203204906696552}]})


[{'predicted_value': 'positive', 'prediction_probability': 0.9203204906696552}]

## 2. Sentiment analysis
You provide some text and get back "Positive" or "Negative" and a probability of a match.

In [None]:
!eland_import_hub_model --url "$es_url" \
      --hub-model-id "distilbert-base-uncased-finetuned-sst-2-english" \
      --task-type "text_classification" \
      --start

In [None]:
from elasticsearch import Elasticsearch
from elasticsearch.client import MlClient


def single_label_classify(text, es, model_id="distilbert-base-uncased-finetuned-sst-2-english"):

    if not is_model_valid(es, model_id):
        raise ValueError(f"{model_id} is not a valid model.")

    doc_test = {"text_field": text}

    result = MlClient.infer_trained_model(es, model_id=model_id, docs=doc_test)

    return result["inference_results"]

res = single_label_classify(es=es, text="This totally, totally, totally sucks")
res

INFO:elastic_transport.transport:GET https://demo-defc18.es.us-central1.gcp.cloud.es.io:9243/_ml/trained_models [status:200 duration:0.050s]
INFO:elastic_transport.transport:POST https://demo-defc18.es.us-central1.gcp.cloud.es.io:9243/_ml/trained_models/distilbert-base-uncased-finetuned-sst-2-english/_infer [status:200 duration:0.037s]
ic| result: ObjectApiResponse({'inference_results': [{'predicted_value': 'NEGATIVE', 'prediction_probability': 0.9941348437530528}]})


[{'predicted_value': 'NEGATIVE', 'prediction_probability': 0.9941348437530528}]

## 3. Hate Speech Detection
You provide some text and get back "HATE" or "NON_HATE" and a probability of a match.

In [5]:
!eland_import_hub_model --url "$es_url" \
      --hub-model-id "Hate-speech-CNERG/dehatebert-mono-english" \
      --task-type "text_classification" \
      --start

2023-11-30 17:50:12,935 INFO : Establishing connection to Elasticsearch
2023-11-30 17:50:13,549 INFO : Connected to cluster named '4dadf200942c4f3fb6113618e49a559c' (version: 8.11.0)
2023-11-30 17:50:13,551 INFO : Loading HuggingFace transformer tokenizer and model 'Hate-speech-CNERG/dehatebert-mono-english'
2023-11-30 17:50:20,419 ERROR : Trained model with id 'hate-speech-cnerg__dehatebert-mono-english' already exists
2023-11-30 17:50:20,419 INFO : Run the script with the '--clear-previous' flag if you want to overwrite the existing model.


In [55]:
from elasticsearch import Elasticsearch
from elasticsearch.client import MlClient


def hate_speech_classify(text, es, model_id="hate-speech-cnerg__dehatebert-mono-english"):

    if not is_model_valid(es, model_id):
        raise ValueError(f"{model_id} is not a valid model.")

    #Run a query againt the model - this is the format the query imput must be used in, you can later map your features into this format through an ingest pipeline
    doc_test = {"text_field": text}

    result = MlClient.infer_trained_model(es, model_id =model_id, docs=doc_test)
    
    return result["inference_results"]

res = hate_speech_classify(es=es, text="All immigrants should be honored.")
res

INFO:elastic_transport.transport:GET https://demo-defc18.es.us-central1.gcp.cloud.es.io:9243/_ml/trained_models [status:200 duration:0.371s]
INFO:elastic_transport.transport:POST https://demo-defc18.es.us-central1.gcp.cloud.es.io:9243/_ml/trained_models/hate-speech-cnerg__dehatebert-mono-english/_infer [status:200 duration:0.036s]
ic| result: ObjectApiResponse({'inference_results': [{'predicted_value': 'NON_HATE', 'prediction_probability': 0.7693460967744596}]})


[{'predicted_value': 'NON_HATE', 'prediction_probability': 0.7693460967744596}]

# Text Similarity

# Zero-shot classification

You provide some text and some labels that could potentially describe the text.
This will return an array with all the labels that matched (class name) and the probability of the match.
Optionally, you can provide a threshold where values lower than it will be ignored.

In [None]:
!eland_import_hub_model --url "$es_url" \
      --hub-model-id "valhalla/distilbart-mnli-12-6" \
      --task-type "zero_shot_classification"

In [None]:
from elasticsearch import Elasticsearch
from elasticsearch.client import MlClient

def zero_shot_classify(text, labels, es, model_id = "valhalla__distilbart-mnli-12-6", threshold = 0.5):

  if not is_model_valid(es, model_id):
      raise ValueError(f"{model_id} is not a valid model.")


  doc_test = {"text_field": text}
  inference_config = {
      "zero_shot_classification": {
        "labels": labels,
        "multi_label": True
      }
    }

  result = MlClient.infer_trained_model(es, model_id =model_id, docs = doc_test, inference_config=inference_config)

  filtered_results = {}

  filtered_data = [item for item in result['inference_results'][0]['top_classes'] if item['class_probability'] >= threshold]

  return filtered_data

res = zero_shot_classify(es=es, text="Our city councilman is going to be at the event.", labels=["sports", "money", "family", "politics"])
res

In [12]:
res = filter(es=es, text="Tell me about the plant that blooms and has petals", labels=["sports", "money", "family", "flowers", "politics"], threshold=0.75)
res


TypeError: 'ElasticsearchLLMFilter' object is not callable