### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [1]:
# Download the model from Hugging Face (run these shell commands in a terminal first):
# git clone https://huggingface.co/jinaai/jina-embeddings-v2-base-en
# cd jina-embeddings-v2-base-en
# rm *.onnx
# wget https://huggingface.co/jinaai/jina-embeddings-v2-base-en/resolve/main/model-w-mean-pooling.onnx
# cd ..
# mkdir embedding_model_artifact
# mv jina-embeddings-v2-base-en/* embedding_model_artifact/

In [1]:
import tempfile
import ads
from ads.model.generic_model import GenericModel
from config import CONDA_PACK_PATH, LOG_GROUP_ID, EMBEDDING_MODEL_ACCESS_LOG_LOG_ID, EMBEDDING_MODEL_PREDICT_LOG_LOG_ID 
# Select "resource_principal" as the ADS authentication method, so no API
# keys or config-file credentials are passed in this notebook.
ads.set_auth("resource_principal")


In [2]:

# Wrap the contents of the `embedding_model_artifact` directory in a
# GenericModel. No in-memory estimator is supplied (`estimator=None`) and
# serialization is switched off (`serialize=False`).
# NOTE(review): `model_file_name` here is "model-w-mean-pooling.onnx" while the
# later prepare() call passes "model.onnx" -- confirm which file is intended.
embedding_model = GenericModel(
    estimator=None,
    artifact_dir="embedding_model_artifact",
    model_file_name="model-w-mean-pooling.onnx",
    serialize=False,
)

# Last expression of the cell: displays the per-step status table.
embedding_model.summary_status()



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Actions Needed
Step,Status,Details,Unnamed: 3_level_1
initiate,Done,Initiated the model,
prepare(),Available,Generated runtime.yaml,
prepare(),Available,Generated score.py,
prepare(),Available,Serialized model,
prepare(),Available,"Populated metadata(Custom, Taxonomy and Provenance)",
verify(),Not Available,Local tested .predict from score.py,
save(),Not Available,Conducted Introspect Test,
save(),Not Available,Uploaded artifact to model catalog,
deploy(),UNKNOWN,Deployed the model,
predict(),Not Available,Called deployment predict endpoint,


In [3]:
# Prepare the artifact directory for deployment. The conda pack path comes from
# config.py; the scoring script is taken from `embedding_model_score.py`.
# `force_overwrite=True` presumably replaces files left by a previous run --
# TODO confirm against the ADS documentation.
embedding_model.prepare(
    inference_conda_env=CONDA_PACK_PATH,
    inference_python_version="3.9",
    model_file_name="model.onnx",
    score_py_uri="embedding_model_score.py",
    force_overwrite=True,
)

INFO:ADS:To auto-extract taxonomy metadata the model must be provided. Supported models: keras, lightgbm, pytorch, sklearn, tensorflow, pyspark, and xgboost.


algorithm: null
artifact_dir:
  /home/datascience/embedding_model_artifact:
  - - config_sentence_transformers.json
    - sentence_bert_config.json
    - vocab.txt
    - README.md
    - model-w-mean-pooling.onnx
    - tokenizer.json
    - model.safetensors
    - runtime.yaml
    - test_json_output.json
    - score.py
    - .model-ignore
    - generation_config.json
    - tokenizer_config.json
    - config.json
    - pytorch_model.bin
    - modules.json
    - special_tokens_map.json
    - model.onnx
    - coreml
    - 1_Pooling
    - coreml/float32_model.mlpackage
    - coreml/float32_model.mlpackage/Manifest.json
    - coreml/float32_model.mlpackage/Data
    - coreml/float32_model.mlpackage/Data/com.apple.CoreML
    - coreml/float32_model.mlpackage/Data/com.apple.CoreML/model.mlmodel
    - coreml/float32_model.mlpackage/Data/com.apple.CoreML/weights
    - coreml/float32_model.mlpackage/Data/com.apple.CoreML/weights/weight.bin
    - 1_Pooling/config.json
framework: null
model_deployment

In [4]:
# Run a local verification with a single sample sentence before anything is
# uploaded, then show the embeddings returned for it.
op = embedding_model.verify(
    [
        'Sachin scored 300 in t20 world cup final 2024 match and Rohit scored 111 runs in the same match'
    ]
)
print(op['embeddings'])

Model is successfully loaded.
[[-0.3025473356246948, -0.2631644010543823, 0.33424049615859985, -0.23177050054073334, -0.08931080996990204, 0.48502910137176514, -0.07448308169841766, -0.40402457118034363, -0.5292817950248718, 0.8512760400772095, -0.9099698066711426, -0.3379335105419159, -0.47660312056541443, 0.4009893834590912, 0.03474418818950653, 0.9377697110176086, -0.3208032250404358, -0.2642645537853241, -0.128304123878479, -0.9947263598442078, -0.587630569934845, -0.019148023799061775, -0.26633620262145996, -0.014588565565645695, 0.22666318714618683, 0.514503002166748, 0.22751407325267792, 0.06005936115980148, 0.4696788787841797, 0.27475905418395996, 0.4665174186229706, 0.4529200792312622, -0.24493767321109772, 0.6228052377700806, -0.7501861453056335, -0.4651184380054474, 0.04736478626728058, -0.2261052131652832, 0.437692791223526, 0.3957967758178711, 0.0043040732853114605, 0.34069889783859253, -0.04769007861614227, 0.5361369848251343, -0.38854801654815674, -0.13371390104293823, 0

In [5]:
# Save the prepared artifact under the display name "jina-embedding".
# The call is the cell's last expression, so its return value is displayed.
embedding_model.save(display_name="jina-embedding")

Model is successfully loaded.
['config_sentence_transformers.json', 'sentence_bert_config.json', 'vocab.txt', 'README.md', 'model-w-mean-pooling.onnx', 'tokenizer.json', 'model.safetensors', 'runtime.yaml', 'test_json_output.json', 'score.py', '.model-ignore', 'generation_config.json', 'tokenizer_config.json', 'config.json', 'pytorch_model.bin', 'coreml', 'modules.json', '1_Pooling', 'special_tokens_map.json', 'model.onnx']


loop1:   0%|          | 0/4 [00:00<?, ?it/s]

'ocid1.datasciencemodel.oc1.<ocid>'

In [6]:
# Deploy the saved model. The log group and the access/predict log IDs are
# imported from config.py; the returned object is kept in `embedding_md` so the
# following cell can call predict() on it.
embedding_md = embedding_model.deploy(
    display_name="Jina Embedding Model Deployment",
    deployment_log_group_id=LOG_GROUP_ID,
    deployment_access_log_id=EMBEDDING_MODEL_ACCESS_LOG_LOG_ID,
    deployment_predict_log_id=EMBEDDING_MODEL_PREDICT_LOG_LOG_ID,
)

loop1:   0%|          | 0/6 [00:00<?, ?it/s]

In [12]:
# Send the same sample sentence used for the local verify() call to the
# deployment, so the two results can be compared.
output = embedding_md.predict(
    [
        'Sachin scored 300 in t20 world cup final 2024 match and Rohit scored 111 runs in the same match'
    ]
)




In [13]:
# Show the embeddings returned by the deployed endpoint.
print(output['embeddings'])

[[-0.3025473356246948, -0.2631644010543823, 0.33424049615859985, -0.23177050054073334, -0.08931080996990204, 0.48502910137176514, -0.07448308169841766, -0.40402457118034363, -0.5292817950248718, 0.8512760400772095, -0.9099698066711426, -0.3379335105419159, -0.47660312056541443, 0.4009893834590912, 0.03474418818950653, 0.9377697110176086, -0.3208032250404358, -0.2642645537853241, -0.128304123878479, -0.9947263598442078, -0.587630569934845, -0.019148023799061775, -0.26633620262145996, -0.014588565565645695, 0.22666318714618683, 0.514503002166748, 0.22751407325267792, 0.06005936115980148, 0.4696788787841797, 0.27475905418395996, 0.4665174186229706, 0.4529200792312622, -0.24493767321109772, 0.6228052377700806, -0.7501861453056335, -0.4651184380054474, 0.04736478626728058, -0.2261052131652832, 0.437692791223526, 0.3957967758178711, 0.0043040732853114605, 0.34069889783859253, -0.04769007861614227, 0.5361369848251343, -0.38854801654815674, -0.13371390104293823, 0.0659462958574295, 0.559716165