In [2]:
# Upgrade Oracle ADS to pick up latest features and maintain compatibility with Oracle Cloud Infrastructure.

!pip install -U oracle-ads
!pip install --upgrade oci



In [3]:
# 필요 라이브러리 Import

import ads
import logging
import os
import tempfile
import warnings
import json
import base64
import numpy as np
import pandas as pd
import nltk
nltk.download('omw-1.4')
import joblib

import ocifs
import oci
from ocifs import OCIFileSystem

from ads.catalog.model import ModelCatalog
from ads.common.model_metadata import UseCaseType
from ads.common.model_artifact import ModelArtifact
from ads.common.model_export_util import prepare_generic_model
from ads.model.framework.sklearn_model import SklearnModel

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

from nltk.stem import WordNetLemmatizer

# Using resource principal to authenticate when using the model catalog. By default, ADS 
# uses the config+key flow to authenticate against the model catalog. Remove this line 
# if you want to use the config+key authn method. 
ads.set_auth(auth='resource_principal') 

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/datascience/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


  from ads.common.model_metadata import UseCaseType

  from ads.common.model_export_util import prepare_generic_model



<font color=gray>Oracle Data Science service sample notebook.

Copyright (c) 2023 Oracle, Inc.  All rights reserved.
Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
</font>

***
# <font color=red>Sklearn으로 만들 모델 배포하기</font>
<p style="margin-left:10%; margin-right:10%;">by the <font color=teal> Oracle Cloud Infrastructure Data Science Service Team </font></p>

***

## Overview:

Accelerated Data Science(ADS)의 'SklearnModel' 클래스는 모델을 신속하게 생성하고 배포할 수 있도록 설계되었습니다. 
- `.prepare()` 메서드는 모델을 구성하거나 코드를 작성할 필요 없이 작동하는 모델을 배포하는 데 필요한 모델 아티팩트를 생성합니다. 
- `score.py` 파일은 필요에 따라 사용자 정의할 수 있습니다. 
- `.verify()` 메서드를 사용하여 배포된 모델에 대한 호출을 시뮬레이션합니다. 이 메서드는 `score.py` 파일에서 `load_model()` 및 `predict()` 함수를 호출합니다. `.verify()`를 사용하면 모델을 배포하지 않고도 `score.py` 파일을 디버그할 수 있습니다. 
- `.save()` 메서드는 `SklearnModel` 및 모델 아티팩트를 모델 카탈로그로 푸시합니다. 
- `.deploy()` 메서드는 모델을 REST 끝점에 배포합니다. 
- `.predict()` 메서드를 사용하면 엔드포인트를 호출하여 모델 추론을 수행할 수 있습니다.

---

<a id='intro'></a>
# Introduction

## Authenticate

OCI Data Science 서비스에 대한 인증이 필요합니다. 여기서는 기본적으로 리소스 주체를 사용합니다.

In [4]:
ads.set_auth(auth="resource_principal")

<a id="intro_dataset"></a>
## Create Dataset

In [5]:
# Bucket에 있는 크롤링 데이터 import
# 한국어 및 테스트 데이터 삭제 
# nan과 null 데이터 처리를 위해 desc열만 dataframe으로 변환 

crawled_parquet_df = pd.read_json("oci://crawled_data@apackrsct01/enhanced_livelabs.json")
df_final=crawled_parquet_df.drop([402,409,462,774,788])
df_final1 = pd.DataFrame({'document':df_final.desc})
df_final2 = df_final1.dropna()
df_final3 = df_final2[df_final2['document'].astype(bool)]
crawled_final = list(df_final3.document.values)

In [6]:
# CountVectorizer로 벡터화기 구현
# 최종 데이터 crwaled_final을 transform 후 LDA 알고리즘으로 Topic Modeling 생성

lemm = WordNetLemmatizer()

class LemmaCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(LemmaCountVectorizer, self).build_analyzer()
        return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc))
    
tf_vectorizer = LemmaCountVectorizer(max_df=0.95, 
                                     min_df=2,
                                     stop_words='english',
                                     decode_error='ignore')

tf = tf_vectorizer.fit_transform(crawled_final)

model = LatentDirichletAllocation(n_components=15, max_iter=10,
                                learning_method = 'online',
                                learning_offset = 50.,
                                random_state = 0)
model.fit(tf)

In [7]:
#Topic modeling data 결과

doc_topics=model.transform(tf)

topic_index=list(df_final3.index.values)
topic_names = ['Topic #'+str(i) for i in range(0,15)]
topic_df = pd.DataFrame(data=doc_topics, columns=topic_names,index=topic_index)

dominant_topic = np.argmax(topic_df.values, axis=1)
topic_df['dominant_topic'] = dominant_topic
topic_df.head(20)

Unnamed: 0,Topic #0,Topic #1,Topic #2,Topic #3,Topic #4,Topic #5,Topic #6,Topic #7,Topic #8,Topic #9,Topic #10,Topic #11,Topic #12,Topic #13,Topic #14,dominant_topic
0,0.001212,0.001212,0.440892,0.461027,0.001212,0.001212,0.001212,0.001212,0.083535,0.001212,0.001212,0.001212,0.001212,0.001212,0.001212,3
1,0.002299,0.002299,0.002299,0.195667,0.002299,0.002299,0.774447,0.002299,0.002299,0.002299,0.002299,0.002299,0.002299,0.002299,0.002299,6
2,0.002299,0.002299,0.349957,0.16879,0.002299,0.002299,0.002299,0.002299,0.453667,0.002299,0.002299,0.002299,0.002299,0.002299,0.002299,8
3,0.001058,0.001058,0.985185,0.001058,0.001058,0.001058,0.001058,0.001058,0.001058,0.001058,0.001058,0.001058,0.001058,0.001058,0.001058,2
4,0.003922,0.003922,0.003922,0.945098,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,3
5,0.001626,0.4529,0.001626,0.525962,0.001626,0.001626,0.001626,0.001626,0.001626,0.001626,0.001626,0.001626,0.001626,0.001626,0.001626,3
6,0.001587,0.001587,0.977778,0.001587,0.001587,0.001587,0.001587,0.001587,0.001587,0.001587,0.001587,0.001587,0.001587,0.001587,0.001587,2
7,0.002083,0.002083,0.002083,0.547624,0.002083,0.002083,0.002083,0.002083,0.425292,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,3
8,0.003509,0.003509,0.664176,0.237519,0.003509,0.003509,0.056199,0.003509,0.003509,0.003509,0.003509,0.003509,0.003509,0.003509,0.003509,2
9,0.002381,0.002381,0.002381,0.966667,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,3


In [8]:
X=topic_df.drop(['dominant_topic'],axis=1)
y=topic_df['dominant_topic']

In [9]:
#importing train_test_split
from sklearn.model_selection import train_test_split
trainx, trainy, testx, testy = train_test_split(X,y)

## 모델 생성

In [10]:
model = KMeans(
     init="random",
     n_clusters=15,
     n_init=10,
     max_iter=300,
     random_state=42
 )

model.fit(trainx)

## 모델 준비

데이터를 위한 임시 디렉토리 생성

In [11]:
artifact_dir = tempfile.mkdtemp()
sklearn_model = SklearnModel(estimator=model, artifact_dir=artifact_dir)
sklearn_model.prepare(
    inference_conda_env="generalml_p38_cpu_v1",
    training_conda_env="generalml_p38_cpu_v1",
    use_case_type=UseCaseType.CLUSTERING,
    X_sample=trainx,
    y_sample=trainy,
    force_overwrite=True,
)




algorithm: KMeans
artifact_dir:
  /tmp/tmpz2yk05_v:
  - - score.py
    - model.joblib
    - input_schema.json
    - output_schema.json
    - .model-ignore
    - runtime.yaml
framework: scikit-learn
model_deployment_id: null
model_id: null

In [12]:
sklearn_model.summary_status()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Actions Needed
Step,Status,Details,Unnamed: 3_level_1
initiate,Done,Initiated the model,
prepare(),Done,Generated runtime.yaml,
prepare(),Done,Generated score.py,
prepare(),Done,Serialized model,
prepare(),Done,"Populated metadata(Custom, Taxonomy and Provenance)",
verify(),Available,Local tested .predict from score.py,
save(),Available,Conducted Introspect Test,
save(),Available,Uploaded artifact to model catalog,
deploy(),UNKNOWN,Deployed the model,
predict(),Not Available,Called deployment predict endpoint,


## Verify

verify 메서드는 artifact_dir의 ``score.py`` 내부에 정의된 ``predict`` 함수를 호출합니다.

In [13]:
sklearn_model.verify(trainx[:10])

Start loading model.joblib from model directory /tmp/tmpz2yk05_v ...
Model is successfully loaded.


{'prediction': [2, 3, 1, 2, 6, 2, 2, 14, 3, 5]}

## Save

In [14]:
sklearn_model.save(display_name="livelab-clustering")

Start loading model.joblib from model directory /tmp/tmpz2yk05_v ...
Model is successfully loaded.
['score.py', 'model.joblib', 'input_schema.json', 'output_schema.json', '.model-ignore', 'runtime.yaml']


loop1:   0%|          | 0/4 [00:00<?, ?it/s]

'ocid1.datasciencemodel.oc1.ap-tokyo-1.amaaaaaavsea7yiazcjt45tzoqnkf25f3rtcjxayacb6txod6d6qzrx3hqgq'

In [15]:
sklearn_model.summary_status()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Actions Needed
Step,Status,Details,Unnamed: 3_level_1
initiate,Done,Initiated the model,
prepare(),Done,Generated runtime.yaml,
prepare(),Done,Generated score.py,
prepare(),Done,Serialized model,
prepare(),Done,"Populated metadata(Custom, Taxonomy and Provenance)",
verify(),Done,Local tested .predict from score.py,
save(),Done,Conducted Introspect Test,
save(),Done,Uploaded artifact to model catalog,
deploy(),UNKNOWN,Deployed the model,
predict(),Not Available,Called deployment predict endpoint,


## Deploy

모델이 모델 카탈로그에 있으면 모델의 `.deploy()` 메서드를 사용하여 배포할 수 있습니다. 
이 방법을 사용하면 표시 이름, 설명, 인스턴스 유형 및 개수, 최대 대역폭, 로깅 그룹과 같은 배포 속성을 지정할 수 있습니다. 
다음 셀은 사용자 지정 표시 이름을 제외한 기본 설정으로 모델을 배포합니다. 
`.deploy()` 메서드는 `ModelDeployment` 개체를 반환합니다.

In [16]:
deploy = sklearn_model.deploy(
    display_name="KMeans Clustering for Crawled Livelabs",
)

loop1:   0%|          | 0/6 [00:00<?, ?it/s]

In [17]:
sklearn_model.summary_status()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Actions Needed
Step,Status,Details,Unnamed: 3_level_1
initiate,Done,Initiated the model,
prepare(),Done,Generated runtime.yaml,
prepare(),Done,Generated score.py,
prepare(),Done,Serialized model,
prepare(),Done,"Populated metadata(Custom, Taxonomy and Provenance)",
verify(),Done,Local tested .predict from score.py,
save(),Done,Conducted Introspect Test,
save(),Done,Uploaded artifact to model catalog,
deploy(),ACTIVE,Deployed the model,
predict(),Available,Called deployment predict endpoint,


In [18]:
print(f"Endpoint: {sklearn_model.model_deployment.url}")

Endpoint: https://modeldeployment.ap-tokyo-1.oci.customer-oci.com/ocid1.datasciencemodeldeployment.oc1.ap-tokyo-1.amaaaaaavsea7yia7ndqy54iwzfejqtdar6fvg3dtmwashtfxqf73drropva


## Predict

배포가 활성화되면 모델 객체에서 `predict()`를 호출하여 배포된 엔드포인트에 요청을 보낼 수 있습니다.

In [32]:
sklearn_model.predict(testx)

'{"code": "InternalServerError", "message": "Expected 2D array, got 1D array instead:\\narray=[ 2.  2.  3.  2.  3.  2.  2.  2.  2.  8.  3.  5.  2.  8.  7.  2.  2.  8.\\n  2.  2.  3.  8.  3.  2.  3.  2.  3.  3.  2.  2.  3.  3.  3.  2.  2.  3.\\n  3.  3.  3.  2.  2.  3.  8.  3.  2.  2.  3.  2.  2.  8.  0.  3.  2.  5.\\n  3.  2.  8.  3.  2.  2.  2.  2.  2.  3.  8.  2.  8.  3.  8.  2.  2.  2.\\n  8.  2.  3.  2. 11.  2.  3.  3.  2.  4.  3.  3.  8.  8.  2.  3.  3.  5.\\n  8.  6.  8.  3.  2.  3.  2.  2.  3.  3.  2.  2.  5.  2.  2.  6.  5.  2.\\n  2.  2.  5.  2.  3.  3.  6.  3.  3.  3.  8.  2.  2.  2.  2.  3.  2.  2.\\n  2.  2.  2.  0.  2.  2.  2.  2.  3.  2.  3.  3.  2.  8.  3.  3.  2.  3.\\n  3.  2.  3.  2.  2.  2.  2.  2.  2.  2.  3.  3.  3.  2.  2.  2.  2.  2.\\n  3.  3.  2.  3.  3.  2.  8.  3.  2.  6.  2.  2.  2.  8.  3.  2.  2.  5.\\n  5.  3.  3.  3.  2.  3.  2.  2.  2.  8.  3.  3.  2.  2.  3.  3.  2.  2.\\n  3.  3.  2.  3.  3.  2.  2.  8.  2.  2.  2.  2.  8.  3.  3.  2.  2.  3.\\n  3.  

<a id='clean_up'></a>
# Clean Up

이 노트북은 모델 배포와 모델을 만들었습니다. 이 섹션에서는 해당 리소스를 정리합니다.

모델을 삭제하려면 먼저 모델 배포를 삭제해야 합니다. 이를 수행하려면 `SklearnModel` 개체에서 `.delete_deployment()` 메서드를 사용합니다.

In [67]:
delete = sklearn_model.delete_deployment(wait_for_completion=True)

loop1:   0%|          | 0/2 [00:00<?, ?it/s]

모델 배포가 삭제된 후 `.summary_status()` 메서드는 모델이 삭제되었고 `predict()` 메서드를 사용할 수 없음을 보여줍니다.

In [68]:
sklearn_model.summary_status()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Actions Needed
Step,Status,Details,Unnamed: 3_level_1
initiate,Done,Initiated the model,
prepare(),Done,Generated runtime.yaml,
prepare(),Done,Generated score.py,
prepare(),Done,Serialized model,
prepare(),Done,"Populated metadata(Custom, Taxonomy and Provenance)",
verify(),Done,Local tested .predict from score.py,
save(),Done,Conducted Introspect Test,
save(),Done,Uploaded artifact to model catalog,
deploy(),DELETED,Deployed the model,
predict(),Not Available,Called deployment predict endpoint,


`.delete()` 메서드를 사용하여 모델을 삭제합니다.

In [69]:
sklearn_model.delete()

다음 셀은 로컬 드라이브에 저장된 모델 아티팩트를 제거합니다.

In [70]:
rmtree(artifact_dir)

<a id='ref'></a>
# References
- [ADS Library Documentation](https://accelerated-data-science.readthedocs.io/en/latest/index.html)
- [Data Science YouTube Videos](https://www.youtube.com/playlist?list=PLKCk3OyNwIzv6CWMhvqSB_8MLJIZdO80L)
- [OCI Data Science Documentation](https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm)
- [Oracle Data & AI Blog](https://blogs.oracle.com/datascience/)
- [Understanding Conda Environments](https://docs.cloud.oracle.com/en-us/iaas/data-science/using/use-notebook-sessions.htm#conda_understand_environments)
- [Use Resource Manager to Configure Your Tenancy for Data Science](https://docs.cloud.oracle.com/en-us/iaas/data-science/using/orm-configure-tenancy.htm)
- [`runtime.yaml`](https://docs.content.oci.oracleiaas.com/en-us/iaas/data-science/using/model_runtime_yaml.htm#model_runtime_yaml)
- [`score.py`](https://docs.content.oci.oracleiaas.com/en-us/iaas/data-science/using/model_score_py.htm#model_score_py)
- [Model artifact](https://docs.content.oci.oracleiaas.com/en-us/iaas/data-science/using/models_saving_catalog.htm#create-models)
- [ONNX API Summary](http://onnx.ai/sklearn-onnx/api_summary.html)