In [1]:
# Upgrade Oracle ADS to pick up latest features and maintain compatibility with Oracle Cloud Infrastructure.

!pip install -U oracle-ads
!pip install --upgrade oci



In [2]:
# Import the required libraries

import ads
import logging
import os
import tempfile
import warnings
import json
import base64
import numpy as np
import pandas as pd
import nltk
# WordNetLemmatizer reads the 'wordnet' corpus (and 'omw-1.4' on recent NLTK releases).
# Download both so the notebook also works in an environment where 'wordnet' has not
# been installed yet; nltk.download is a no-op when the package is already up to date.
nltk.download('wordnet')
nltk.download('omw-1.4')
import joblib

import ocifs
import oci
from ocifs import OCIFileSystem

from ads.catalog.model import ModelCatalog
from ads.common.model_metadata import UseCaseType
from ads.common.model_artifact import ModelArtifact
from ads.common.model_export_util import prepare_generic_model
from ads.model.framework.sklearn_model import SklearnModel

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/datascience/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


  from ads.common.model_metadata import UseCaseType

  from ads.common.model_export_util import prepare_generic_model



In [3]:
# Configure ADS to authenticate with the "resource_principal" auth mode for all
# subsequent ADS calls in this notebook.
ads.set_auth(auth="resource_principal")

<a id="intro_dataset"></a>
## Create Dataset

In [4]:
# Load the crawled data from Autonomous Database (ADB).
# Connection settings are read from environment variables so that no secret is stored
# in the notebook: set ADB_USER_NAME, ADB_PASSWORD, ADB_SERVICE_NAME and
# ADB_WALLET_LOCATION for your own ADB. If ADB_PASSWORD is not set you are prompted
# for it. (A password that was ever committed in a notebook should be rotated.)
# Korean-language and test rows are then removed, and only the DESCRIPTION column is
# kept (as `document`) so that NaN/null and empty descriptions can be filtered out.
from getpass import getpass

connection_parameters = {
    "user_name": os.environ.get("ADB_USER_NAME", "admin"),
    "password": os.environ.get("ADB_PASSWORD") or getpass("ADB password: "),
    "service_name": os.environ.get("ADB_SERVICE_NAME", "o0mqv2sevcx6gmh9_low"),
    "wallet_location": os.environ.get("ADB_WALLET_LOCATION", "./Wallet_O0MQV2SEVCX6GMH9.zip"),
}

crawled_parquet_df = pd.DataFrame.ads.read_sql(
    """
    SELECT
    *
    FROM
    LIVELABS
    """,
    connection_parameters=connection_parameters,
)

# Row labels of the Korean-language and test records to exclude from modeling.
ROWS_TO_DROP = [402, 409, 462, 774, 788]

df_final = crawled_parquet_df.drop(ROWS_TO_DROP)
# Keep only the description text, under the column name `document`.
df_final1 = pd.DataFrame({'document': df_final.DESCRIPTION})
# Remove missing descriptions ...
df_final2 = df_final1.dropna()
# ... and descriptions that are empty strings (falsy values).
df_final3 = df_final2[df_final2['document'].astype(bool)]
# Plain list of documents fed to the vectorizer; df_final3 keeps the matching row labels.
crawled_final = list(df_final3.document.values)

In [5]:
# Build the vectorizer on top of CountVectorizer.
# The final data `crawled_final` is transformed with it and then used to fit an LDA
# topic model.

lemm = WordNetLemmatizer()


class LemmaCountVectorizer(CountVectorizer):
    """CountVectorizer variant whose analyzer lemmatizes every token it produces."""

    def build_analyzer(self):
        """Return a callable that maps a document to a lazy stream of lemmatized tokens.

        The tokens come from the analyzer built by the parent class; each one is
        passed through the module-level WordNet lemmatizer `lemm`.
        """
        base_analyzer = super().build_analyzer()

        def lemmatized_tokens(doc):
            return (lemm.lemmatize(token) for token in base_analyzer(doc))

        return lemmatized_tokens
    
# Term-count vectorizer: English stop words removed, undecodable bytes ignored,
# terms limited by document frequency (min_df=2, max_df=0.95).
tf_vectorizer = LemmaCountVectorizer(
    stop_words='english',
    decode_error='ignore',
    min_df=2,
    max_df=0.95,
)

# Document-term count matrix for the cleaned documents.
tf = tf_vectorizer.fit_transform(crawled_final)

# LDA topic model with 15 topics, fitted on the count matrix.
lda = LatentDirichletAllocation(
    n_components=15,
    max_iter=10,
    learning_method='online',
    learning_offset=10.,
    random_state=0,
)
lda.fit(tf)

In [6]:
# Topic-modeling result: one row per document with its topic distribution and the
# index of its dominant (highest-probability) topic.

doc_topics = lda.transform(tf)

# Reuse the row labels of the cleaned documents so rows can be traced back to the source data.
topic_index = list(df_final3.index.values)
# Derive the column names from the number of topics actually returned by the model,
# so this cell stays correct if the LDA n_components setting is changed.
topic_names = ['Topic #' + str(i) for i in range(doc_topics.shape[1])]
topic_df = pd.DataFrame(data=doc_topics, columns=topic_names, index=topic_index)

# Column position of the largest probability in each row.
dominant_topic = np.argmax(doc_topics, axis=1)
topic_df['dominant_topic'] = dominant_topic
topic_df.head(20)

Unnamed: 0,Topic #0,Topic #1,Topic #2,Topic #3,Topic #4,Topic #5,Topic #6,Topic #7,Topic #8,Topic #9,Topic #10,Topic #11,Topic #12,Topic #13,Topic #14,dominant_topic
0,0.001212,0.001212,0.342298,0.001212,0.066917,0.001212,0.001212,0.001212,0.57624,0.001212,0.001212,0.001212,0.001212,0.001212,0.001212,8
1,0.002299,0.002299,0.002299,0.002299,0.002299,0.002299,0.967816,0.002299,0.002299,0.002299,0.002299,0.002299,0.002299,0.002299,0.002299,6
2,0.002299,0.002299,0.368516,0.002299,0.002299,0.002299,0.002299,0.002299,0.002299,0.002299,0.002299,0.002299,0.002299,0.601599,0.002299,13
3,0.001058,0.001058,0.431515,0.001058,0.001058,0.001058,0.001058,0.001058,0.001058,0.001058,0.001058,0.554728,0.001058,0.001058,0.001058,11
4,0.003922,0.003922,0.003922,0.945098,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,0.003922,3
5,0.001626,0.855744,0.001626,0.001626,0.001626,0.001626,0.001626,0.001626,0.123117,0.001626,0.001626,0.001626,0.001626,0.001626,0.001626,1
6,0.001587,0.001587,0.92459,0.001587,0.001587,0.001587,0.001587,0.054775,0.001587,0.001587,0.001587,0.001587,0.001587,0.001587,0.001587,2
7,0.002083,0.002083,0.002083,0.363517,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.6094,0.002083,13
8,0.003509,0.003509,0.614266,0.003509,0.34012,0.003509,0.003509,0.003509,0.003509,0.003509,0.003509,0.003509,0.003509,0.003509,0.003509,2
9,0.002381,0.002381,0.002381,0.27632,0.002381,0.002381,0.002381,0.002381,0.692728,0.002381,0.002381,0.002381,0.002381,0.002381,0.002381,8


In [7]:
# y: the dominant-topic column; X: every remaining column of topic_df.
y = topic_df['dominant_topic']
X = topic_df.drop(columns=['dominant_topic'])

In [8]:
# Split features and labels into train and test partitions.
# train_test_split returns (X_train, X_test, y_train, y_test) in that order, so the
# names are unpacked accordingly: trainx/testx hold features, trainy/testy hold labels.
# A fixed random_state keeps the split reproducible across kernel restarts.
# (train_test_split is already imported in the import cell at the top of the notebook.)
trainx, testx, trainy, testy = train_test_split(X, y, random_state=42)

## 모델 생성

In [9]:
# K-means with 15 clusters, random initialization, 10 restarts and a fixed seed.
model = KMeans(
    n_clusters=15,
    init="random",
    n_init=10,
    max_iter=300,
    random_state=42,
)

# Fit on the training features; the fitted estimator is displayed as the cell output.
model.fit(trainx)

## 모델 준비

In [10]:
# Wrap the fitted estimator in an ADS SklearnModel whose artifact files are written
# to ./artifact (a throwaway directory from tempfile.mkdtemp() is an alternative).
artifact_dir = './artifact'
sklearn_model = SklearnModel(artifact_dir=artifact_dir, estimator=model)

# Prepare the model artifact; the same conda environment is named for training and inference.
sklearn_model.prepare(
    X_sample=trainx,
    y_sample=trainy,
    use_case_type=UseCaseType.CLUSTERING,
    training_conda_env="generalml_p38_cpu_v1",
    inference_conda_env="generalml_p38_cpu_v1",
    force_overwrite=True,
)




algorithm: KMeans
artifact_dir:
  /home/datascience/demo/artifact:
  - - score.py
    - .model-ignore
    - model.joblib
    - input_schema.json
    - output_schema.json
    - runtime.yaml
framework: scikit-learn
model_deployment_id: null
model_id: null

In [11]:
# Show a table of the workflow steps (prepare, verify, save, deploy, predict) and
# their current status for this model.
sklearn_model.summary_status()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Actions Needed
Step,Status,Details,Unnamed: 3_level_1
initiate,Done,Initiated the model,
prepare(),Done,Generated runtime.yaml,
prepare(),Done,Generated score.py,
prepare(),Done,Serialized model,
prepare(),Done,"Populated metadata(Custom, Taxonomy and Provenance)",
verify(),Available,Local tested .predict from score.py,
save(),Available,Conducted Introspect Test,
save(),Available,Uploaded artifact to model catalog,
deploy(),UNKNOWN,Deployed the model,
predict(),Not Available,Called deployment predict endpoint,


## Verify

In [12]:
# Run a local prediction on the first 10 training rows to check the generated artifact.
sklearn_model.verify(trainx.head(10))

Start loading model.joblib from model directory /home/datascience/demo/artifact ...
Model is successfully loaded.


{'prediction': [6, 0, 7, 0, 5, 8, 7, 13, 8, 12]}

## Save

In [13]:
# Save the model under the given display name; the call returns the model OCID.
# NOTE(review): ignore_introspection=True presumably skips the artifact introspection
# checks before upload — confirm this is intended for a production model.
sklearn_model.save(display_name="model-minjikim-v1", ignore_introspection=True)

Start loading model.joblib from model directory /home/datascience/demo/artifact ...
Model is successfully loaded.


loop1:   0%|          | 0/4 [00:00<?, ?it/s]

'ocid1.datasciencemodel.oc1.ap-tokyo-1.amaaaaaavsea7yiailuahde3amudhp6ou32g7bim34mak4jlop4ha5yxhtva'

## Deploy

In [14]:
# Deploy the saved model under the given display name and keep the returned object.
deploy = sklearn_model.deploy(display_name="deployment-minjikim-v1")

loop1:   0%|          | 0/6 [00:00<?, ?it/s]

In [15]:
# Print the base URL of the model deployment (the predict cells below append "/predict").
print(f"Endpoint: {sklearn_model.model_deployment.url}")

Endpoint: https://modeldeployment.ap-tokyo-1.oci.customer-oci.com/ocid1.datasciencemodeldeployment.oc1.ap-tokyo-1.amaaaaaavsea7yiamsoc5qsvdfpwjlgakgj4jgrdcam6iwbxgsfugkrkyajq


# Invoke & Predict 

In [21]:
# HTTP client used to call the deployed model endpoint.
import requests
# NOTE(review): `oci` is already imported in the import cell at the top, and `Signer`
# is not used in the visible cells — consider moving these imports to the top cell.
import oci
from oci.signer import Signer

# Resource-principal signer; passed as `auth=` to requests when invoking the endpoint.
auth = oci.auth.signers.get_resource_principals_signer()

In [22]:
# Build the prediction endpoint from the deployment created in this notebook instead of
# pasting a URL by hand, so the cell keeps working after a fresh deployment.
# To call a different deployment, replace the value with
# "<your-model-deployment-uri>/predict".
uri = f"{sklearn_model.model_deployment.url.rstrip('/')}/predict"
print(uri)

https://modeldeployment.ap-tokyo-1.oci.customer-oci.com/ocid1.datasciencemodeldeployment.oc1.ap-tokyo-1.amaaaaaavsea7yiamsoc5qsvdfpwjlgakgj4jgrdcam6iwbxgsfugkrkyajq/predict


In [23]:
# Example query text to classify.
sample = ["I want to learn MACHINE LEARNING on HEATWAVE"]
# Turn the text into term counts with the vectorizer fitted on the crawled documents.
sample_vec = tf_vectorizer.transform(sample)
# Topic distribution of the sample according to the fitted LDA model.
topic_probability_scores = lda.transform(sample_vec)
# Serialize the topic distribution as a JSON string for the request payload.
# NOTE(review): this is already a JSON string and is later sent with `json=`, so it is
# JSON-encoded a second time — presumably the deployed score.py expects that; verify.
input_data = pd.DataFrame(topic_probability_scores).to_json()

In [24]:
%%time
response = requests.post(uri, json=input_data, auth=auth)

CPU times: user 15.5 ms, sys: 3 ms, total: 18.5 ms
Wall time: 136 ms


In [25]:
# The endpoint returns {'prediction': [...]} with one cluster id per input row; a single
# sample was sent, so take its cluster id explicitly instead of comparing the label
# array against a list and relying on broadcasting.
predicted_clusters = json.loads(response.content)['prediction']
predicted_cluster = predicted_clusters[0]

# Training rows that K-means assigned to the same cluster as the sample.
data_points_in_cluster = trainx[model.labels_ == predicted_cluster]
indices = data_points_in_cluster.index

# Print the original crawled record for every member of that cluster.
for index in indices:
    print(crawled_parquet_df.loc[index])
    print()

ID                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      