In [1]:
import os

In [2]:
%pwd

'/home/fachruzaini/lazada-id-reviews/notebooks'

In [3]:
# Change to the project's main directory so that relative paths used below
# (.env, config/, artifacts/) resolve from there.
# Guarded on the current folder name so that re-running this cell does not
# climb one more level each time it is executed.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

In [4]:
# Load KEY=VALUE pairs from the project's .env file into the process environment.
with open('.env') as f:
    for raw_line in f:
        entry = raw_line.strip()

        # Blank lines, comments and lines without '=' are not assignments; skip them
        # instead of failing the whole cell.
        if not entry or entry.startswith('#') or '=' not in entry:
            continue

        # Split on the first '=' only, so values that contain '=' themselves
        # (e.g. URIs with query strings, base64 tokens) stay intact.
        key, value = entry.split('=', 1)
        os.environ[key.strip()] = value.strip()

In [5]:
%pwd

'/home/fachruzaini/lazada-id-reviews'

### Predict Config

This code will be applied in `src/LazadaIDReviews/entity/config_entity.py`.

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class PredictionConfig:
    """Immutable settings for the prediction step.

    Attributes:
        root_dir: working directory of the prediction step.
        mlflow_tracking_uri: URI of the MLflow tracking server.
        mlflow_model_name: name of the registered model in MLflow.
        mlflow_deploy_model_alias: alias of the registered model version to serve
            (an alias name, hence ``str`` and not a filesystem path).
        mlflow_vectorizer_model_path: path of the vectorizer artifact.
        input_test_path: path of the dumped test inputs (development/debug only).
        output_test_path: path of the dumped test targets (development/debug only).
    """
    root_dir: Path
    mlflow_tracking_uri: str
    mlflow_model_name: str
    mlflow_deploy_model_alias: str
    mlflow_vectorizer_model_path: Path
    
    # for development (debug)
    input_test_path: Path
    output_test_path: Path

### Predict Config Manager

This code will be applied in `src/LazadaIDReviews/config/configurations.py`.

In [7]:
from LazadaIDReviews.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from LazadaIDReviews.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    """Reads the project's YAML config and params files and builds config entities.

    The artifacts root directory named in the config file is created on construction.
    """

    def __init__(self, 
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        # Parse both YAML files once; the getter below reads from these objects.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_prediction_config(self) -> PredictionConfig:
        """Assemble the prediction settings into a ``PredictionConfig``.

        Values come from the ``predict`` and ``dump_data`` sections of the config
        file and from the ``MLFLOW_TRACKING_URI`` and ``MLFLOW_DEPLOY_MODEL_ALIAS``
        environment variables. The prediction ``root_dir`` is created before the
        entity is built.

        Returns:
            PredictionConfig: the populated config entity

        Raises:
            KeyError: if one of the two environment variables is not set
        """
        predict_section = self.config.predict

        # for development (debug)
        debug_section = self.config.dump_data

        create_directories([predict_section.root_dir])

        # Collected in the same order as the dataclass fields.
        fields = {
            "root_dir": predict_section.root_dir,
            "mlflow_tracking_uri": os.environ["MLFLOW_TRACKING_URI"],
            "mlflow_model_name": predict_section.mlflow_model_name,
            "mlflow_deploy_model_alias": os.environ["MLFLOW_DEPLOY_MODEL_ALIAS"],
            "mlflow_vectorizer_model_path": predict_section.mlflow_vectorizer_model_path,
            # for development (debug)
            "input_test_path": debug_section.input_test_path,
            "output_test_path": debug_section.output_test_path,
        }

        return PredictionConfig(**fields)

In [9]:
from mlflow.artifacts import download_artifacts
from mlflow import MlflowClient
from mlflow import pyfunc

import joblib

  import pkg_resources  # noqa: TID251


---

**Debug**: Step-by-step walkthrough of making a prediction in the notebook with MLflow.

In [10]:
config = ConfigurationManager()
predict_config = config.get_prediction_config()

[2025-07-02 15:58:34,234: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-07-02 15:58:34,239: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2025-07-02 15:58:34,242: INFO: common: created directory at: artifacts]
[2025-07-02 15:58:34,257: INFO: common: created directory at: artifacts/predict]


Select the deployed model from MLflow.

In [11]:
client = MlflowClient(tracking_uri=predict_config.mlflow_tracking_uri)
selected_model = client.get_model_version_by_alias(
    predict_config.mlflow_model_name, 
    predict_config.mlflow_deploy_model_alias
)

selected_model.source

'mlflow-artifacts:/1/c9d5e14ac4384fc7a3fba18602467c69/artifacts/models'

In [12]:
loaded_model = pyfunc.load_model(model_uri=selected_model.source)
loaded_model

Downloading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

mlflow.pyfunc.loaded_model:
  artifact_path: models
  flavor: mlflow.sklearn
  run_id: c9d5e14ac4384fc7a3fba18602467c69

Get the model `run_id`.

In [13]:
selected_run_id = selected_model.run_id
selected_run_id

'c9d5e14ac4384fc7a3fba18602467c69'

Download the vectorizer (one of the MLflow artifacts) from MLflow.

In [14]:
download_artifacts(
    run_id=selected_run_id,
    artifact_path=predict_config.mlflow_vectorizer_model_path,
    dst_path=predict_config.root_dir
)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

'/home/fachruzaini/lazada-id-reviews/artifacts/predict/vectorizer/vectorizer.pkl'

Load the downloaded vectorizer.

In [15]:
root_dir = predict_config.root_dir
mlflow_vectorizer_model_path = predict_config.mlflow_vectorizer_model_path
vectorizer_model_path = f"{root_dir}/{mlflow_vectorizer_model_path}"
vectorizer = joblib.load(vectorizer_model_path)
vectorizer

Predict and evaluate, using the test data as input.

In [16]:
X_test = joblib.load(predict_config.input_test_path)
y_test = joblib.load(predict_config.output_test_path)

In [17]:
X_test.head()

164205    barang sudah di terima dengan baik, dah di cob...
159779                        sangat puas trimakasih lazada
73156     D pasang ke SS J2 nggak bisa. Apa kartuny yg r...
55923     rekomendet bgt lahh barang bagus cpt bgt sampe...
72584     Film ato lagu2 saat d putar dri plashdisk erro...
Name: reviewContent, dtype: object

In [18]:
X_test.shape

(85624,)

Prepare the request body, in the same shape as the body of the HTTP input request.

In [19]:
request_body = {
    "reviewContent": X_test.to_list()
}

In [20]:
request_body['reviewContent'][:10]

['barang sudah di terima dengan baik, dah di coba Oke.terima kasih Ladaza',
 'sangat puas trimakasih lazada',
 'D pasang ke SS J2 nggak bisa. Apa kartuny yg rusak?',
 'rekomendet bgt lahh barang bagus cpt bgt sampe y padahal tmpat aq desa plosok bgt tpi cuman 2hri dah sampe',
 'Film ato lagu2 saat d putar dri plashdisk error..smuanya filmku error gx bsa d putar😭😭😭',
 'bagus',
 'Toshiba 1 TB Hitam dengan banyak GRATIS Usb TOSHIBA 32GB + Pouch Harddisk & Usb OTG Reader Android NICE !!!!',
 'Produk original dan awet',
 'Barang bagus mulus wlau hnya paking buble warp, kulitas lumayan lh dgn harga sgitu.. Rekomended bgt ni brg..',
 'mantabb, barang bagus sesuai pesanan dan datangnya cepat 👍👍']

Vectorize the test data used as input.

In [21]:
X_test_vec = vectorizer.transform(request_body['reviewContent'])
X_test_vec

<85624x13532 sparse matrix of type '<class 'numpy.float64'>'
	with 982914 stored elements in Compressed Sparse Row format>

Make prediction.

In [22]:
y_predict = loaded_model.predict(X_test_vec).tolist()

In [23]:
len(y_predict)

85624

In [24]:
y_predict[:10]

[5, 5, 1, 5, 2, 5, 1, 5, 5, 5]

In [25]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           1       0.59      0.74      0.65      5539
           2       0.27      0.50      0.35      1711
           3       0.27      0.48      0.35      3529
           4       0.27      0.47      0.34      8492
           5       0.92      0.75      0.83     66353

    accuracy                           0.71     85624
   macro avg       0.46      0.59      0.50     85624
weighted avg       0.79      0.71      0.74     85624



---

### Make Prediction

This code goes in `src/LazadaIDReviews/components/predict.py`.

In [26]:
from LazadaIDReviews import logger

class Predict:
    """Runs predictions with the model version registered under the deploy alias in MLflow."""

    def __init__(self, config: PredictionConfig):
        self.config = config

    def run(self, data: list) -> list:
        """Vectorize ``data`` and predict it with the deployed MLflow model.

        Stages, each logged and each re-raising its error unchanged:
          1. resolve the model version behind the configured alias,
          2. download that run's vectorizer artifact into ``root_dir``,
          3. load the vectorizer and transform ``data``,
          4. load the model as a pyfunc and predict.

        Args:
            data (list): input data to predict; passed as-is to the
                vectorizer's ``transform``

        Raises:
            Exception: whatever the failing stage raised (MLflow client access,
                artifact download, vectorizer load/transform, or model
                load/predict); the error is logged before being re-raised
        
        Returns:
            y_predict: list type, the model output converted with ``.tolist()``
        """
        try:
            logger.info("Set MLflow Client.")
            client = MlflowClient(tracking_uri=self.config.mlflow_tracking_uri)
            
            logger.info("Select the deployed model from MLflow.")
            selected_model = client.get_model_version_by_alias(
                self.config.mlflow_model_name, 
                self.config.mlflow_deploy_model_alias
            )
            
            logger.info("Get the deployed model run id.")
            selected_run_id = selected_model.run_id
        except Exception as client_error:
            logger.error(client_error)
            # Bare raise re-raises the active exception as-is.
            raise
        
        # Local location of the vectorizer once downloaded: the artifact path
        # is recreated underneath root_dir.
        root_dir = self.config.root_dir
        mlflow_vectorizer_model_path = self.config.mlflow_vectorizer_model_path
        vectorizer_model_path = Path(root_dir) / mlflow_vectorizer_model_path
        
        try:
            logger.info("Downloading vectorizer from MLflow's artifacts.")
            download_artifacts(
                run_id=selected_run_id,
                artifact_path=mlflow_vectorizer_model_path,
                dst_path=root_dir
            )
        except Exception as download_error:
            logger.error(download_error)
            raise
        
        try:
            logger.info("Load the vectorizer model.")
            # NOTE(review): joblib.load unpickles the file; only safe while the
            # MLflow artifact store is trusted.
            vectorizer = joblib.load(vectorizer_model_path)
            
            logger.info("Transform the data.")
            X_test_vec = vectorizer.transform(data)
        except Exception as load_error:
            logger.error(load_error)
            raise
        
        try:
            logger.info("Predict the data.")
            loaded_model = pyfunc.load_model(model_uri=selected_model.source)
            y_predict = loaded_model.predict(X_test_vec).tolist()
        except Exception as predict_error:
            # Same log-and-re-raise handling as the stages above, so a failure
            # while loading the model or predicting is not the only unlogged one.
            logger.error(predict_error)
            raise
        
        return y_predict

### Predict the Data

**Debug**: test the `Predict` object and its `run` method.

In [27]:
config = ConfigurationManager()
predict_config = config.get_prediction_config()

[2025-07-02 15:58:38,858: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-07-02 15:58:38,864: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2025-07-02 15:58:38,868: INFO: common: created directory at: artifacts]
[2025-07-02 15:58:38,869: INFO: common: created directory at: artifacts/predict]


In [28]:
X_test = joblib.load(predict_config.input_test_path)

In [29]:
request_body = {
    "reviewContents": X_test.to_list()
}

In [30]:
data = request_body["reviewContents"]
data[:10]

['barang sudah di terima dengan baik, dah di coba Oke.terima kasih Ladaza',
 'sangat puas trimakasih lazada',
 'D pasang ke SS J2 nggak bisa. Apa kartuny yg rusak?',
 'rekomendet bgt lahh barang bagus cpt bgt sampe y padahal tmpat aq desa plosok bgt tpi cuman 2hri dah sampe',
 'Film ato lagu2 saat d putar dri plashdisk error..smuanya filmku error gx bsa d putar😭😭😭',
 'bagus',
 'Toshiba 1 TB Hitam dengan banyak GRATIS Usb TOSHIBA 32GB + Pouch Harddisk & Usb OTG Reader Android NICE !!!!',
 'Produk original dan awet',
 'Barang bagus mulus wlau hnya paking buble warp, kulitas lumayan lh dgn harga sgitu.. Rekomended bgt ni brg..',
 'mantabb, barang bagus sesuai pesanan dan datangnya cepat 👍👍']

This code goes in `app.py`.

In [31]:
# End-to-end run as done in app.py: build the config, then predict `data`.
try:
    config = ConfigurationManager()
    predict_config = config.get_prediction_config()
    predict = Predict(config=predict_config)
    result = predict.run(data)
except Exception as e:
    # NOTE(review): Predict.run already logs its own errors, so a failure there
    # is logged twice; kept so configuration errors are logged as well.
    logger.error(e)
    # Bare raise re-raises the active exception as-is.
    raise

[2025-07-02 15:58:39,188: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-07-02 15:58:39,192: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2025-07-02 15:58:39,194: INFO: common: created directory at: artifacts]
[2025-07-02 15:58:39,196: INFO: common: created directory at: artifacts/predict]
[2025-07-02 15:58:39,199: INFO: 539828281: Set MLflow Client.]
[2025-07-02 15:58:39,202: INFO: 539828281: Select the deployed model from MLflow.]
[2025-07-02 15:58:39,242: INFO: 539828281: Get the deployed model run id.]
[2025-07-02 15:58:39,244: INFO: 539828281: Downloading vectorizer from MLflow's artifacts.]


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

[2025-07-02 15:58:39,393: INFO: 539828281: Load the vectorizer model.]
[2025-07-02 15:58:39,433: INFO: 539828281: Transform the data.]
[2025-07-02 15:58:40,643: INFO: 539828281: Predict the data.]


Downloading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

In [32]:
len(result)

85624

In [33]:
result[:10]

[5, 5, 1, 5, 2, 5, 1, 5, 5, 5]