In [1]:
import os

In [2]:
%pwd

'/home/fachruzaini/lazada-id-reviews/notebooks'

In [3]:
# Change to the main (project root) directory so the rest of the notebook
# is executed from there and relative paths resolve against it.
# Guarded on the current directory name so re-running this cell does not
# climb one more level each time.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

In [4]:
%pwd

'/home/fachruzaini/lazada-id-reviews'

### Data Preprocessing Config

This code will be applied in `src/LazadaIDReviews/entity/config_entity.py`.

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataDumpConfig:
    """Immutable settings read by ``Preprocessing.dump_data``."""
    # Directory the dump step works in; created by ConfigurationManager.
    root_dir: Path
    # CSV of raw reviews, read with pd.read_csv.
    reviews_path: Path
    # Pickle destination for the (oversampled) training review texts.
    input_train_path: Path
    # Pickle destination for the test review texts.
    input_test_path: Path
    # Pickle destination for the (oversampled) training ratings.
    output_train_path: Path
    # Pickle destination for the test ratings.
    output_test_path: Path
    # Forwarded as `test_size` to sklearn's train_test_split.
    params_test_size: float

@dataclass(frozen=True)
class DataPreprocessingConfig:
    """Immutable settings read by ``Preprocessing.vectorize_data``."""
    # Directory for the vectorization step; created by ConfigurationManager.
    root_dir: Path
    # Pickle holding the training review texts to vectorize.
    input_train_path: Path
    # Pickle holding the test review texts to vectorize.
    input_test_path: Path
    # joblib destination for the vectorized training matrix.
    vectorized_train_path: Path
    # joblib destination for the vectorized test matrix.
    vectorized_test_path: Path
    # Directory created (if missing) before the vectorizer is saved.
    model_dir: Path
    # joblib destination for the fitted vectorizer.
    vectorizer_model_path: Path

### Data Preprocessing Config Manager

This code will be applied in `src/LazadaIDReviews/config/configurations.py`.

In [6]:
from LazadaIDReviews.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from LazadaIDReviews.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Load the project's YAML files and expose typed config entities.

    On construction both YAML files are read and the artifacts root
    directory is created. Each ``get_*`` method creates the directory of
    its own step and returns a frozen dataclass with every path field
    wrapped in ``pathlib.Path``, matching the dataclass annotations.
    """

    def __init__(self, 
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        """Read the config and params YAML files.

        Args:
            config_filepath: location of the main config YAML.
            params_filepath: location of the params YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])
    
    def get_dump_data_config(self) -> DataDumpConfig:
        """Build the config entity for the train/test dump step.

        Creates the dump step's root directory as a side effect.

        Returns:
            config: DataDumpConfig type
        """
        dump_config = self.config.dump_data
        ingest_config = self.config.ingest_from_sql
        dataset_params = self.params

        create_directories([dump_config.root_dir])

        # Every path is coerced to Path so the entity honours its own field
        # annotations, consistent with get_preprocessing_data_config.
        config = DataDumpConfig(
            root_dir=Path(dump_config.root_dir),
            reviews_path=Path(ingest_config.reviews_path),
            input_train_path=Path(dump_config.input_train_path),
            input_test_path=Path(dump_config.input_test_path),
            output_train_path=Path(dump_config.output_train_path),
            output_test_path=Path(dump_config.output_test_path),
            # Passed through untouched: it is handed to train_test_split,
            # which gives ints and floats different meanings.
            params_test_size=dataset_params.TEST_SIZE
        )

        return config
    
    def get_preprocessing_data_config(self) -> DataPreprocessingConfig:
        """Build the config entity for the vectorization step.

        Creates the vectorization step's root directory as a side effect.
        The train/test input paths are taken from the dump step's section
        and the model directory from the train-model section.

        Returns:
            config: DataPreprocessingConfig type
        """
        dump_config = self.config.dump_data
        vectorize_config = self.config.vectorize_data
        train_config = self.config.train_model

        create_directories([vectorize_config.root_dir])

        config = DataPreprocessingConfig(
            root_dir=Path(vectorize_config.root_dir),
            input_train_path=Path(dump_config.input_train_path),
            input_test_path=Path(dump_config.input_test_path),
            vectorized_train_path=Path(vectorize_config.vectorized_train_path),
            vectorized_test_path=Path(vectorize_config.vectorized_test_path),
            model_dir=Path(train_config.root_dir),
            vectorizer_model_path=Path(vectorize_config.vectorizer_model_path)
        )

        return config

### Perform Preprocessing

This code will be applied in `src/LazadaIDReviews/components/preprocessing.py`.

What will we do?
+ Drop null values
+ Split the dataset into train and test data
+ Balance the training data with random oversampling
+ Vectorize the text using `TF-IDF`

As stated above, let's load the dataset, select the columns we need, and drop the null values.

In [8]:
import joblib
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler

from LazadaIDReviews import logger

class Preprocessing:
    """Split, oversample and TF-IDF vectorize the review dataset.

    The two public steps read different config entities:

    * ``dump_data`` reads the fields of a ``DataDumpConfig``.
    * ``vectorize_data`` reads the fields of a ``DataPreprocessingConfig``.

    Construct the object with the entity matching the step you will call.
    """

    def __init__(self,
                 config: "DataDumpConfig | DataPreprocessingConfig",
                 random_state: "int | None" = None):
        """Store the step configuration.

        Args:
            config: config entity for the step that will be run.
            random_state: optional seed forwarded to both the train/test
                split and the random oversampler. ``None`` (the default)
                keeps the previous, unseeded behaviour.
        """
        self.config = config
        self.random_state = random_state

    def dump_data(self) -> None:
        """Split the reviews into train/test sets and pickle the four parts.

        Rows with a missing rating or review text are dropped first. Only
        the training split is oversampled; the test split is written as-is.
        """
        logger.info("Read reviews file.")
        dataset_reviews = pd.read_csv(self.config.reviews_path)
        # Column selection followed by dropna() yields a new frame, so the
        # frame read from disk is never mutated.
        dataset = dataset_reviews[["rating", "reviewContent"]].dropna()
        
        logger.info("Split reviews file to data train and test.")
        X_train, X_test, y_train, y_test = train_test_split(
            dataset["reviewContent"], 
            dataset["rating"], 
            test_size=self.config.params_test_size,
            random_state=self.random_state
        )
        
        # NOTE: improve the performance with ROS
        logger.info("Perform random over sampler.")
        ros = RandomOverSampler(random_state=self.random_state)
        # The sampler is given single-column frames, so the resampled
        # results are frames as well and are narrowed back to series below.
        X_train_ros, y_train_ros = ros.fit_resample(pd.DataFrame(X_train), pd.DataFrame(y_train))
        
        # NOTE: data save as series
        # Despite the wording of the log message, this pair of writes covers
        # the review texts (model inputs) of both the train and test split.
        logger.info(f"Dump data train into {self.config.root_dir} directory.")
        X_train_ros["reviewContent"].to_pickle(self.config.input_train_path)
        X_test.to_pickle(self.config.input_test_path)
        
        # NOTE: data save as series
        # Despite the wording of the log message, this pair of writes covers
        # the ratings (model outputs) of both the train and test split.
        logger.info(f"Dump data test into {self.config.root_dir} directory.")
        y_train_ros["rating"].to_pickle(self.config.output_train_path)
        y_test.to_pickle(self.config.output_test_path)
        
    def vectorize_data(self) -> None:
        """Fit TF-IDF on the training texts and persist matrices and model.

        The vectorizer is fitted on the training split only and then
        applied to the test split. Both matrices and the fitted vectorizer
        are written with joblib.
        """
        vectorizer = TfidfVectorizer()
        
        # The inputs were written with Series.to_pickle, so they are read
        # back with the matching pandas reader. Pickles must only be loaded
        # from trusted locations.
        logger.info(f"Load data train in {self.config.input_train_path}.")
        X_train = pd.read_pickle(self.config.input_train_path)
        
        logger.info(f"Load data test in {self.config.input_test_path}.")
        X_test = pd.read_pickle(self.config.input_test_path)
        
        logger.info("Vectorize the data.")
        X_train_vec = vectorizer.fit_transform(X_train)
        X_test_vec = vectorizer.transform(X_test)
        
        logger.info("Dump the vectorized data.")
        joblib.dump(X_train_vec, self.config.vectorized_train_path)
        joblib.dump(X_test_vec, self.config.vectorized_test_path)
        
        logger.info(f"Creating {self.config.model_dir} directory.")
        model_dir = str(self.config.model_dir)
        os.makedirs(model_dir, exist_ok=True)
        
        logger.info("Save the vectorizer model.")
        joblib.dump(vectorizer, self.config.vectorizer_model_path)

### Dump the Data Train and Data Test

This code will be applied in `src/LazadaIDReviews/pipeline/step_02_preprocessing.py`.

In [9]:
# Run the dump step end to end. On failure the error is logged together
# with its traceback and then re-raised unchanged so the cell still stops.
try:
    config = ConfigurationManager()
    dump_data_config = config.get_dump_data_config()
    data_ingestion = Preprocessing(config=dump_data_config)
    data_ingestion.dump_data()
except Exception as e:
    logger.exception(e)
    raise

[2025-07-02 15:28:06,800: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-07-02 15:28:06,805: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2025-07-02 15:28:06,807: INFO: common: created directory at: artifacts]
[2025-07-02 15:28:06,810: INFO: common: created directory at: artifacts/data]
[2025-07-02 15:28:06,814: INFO: 2608593555: Read reviews file.]
[2025-07-02 15:28:07,450: INFO: 2608593555: Split reviews file to data train and test.]
[2025-07-02 15:28:07,463: INFO: 2608593555: Perform random over sampler.]
[2025-07-02 15:28:07,486: INFO: 2608593555: Dump data train into artifacts/data directory.]
[2025-07-02 15:28:07,548: INFO: 2608593555: Dump data test into artifacts/data directory.]


**Debug**: Read data

In [10]:
# Reload the training review texts written by dump_data() and preview them.
X_train = joblib.load(dump_data_config.input_train_path)
X_train

0        kenpa kalo di coba buat ngetik eror misalnya k...
1               dateng nya cepet banget terimakasih lazada
2                         barang kaga bisa dipakai...payah
3               produk memuaskan sampai cuma 1 hari mantap
4                                                    Bagus
                               ...                        
83440    Standard. Udah dipake 1 thn masih ok. Harganya...
83441    Barang sdh diterima dg baik meski smpt waswas ...
83442    MANTAP! Tapi mouse dan mouse pad legion tidak ...
83443    brg cpt sekali sampai nya, tp blm say cek , sm...
83444                          Barangnya masih ada ga mas?
Name: reviewContent, Length: 83445, dtype: object

In [11]:
# Count missing entries in the reloaded training texts.
X_train.isnull().sum()

0

In [12]:
# Reload the training ratings written by dump_data() and preview them.
y_train = joblib.load(dump_data_config.output_train_path)
y_train

0        2
1        5
2        1
3        5
4        5
        ..
83440    4
83441    4
83442    4
83443    4
83444    4
Name: rating, Length: 83445, dtype: int64

In [13]:
# Reload the test review texts written by dump_data() and preview them.
X_test = joblib.load(dump_data_config.input_test_path)
X_test

149466    tv bagus sesuai spesifikasi dan harga murah bg...
132999                                       mantul pak eko
177485    Gan cencel aja udah...udh sminggu nunggu masih...
171696                       barng mantul dan real capacity
139118    Pengiriman secepat kilat..gazz pool..blm di on...
                                ...                        
141195                           mantul... ori n ssuai spek
10663     Saya mau return barangnya karena tidak bisa di...
182871                                     keren,, thankyou
84901     seusai diskripsi semoga bermanfaat dan awet ti...
146926    sip...barang jos makasih lazada... pengiriman ...
Name: reviewContent, Length: 85624, dtype: object

In [14]:
# Count missing entries in the reloaded test texts.
X_test.isnull().sum()

0

In [15]:
# Reload the test ratings written by dump_data() and preview them.
y_test = joblib.load(dump_data_config.output_test_path)
y_test

149466    5
132999    5
177485    2
171696    5
139118    5
         ..
141195    5
10663     5
182871    5
84901     5
146926    5
Name: rating, Length: 85624, dtype: int64

### Vectorize the Data Train and Data Test

This code will be applied in `src/LazadaIDReviews/pipeline/step_02_preprocessing.py`.

In [16]:
# Run the vectorization step end to end. On failure the error is logged
# together with its traceback and then re-raised unchanged so the cell
# still stops.
try:
    config = ConfigurationManager()
    preprocessing_config = config.get_preprocessing_data_config()
    preprocessing = Preprocessing(config=preprocessing_config)
    preprocessing.vectorize_data()
except Exception as e:
    logger.exception(e)
    raise

[2025-07-02 15:28:08,078: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-07-02 15:28:08,082: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2025-07-02 15:28:08,083: INFO: common: created directory at: artifacts]
[2025-07-02 15:28:08,085: INFO: common: created directory at: artifacts/preprocessing]
[2025-07-02 15:28:08,094: INFO: 2608593555: Load data train in artifacts/data/X_train.pkl.]


[2025-07-02 15:28:08,198: INFO: 2608593555: Load data test in artifacts/data/X_test.pkl.]
[2025-07-02 15:28:08,316: INFO: 2608593555: Vectorize the data.]
[2025-07-02 15:28:10,856: INFO: 2608593555: Dump the vectorized data.]
[2025-07-02 15:28:10,987: INFO: 2608593555: Creating artifacts/models directory.]
[2025-07-02 15:28:10,988: INFO: 2608593555: Save the vectorizer model.]


**Debug**: Read data

In [17]:
# Reload the vectorized training matrix written by vectorize_data().
X_train_vec = joblib.load(preprocessing_config.vectorized_train_path)
X_train_vec

<83445x13560 sparse matrix of type '<class 'numpy.float64'>'
	with 1222962 stored elements in Compressed Sparse Row format>

In [18]:
# Reload the vectorized test matrix written by vectorize_data().
X_test_vec = joblib.load(preprocessing_config.vectorized_test_path)
X_test_vec

<85624x13560 sparse matrix of type '<class 'numpy.float64'>'
	with 983649 stored elements in Compressed Sparse Row format>