In [None]:
import os

In [None]:
%pwd

In [None]:
# Move one level up so the rest of the notebook runs from the main
# (parent) directory instead of the directory the notebook lives in.
# NOTE: the path is relative, so re-running this cell moves up one more level.
os.chdir("../")

In [None]:
%pwd

### Data Preprocessing Config

This code will be applied in `src/MarketplaceReviews/entity/config_entity.py`.

In [None]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataDumpConfig:
    """Immutable settings for splitting the reviews and dumping the splits.

    Built by ``ConfigurationManager.get_dump_data_config`` and read by
    ``Preprocessing.dump_data``.
    """
    # Directory of this step; created when the config entity is built.
    root_dir: Path
    # CSV file with the reviews; read with ``pd.read_csv``.
    reviews_path: Path
    # Pickle file that receives the training reviews (features).
    input_train_path: Path
    # Pickle file that receives the test reviews (features).
    input_test_path: Path
    # Pickle file that receives the training ratings (targets).
    output_train_path: Path
    # Pickle file that receives the test ratings (targets).
    output_test_path: Path
    # Passed as ``test_size`` to ``train_test_split``.
    params_test_size: float

@dataclass(frozen=True)
class DataPreprocessingConfig:
    """Immutable settings for vectorizing the dumped train/test reviews.

    Built by ``ConfigurationManager.get_preprocessing_data_config`` and read
    by ``Preprocessing.vectorize_data``.
    """
    # Directory of the vectorize step; created when the config entity is built.
    root_dir: Path
    # File with the training reviews, loaded before fitting the vectorizer.
    input_train_path: Path
    # File with the test reviews, loaded before being transformed.
    input_test_path: Path
    # Destination of the vectorized training data (written with joblib).
    vectorized_train_path: Path
    # Destination of the vectorized test data (written with joblib).
    vectorized_test_path: Path
    # Directory created (if missing) before the vectorizer is saved.
    model_dir: Path
    # Destination of the fitted vectorizer (written with joblib).
    vectorizer_model_path: Path

### Data Preprocessing Config Manager

This code will be applied in `src/MarketplaceReviews/config/configurations.py`.

What will we do?
+ Drop null values
+ Split the dataset into train and test data
+ Vectorize the text using `TFIDF`

As stated above, let’s load the dataset, select the columns we need, and drop null values.

In [None]:
from MarketplaceReviews.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from MarketplaceReviews.utils.common import read_yaml, create_directories

In [None]:
class ConfigurationManager:
    """Turn the project's YAML config and params files into config entities."""

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: YAML file loaded into ``self.config``.
            params_filepath: YAML file loaded into ``self.params``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_dump_data_config(self) -> DataDumpConfig:
        """Build the config entity for the data dump step.

        Reads the ``ingest_from_sql`` and ``dump_data`` sections of the config
        file plus ``TEST_SIZE`` from the params file, and creates the dump
        step's root directory.

        Returns:
            config: DataDumpConfig type
        """
        ingest_section = self.config.ingest_from_sql
        dump_section = self.config.dump_data

        create_directories([dump_section.root_dir])

        # Every path is wrapped in Path so the entity matches its annotations
        # and agrees with get_preprocessing_data_config.
        return DataDumpConfig(
            root_dir=Path(dump_section.root_dir),
            reviews_path=Path(ingest_section.reviews_path),
            input_train_path=Path(dump_section.input_train_path),
            input_test_path=Path(dump_section.input_test_path),
            output_train_path=Path(dump_section.output_train_path),
            output_test_path=Path(dump_section.output_test_path),
            params_test_size=self.params.TEST_SIZE,
        )

    def get_preprocessing_data_config(self) -> DataPreprocessingConfig:
        """Build the config entity for the vectorize step.

        Reads the ``dump_data``, ``vectorize_data`` and ``train_model``
        sections of the config file and creates the vectorize step's root
        directory.

        Returns:
            config: DataPreprocessingConfig type
        """
        dump_section = self.config.dump_data
        vectorize_section = self.config.vectorize_data
        train_section = self.config.train_model

        create_directories([vectorize_section.root_dir])

        return DataPreprocessingConfig(
            root_dir=Path(vectorize_section.root_dir),
            input_train_path=Path(dump_section.input_train_path),
            input_test_path=Path(dump_section.input_test_path),
            vectorized_train_path=Path(vectorize_section.vectorized_train_path),
            vectorized_test_path=Path(vectorize_section.vectorized_test_path),
            # The model directory comes from the training section of the config.
            model_dir=Path(train_section.root_dir),
            vectorizer_model_path=Path(vectorize_section.vectorizer_model_path),
        )

### Perform Preprocessing

This code goes in `src/MarketplaceReviews/components/preprocessing.py`.

In [None]:
import joblib
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from MarketplaceReviews import logger

class Preprocessing:
    """Split the reviews into train/test sets and vectorize them with TF-IDF."""

    def __init__(self, config: "DataDumpConfig | DataPreprocessingConfig"):
        """Keep the config entity for the step that will be run.

        Args:
            config: a ``DataDumpConfig`` when ``dump_data`` will be called, or
                a ``DataPreprocessingConfig`` when ``vectorize_data`` will be
                called; each method reads only the fields of its own entity.
        """
        self.config = config

    def dump_data(self) -> None:
        """Split the reviews dataset and dump the train and test parts.

        Reads the reviews CSV, keeps the ``rating`` and ``review`` columns,
        drops rows with null values, splits reviews (features) and ratings
        (targets) into train and test parts and pickles each part to its
        configured path.
        """
        logger.info("Read reviews file.")
        dataset_reviews = pd.read_csv(self.config.reviews_path)
        dataset = dataset_reviews[["rating", "review"]].copy()
        dataset.dropna(inplace=True)

        logger.info("Split reviews file to data train and test.")
        # NOTE(review): no random_state is passed, so the split presumably
        # differs between runs - confirm whether a seed should come from params.
        X_train, X_test, y_train, y_test = train_test_split(
            dataset["review"],
            dataset["rating"],
            test_size=self.config.params_test_size
        )

        # Log the paths that are actually written: features go to the
        # input_* paths, targets go to the output_* paths.
        logger.info(
            f"Dump train and test features into {self.config.input_train_path} "
            f"and {self.config.input_test_path}."
        )
        X_train.to_pickle(self.config.input_train_path)
        X_test.to_pickle(self.config.input_test_path)

        logger.info(
            f"Dump train and test targets into {self.config.output_train_path} "
            f"and {self.config.output_test_path}."
        )
        y_train.to_pickle(self.config.output_train_path)
        y_test.to_pickle(self.config.output_test_path)

    def vectorize_data(self) -> None:
        """Vectorize the dumped train/test reviews and save the vectorizer.

        The vectorizer is fitted on the training reviews only and then applied
        to the test reviews. The vectorized train and test data and the fitted
        vectorizer are each written with ``joblib.dump``.
        """
        vectorizer = TfidfVectorizer()

        logger.info(f"Load data train in {self.config.input_train_path}.")
        X_train = joblib.load(self.config.input_train_path)

        logger.info(f"Load data test in {self.config.input_test_path}.")
        X_test = joblib.load(self.config.input_test_path)

        logger.info("Vectorize the data.")
        # Fit on the training data only; the test data is only transformed.
        X_train_vec = vectorizer.fit_transform(X_train)
        X_test_vec = vectorizer.transform(X_test)

        logger.info("Dump the vectorized data.")
        joblib.dump(X_train_vec, self.config.vectorized_train_path)
        joblib.dump(X_test_vec, self.config.vectorized_test_path)

        logger.info(f"Creating {self.config.model_dir} directory.")
        model_dir = str(self.config.model_dir)
        os.makedirs(model_dir, exist_ok=True)

        logger.info("Save the vectorizer model.")
        joblib.dump(vectorizer, self.config.vectorizer_model_path)

### Dump the Data Train and Data Test

This code goes in `src/MarketplaceReviews/pipeline/step_02_preprocessing.py`.

In [None]:
# Run the dump step: build its config entity, then split and dump the data.
try:
    config = ConfigurationManager()
    dump_data_config = config.get_dump_data_config()
    data_ingestion = Preprocessing(config=dump_data_config)
    data_ingestion.dump_data()
except Exception as e:
    # Log the failure, then re-raise the active exception unchanged.
    logger.error(e)
    raise

**Debug**: Read data

In [None]:
# Load the training reviews written by dump_data to inspect them.
X_train = joblib.load(dump_data_config.input_train_path)
X_train

In [None]:
# Count null entries in the training reviews (rows with nulls were dropped before the split).
X_train.isnull().sum()

In [None]:
# Load the training ratings written by dump_data to inspect them.
y_train = joblib.load(dump_data_config.output_train_path)
y_train

In [None]:
# Load the test reviews written by dump_data to inspect them.
X_test = joblib.load(dump_data_config.input_test_path)
X_test

In [None]:
# Count null entries in the test reviews (rows with nulls were dropped before the split).
X_test.isnull().sum()

In [None]:
# Load the test ratings written by dump_data to inspect them.
y_test = joblib.load(dump_data_config.output_test_path)
y_test

### Vectorize the Data Train and Data Test

This code goes in `src/MarketplaceReviews/pipeline/step_02_preprocessing.py`.

In [None]:
# Run the vectorize step: build its config entity, then vectorize the dumps.
try:
    config = ConfigurationManager()
    preprocessing_config = config.get_preprocessing_data_config()
    preprocessing = Preprocessing(config=preprocessing_config)
    preprocessing.vectorize_data()
except Exception as e:
    # Log the failure, then re-raise the active exception unchanged.
    logger.error(e)
    raise

**Debug**: Read data

In [None]:
# Load the vectorized training data written by vectorize_data to inspect it.
X_train_vec = joblib.load(preprocessing_config.vectorized_train_path)
X_train_vec

In [None]:
# Load the vectorized test data written by vectorize_data to inspect it.
X_test_vec = joblib.load(preprocessing_config.vectorized_test_path)
X_test_vec