In [33]:
%reload_ext autoreload
%autoreload 2
import os
import sys
from dotenv import load_dotenv, find_dotenv
from dataclasses import dataclass
from pathlib import Path
load_dotenv(find_dotenv())
sys.path.append(os.getenv("PROJECT_FOLDER"))
from src.utils.common import logger, read_yaml, create_directories

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable set of filesystem locations used by the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the ``data_transformation`` section of the project config and are
    consumed by ``DataTransformation``. ``frozen=True`` means fields cannot be
    reassigned after construction.
    """
    root_dir: Path  # stage directory; created when the config object is built
    source_path: Path  # input CSV read by `DataTransformation.clean_data`
    cleaned_data_path: Path  # CSV written by `clean_data`, read by `preprocess_texts`
    transformed_data_path: Path  # CSV written by `preprocess_texts`, read by `split_data`
    train_data_path: Path  # training-split CSV written by `split_data`
    test_data_path: Path  # test-split CSV written by `split_data`

In [3]:
# os.chdir("..")

In [34]:
from src.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects.

    The three YAML documents are exposed as ``self.config``, ``self.params``
    and ``self.schema``; they are accessed with attribute syntax
    (e.g. ``self.config.artifacts_root``).
    """

    def __init__(
        self,
        config_filepath: "str | None" = None,
        params_filepath: "str | None" = None,
        schema_filepath: "str | None" = None,
    ):
        """Read the config, params and schema YAML files.

        Each path may be passed explicitly. When omitted (``None``) it is taken
        from the matching environment variable *at call time*, so variables
        loaded after this class was defined (e.g. by a later ``load_dotenv``)
        are still picked up.

        Args:
            config_filepath: path to the main config YAML; falls back to
                ``CONFIG_FILE_PATH``.
            params_filepath: path to the params YAML; falls back to
                ``PARAMS_FILE_PATH``.
            schema_filepath: path to the schema YAML; falls back to
                ``SCHEMA_FILE_PATH``.

        Raises:
            ValueError: if a path is neither passed nor available from its
                environment variable.
        """
        self.config = read_yaml(self._resolve_path(config_filepath, "CONFIG_FILE_PATH"))
        self.params = read_yaml(self._resolve_path(params_filepath, "PARAMS_FILE_PATH"))
        self.schema = read_yaml(self._resolve_path(schema_filepath, "SCHEMA_FILE_PATH"))

        create_directories([self.config.artifacts_root])

    @staticmethod
    def _resolve_path(explicit_path, env_var: str) -> Path:
        """Return ``explicit_path`` (or the value of ``env_var``) as a ``Path``."""
        candidate = explicit_path if explicit_path is not None else os.getenv(env_var)
        if not candidate:
            # Fail with the name of the missing setting instead of the opaque
            # TypeError that Path(None) would produce.
            raise ValueError(
                f"No file path given and environment variable {env_var} is not set"
            )
        return Path(candidate)

    def get_data_transformation_config(self) -> "DataTransformationConfig":
        """Build the config object for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect, then returns the
        ``data_transformation`` section of the config as a
        ``DataTransformationConfig`` whose fields are ``Path`` objects, matching
        the dataclass's declared field types.
        """
        stage = self.config.data_transformation

        create_directories([stage.root_dir])

        return DataTransformationConfig(
            root_dir=Path(stage.root_dir),
            source_path=Path(stage.source_path),
            cleaned_data_path=Path(stage.cleaned_data_path),
            transformed_data_path=Path(stage.transformed_data_path),
            train_data_path=Path(stage.train_data_path),
            test_data_path=Path(stage.test_data_path),
        )

In [36]:
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

class DataTransformation:
    """Clean, text-normalise and split the review dataset.

    The three stages hand data to each other through CSV files whose locations
    come from the supplied config, and are meant to run in this order:
    ``clean_data`` -> ``preprocess_texts`` -> ``split_data``.
    """

    def __init__(self, config: "DataTransformationConfig"):
        """
        Instantiate `DataTransformation` class

        Args:
            config (DataTransformationConfig): configuration for data transformation
        """
        self.stopwords_en = stopwords.words("english")
        self.punctuations = string.punctuation
        self.lemmatizer = WordNetLemmatizer()
        self.config = config

    def clean_data(self) -> None:
        """Read the source CSV, drop unusable rows and write the cleaned CSV.

        Exact duplicate rows are removed, as are rows missing either the review
        text or the sentiment label (an unlabelled row cannot be used for the
        stratified split later on). Only the ``reviewText`` and ``sentiment``
        columns are kept.
        """
        logger.info("Clean Data")
        cleaned = (
            pd.read_csv(self.config.source_path)
            .drop_duplicates()
            .dropna(subset=["reviewText", "sentiment"])
            .loc[:, ["reviewText", "sentiment"]]
            .reset_index(drop=True)
        )
        cleaned.to_csv(self.config.cleaned_data_path, index=False)

    def preprocess_text(self, text: str) -> str:
        """Normalise one review into a space-joined string of lemmatised tokens.

        The text is lower-cased and tokenised; stop words and tokens consisting
        solely of punctuation characters are discarded; the remaining tokens
        are lemmatised.

        Args:
            text: raw review text.

        Returns:
            The normalised text; an empty string if no token survives.
        """
        # Built per call so later edits to the public attributes are honoured,
        # while each token lookup below is a hash lookup rather than a scan.
        stopword_set = set(self.stopwords_en)
        punctuation_chars = set(self.punctuations)

        tokens = word_tokenize(text.lower())
        # A token counts as punctuation when *every* character is punctuation.
        # Testing `token in self.punctuations` would be a substring test and
        # would let multi-character tokens such as "..." or "``" through.
        kept_tokens = [
            token
            for token in tokens
            if token not in stopword_set
            and not all(char in punctuation_chars for char in token)
        ]
        return " ".join(self.lemmatizer.lemmatize(token) for token in kept_tokens)

    def preprocess_texts(self) -> None:
        """Apply `preprocess_text` to every cleaned review and write the result.

        Reviews whose normalised text is empty are dropped. The output CSV has
        the columns ``preprocessed_review_text`` and ``sentiment``.
        """
        df = pd.read_csv(self.config.cleaned_data_path)
        logger.info("Preprocess text data")
        df["preprocessed_review_text"] = df["reviewText"].apply(self.preprocess_text)
        df = df[df["preprocessed_review_text"].str.len() != 0]
        df = df[["preprocessed_review_text", "sentiment"]]
        df.to_csv(self.config.transformed_data_path, index=False)

    def split_data(self, test_size: float = 0.2, random_state: int = 42) -> None:
        """Write a stratified train/test split of the transformed data.

        Args:
            test_size: share (or absolute number) of rows for the test split,
                passed through to ``train_test_split``.
            random_state: seed for the shuffle, for a reproducible split.
        """
        df = pd.read_csv(self.config.transformed_data_path)
        logger.info("Split data")
        # The CSV round-trip turns texts such as "null", "nan" or "n/a" into
        # missing values on read; drop them so no NaN reaches the split files.
        df = df[["preprocessed_review_text", "sentiment"]].dropna()

        # Stratify on the label so both splits keep the class proportions.
        train, test = train_test_split(
            df,
            stratify=df[["sentiment"]],
            test_size=test_size,
            shuffle=True,
            random_state=random_state,
        )

        train.to_csv(self.config.train_data_path, index=False)
        test.to_csv(self.config.test_data_path, index=False)

In [37]:
# Run the data-transformation stage end to end: clean -> preprocess -> split.
try:
    configuration_manager = ConfigurationManager()
    data_transformation = DataTransformation(config=configuration_manager.get_data_transformation_config())
    data_transformation.clean_data()
    data_transformation.preprocess_texts()
    data_transformation.split_data()
except Exception as e:
    # Log with the full traceback, then re-raise so a failed stage is not
    # mistaken for a successful run (e.g. under "Run All" or automated execution).
    logger.exception(e)
    raise


2024-03-07 22:03:29,235 - sentiment-classifier-logger - INFO - yaml file: C:\Users\USER\Documents\GitHub\customer-product-reviews-sentiment-classifier\config\config.yaml loaded successfully
2024-03-07 22:03:29,235 - sentiment-classifier-logger - INFO - yaml file: C:\Users\USER\Documents\GitHub\customer-product-reviews-sentiment-classifier\params.yaml loaded successfully
2024-03-07 22:03:29,243 - sentiment-classifier-logger - INFO - yaml file: C:\Users\USER\Documents\GitHub\customer-product-reviews-sentiment-classifier\schema.yaml loaded successfully
2024-03-07 22:03:29,244 - sentiment-classifier-logger - INFO - Created directory at: artifacts
2024-03-07 22:03:29,246 - sentiment-classifier-logger - INFO - Created directory at: artifacts/data_transformation
2024-03-07 22:03:29,248 - sentiment-classifier-logger - INFO - Clean Data
2024-03-07 22:03:29,630 - sentiment-classifier-logger - INFO - Preprocess text data
2024-03-07 22:03:45,679 - sentiment-classifier-logger - INFO - Split data
