In [14]:
%reload_ext autoreload
%autoreload 2
import pandas as pd
import pathlib
import os
import gc
import sys
import seaborn as sns
from dotenv import find_dotenv, load_dotenv

# Make the project root importable so that `src.*` modules resolve.
# PROJECT_FOLDER is read from the environment after loading the nearest .env file.
load_dotenv(find_dotenv())
project_folder = os.getenv("PROJECT_FOLDER")
if not project_folder:
    # os.getenv returns None when the variable is missing; a None entry in
    # sys.path is silently ignored by the import system, so say so up front.
    print(
        "PROJECT_FOLDER is not set; `src` must already be importable.",
        file=sys.stderr,
    )
elif project_folder not in sys.path:
    # Guard against duplicate entries when this cell is re-run.
    sys.path.append(project_folder)
from src.utils import get_data_frame

# Sample Data to Speed Up Project Development (Outside the ML Pipeline)

In [17]:
# --- LOAD DATA ---
# Sort case-insensitively by file name: glob order is filesystem-dependent, and
# the seeded sample below depends on row order, so a fixed order keeps the
# sampled output reproducible across machines.
raw_paths = sorted(
    pathlib.Path("../data/raw/").glob("*.gz"),
    key=lambda p: p.name.lower(),
)

# Collect the per-file frames and concatenate once; concatenating inside the
# loop re-copies all previously loaded rows on every iteration.
frames = []
for path in raw_paths:
    print(f"processing {path} ...")
    frames.append(get_data_frame(path))  # load .gz type data

if not frames:
    raise FileNotFoundError("No *.gz files found in ../data/raw/")

df = pd.concat(frames)
del frames  # drop the per-file references so only `df` keeps the data alive

# --- SAMPLE DATA ---
# Draw 10000 rows per `overall` value, with replacement, using a fixed seed.
df = df.groupby("overall").sample(10000, replace=True, random_state=42)
df.to_csv("../data/interim/sampled-raw-data.csv", index=False)

# Release the sampled frame; later cells reload it from the interim CSV.
del df
gc.collect()

processing ..\data\raw\All_Beauty_5.json.gz ...
processing ..\data\raw\AMAZON_FASHION_5.json.gz ...
processing ..\data\raw\Appliances_5.json.gz ...
processing ..\data\raw\Arts_Crafts_and_Sewing_5.json.gz ...
processing ..\data\raw\Automotive_5.json.gz ...


# Data Ingestion

In [15]:
import logging
import pandas as pd

class Logger:
    """Configure basic logging and hand out a logger with a fixed name.

    Constructing an instance calls ``logging.basicConfig`` with INFO level and a
    "time - name - level - message" format.
    """

    def __init__(self, logger_name: str) -> None:
        """Store ``logger_name`` and apply the basic logging configuration."""
        logging.basicConfig(
            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
            level=logging.INFO,
        )
        self.logger_name = logger_name

    def get_logger(self) -> logging.Logger:
        """Look up the named logger, remember it on ``self.logger``, and return it."""
        named_logger = logging.getLogger(self.logger_name)
        self.logger = named_logger
        return named_logger

class DataIngestion:
    """Pipeline step that reads a CSV file into a DataFrame."""

    def __init__(self, data_path: str):
        """Remember the CSV location and obtain a logger for this module."""
        self.data_path = data_path
        self.logger = Logger(__name__).get_logger()

    def get_data(self):
        """Log the source path, then return the CSV contents as a DataFrame."""
        source = self.data_path
        self.logger.info(f"Ingesting Data from {source} ...")
        return pd.read_csv(source)
# Smoke-run the ingestion step against the sampled interim CSV.
df = (
    DataIngestion("../data/interim/sampled-raw-data.csv")
    .get_data()
)

2024-03-01 15:41:44,945 - __main__ - INFO - Ingesting Data from ../data/interim/sampled-raw-data.csv ...


# Data Cleaning

In [16]:
import pandas as pd
class DataCleaning:
    """Pipeline step that de-duplicates rows and keeps the review text and rating."""

    def __init__(self, df: pd.DataFrame):
        """Hold the frame to clean and obtain a logger for this module."""
        self.df = df
        self.logger = Logger(__name__).get_logger()

    def clean_data(self):
        """Return the cleaned frame (also stored on ``self.df``).

        Steps, in order: drop fully duplicated rows, drop rows whose
        `reviewText` is missing, keep only `reviewText` and `overall`,
        and renumber the index from zero.
        """
        self.logger.info("Cleaning Data ...")
        kept_columns = ["reviewText", "overall"]
        cleaned = (
            self.df
            .drop_duplicates()                      # duplicates judged on all columns
            .dropna(axis=0, subset=["reviewText"])  # rows without review text
            .loc[:, kept_columns]
            .reset_index(drop=True)
        )
        self.df = cleaned
        return cleaned

# Smoke-run the pipeline from disk through the cleaning step.
df = DataCleaning(
    DataIngestion("../data/interim/sampled-raw-data.csv").get_data()
).clean_data()

2024-03-01 15:41:45,994 - __main__ - INFO - Ingesting Data from ../data/interim/sampled-raw-data.csv ...
2024-03-01 15:41:46,305 - __main__ - INFO - Cleaning Data ...


# Data Preprocessing

In [17]:
import pandas as pd
class DataLabeling:
    """Pipeline step that turns the `overall` rating into a binary `sentiment` label."""

    def __init__(self, df: pd.DataFrame):
        """Hold the frame to label and obtain a logger for this module."""
        self.df = df
        self.logger = Logger(__name__).get_logger()

    def label_data(self) -> pd.DataFrame:
        """Return a new frame with `sentiment` added and `overall` removed.

        `sentiment` is 1 where `overall` >= 3 and 0 otherwise (a missing rating
        compares as False and therefore maps to 0). The frame passed to the
        constructor is left untouched; the result is also stored on ``self.df``.

        Raises:
            KeyError: if the frame has no `overall` column.
        """
        self.logger.info("Labeling Data ...")
        # Vectorised comparison instead of a per-row lambda.
        sentiment = self.df["overall"].ge(3).astype("int64")
        # `assign` builds a new frame, so the caller's DataFrame is not mutated.
        self.df = self.df.assign(sentiment=sentiment).drop(columns=["overall"])
        return self.df

# Smoke-run the pipeline from disk through the labeling step.
df = DataLabeling(
    DataCleaning(
        DataIngestion("../data/interim/sampled-raw-data.csv").get_data()
    ).clean_data()
).label_data()

2024-03-01 15:41:46,741 - __main__ - INFO - Ingesting Data from ../data/interim/sampled-raw-data.csv ...
2024-03-01 15:41:47,025 - __main__ - INFO - Cleaning Data ...
2024-03-01 15:41:47,121 - __main__ - INFO - Labeling Data ...


In [18]:
import pandas as pd
import string
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from typing_extensions import Annotated

class DataPreprocessing:
    """Pipeline step that normalises review text for model training."""

    def __init__(self, df: pd.DataFrame):
        """Hold the frame to process and set up the text-normalisation resources."""
        self.df = df
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords_en = stopwords.words("english")
        self.punctuations = string.punctuation
        # Snapshot of the stop words as a set so each token lookup is O(1).
        self._stopword_lookup = frozenset(self.stopwords_en)
        self.logger = Logger(__name__).get_logger()

    def _is_punctuation(self, token: str) -> bool:
        """Return True when every character of ``token`` is a punctuation mark.

        Checking character by character also catches multi-character tokens
        such as "...", "``" or "''", which a substring test against
        ``string.punctuation`` lets through.
        """
        return all(char in self.punctuations for char in token)

    def preprocess_text(self, text: str) -> str:
        """Lower-case, tokenize, filter and lemmatize ``text``.

        Stop words and punctuation-only tokens are dropped; the remaining
        tokens are lemmatized and joined with single spaces. The result is an
        empty string when no token survives.
        """
        tokens = word_tokenize(text.lower())  # normalize and tokenize text
        kept_tokens = [
            token
            for token in tokens
            if token not in self._stopword_lookup and not self._is_punctuation(token)
        ]
        return " ".join(self.lemmatizer.lemmatize(token) for token in kept_tokens)

    def preprocess_data(self) -> Annotated[pd.DataFrame, "dataset"]:
        """Return a frame of `preprocessed_review_text` and `sentiment`.

        Rows whose preprocessed text is empty are removed; surviving rows keep
        their original index labels. The frame passed to the constructor is
        left untouched; the result is also stored on ``self.df``.

        Raises:
            KeyError: if `reviewText` or `sentiment` is missing from the frame.
        """
        self.logger.info("Preprocessing Data ...")
        processed = self.df["reviewText"].apply(self.preprocess_text)  # text preprocessing
        has_text = processed.ne("")  # drop reviews that reduced to nothing
        # `assign` + `.loc` build a new frame instead of writing into the input.
        self.df = self.df.assign(preprocessed_review_text=processed).loc[
            has_text, ["preprocessed_review_text", "sentiment"]
        ]
        return self.df

# Full pipeline run: ingest -> clean -> label -> preprocess, then persist the result.
df = DataPreprocessing(
    DataLabeling(
        DataCleaning(
            DataIngestion("../data/interim/sampled-raw-data.csv").get_data()
        ).clean_data()
    ).label_data()
).preprocess_data()
df.to_csv("../data/processed/sample-clean-data.csv", index=False)

2024-03-01 15:41:47,404 - __main__ - INFO - Ingesting Data from ../data/interim/sampled-raw-data.csv ...
2024-03-01 15:41:47,705 - __main__ - INFO - Cleaning Data ...
2024-03-01 15:41:47,823 - __main__ - INFO - Labeling Data ...
2024-03-01 15:41:47,857 - __main__ - INFO - Preprocessing Data ...
