In [1]:
%reload_ext autoreload
%autoreload 2
import pandas as pd
import pathlib
import os
import gc
import sys
import seaborn as sns
from dotenv import find_dotenv, load_dotenv

# Insert project folder into Python System
load_dotenv(find_dotenv())
sys.path.append(os.getenv("PROJECT_FOLDER"))

# Data Ingestion

In [4]:
import os
import sys
import pandas as pd
from pathlib import Path
from dotenv import find_dotenv, load_dotenv

# Insert project folder into Python System
load_dotenv(find_dotenv())
sys.path.append(os.getenv("PROJECT_FOLDER"))
from src.utils import logger

class DataIngester:
    """Load the raw dataset from a CSV source.

    Parameters
    ----------
    data_path : str
        Location of the CSV file; it is handed to ``pandas.read_csv`` unchanged.
    """

    def __init__(self, data_path: str):
        self.data_path = data_path

    def run(self) -> pd.DataFrame:
        """Log the source being ingested and return its contents as a DataFrame."""
        logger.info(f"Ingest {self.data_path}")
        return pd.read_csv(self.data_path)

# Run the ingestion stage on the sampled dataset; the path is relative to the current working directory.
df = DataIngester("../data/raw/sampled_dataset.csv").run()

2024-03-06 14:13:36,604 - sentiment-classifier-logger - INFO - Ingest ../data/raw/sampled_dataset.csv


# Data Cleaning

In [5]:
import os
import sys
import pandas as pd
from pathlib import Path
from dotenv import find_dotenv, load_dotenv

# Insert project folder into Python System
load_dotenv(find_dotenv())
sys.path.append(os.getenv("PROJECT_FOLDER"))
from src.utils import logger

class DataCleaner:
    """Reduce a raw reviews frame to de-duplicated ``reviewText`` / ``overall`` rows.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that contains at least the ``reviewText`` and ``overall`` columns.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df

    def run(self) -> pd.DataFrame:
        """Clean the frame, store the result on ``self.df`` and return it.

        Steps, in order: drop rows that are exact duplicates (compared across
        all columns), drop rows whose ``reviewText`` is missing, keep only the
        ``reviewText`` and ``overall`` columns, and renumber the index from 0.
        """
        logger.info(f"Cleaning Data ...")
        kept_columns = ["reviewText", "overall"]
        cleaned = (
            self.df.drop_duplicates()  # duplicates are judged on every column
            .dropna(subset=['reviewText'], axis=0)  # rows with a missing `reviewText`
            .loc[:, kept_columns]  # text and rating only
            .reset_index(drop=True)  # contiguous index after the row drops
        )
        self.df = cleaned
        return cleaned

# Re-read the CSV and apply the cleaning stage; `df` is rebound to each stage's output.
df = DataIngester("../data/raw/sampled_dataset.csv").run()
df = DataCleaner(df).run()

2024-03-06 14:13:37,158 - sentiment-classifier-logger - INFO - Ingest ../data/raw/sampled_dataset.csv
2024-03-06 14:13:37,338 - sentiment-classifier-logger - INFO - Cleaning Data ...


# Data Preprocessing

In [6]:
import os
import sys
import pandas as pd
from dotenv import find_dotenv, load_dotenv

# Insert project folder into Python System
load_dotenv(find_dotenv())
sys.path.append(os.getenv("PROJECT_FOLDER"))
from src.utils import logger

class DataLabeler:
    """Derive a binary ``sentiment`` label from the numeric ``overall`` rating.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that contains an ``overall`` column.
    positive_threshold : float, default 3
        Ratings greater than or equal to this value are labeled ``1``;
        every other rating is labeled ``0``.
    """

    def __init__(self, df: pd.DataFrame, positive_threshold: float = 3):
        self.df = df
        self.positive_threshold = positive_threshold

    def run(self) -> pd.DataFrame:
        """Return a new frame with ``sentiment`` added and ``overall`` removed.

        The frame given to the constructor is not modified: the label column
        is attached with ``assign`` (which returns a new frame) rather than by
        item assignment. The result is also stored on ``self.df``.

        Raises
        ------
        KeyError
            If the frame has no ``overall`` column.
        """
        logger.info("Labeling Data ...")
        # Vectorized comparison. A missing rating compares False and therefore
        # receives label 0, exactly as the previous row-wise lambda did.
        sentiment = (self.df["overall"] >= self.positive_threshold).astype("int64")
        self.df = self.df.assign(sentiment=sentiment).drop(columns=["overall"])
        return self.df

# Re-read the CSV, then apply the cleaning and labeling stages; `df` is rebound to each stage's output.
df = DataIngester("../data/raw/sampled_dataset.csv").run()
df = DataCleaner(df).run()
df = DataLabeler(df).run()

2024-03-06 14:13:40,046 - sentiment-classifier-logger - INFO - Ingest ../data/raw/sampled_dataset.csv
2024-03-06 14:13:40,176 - sentiment-classifier-logger - INFO - Cleaning Data ...
2024-03-06 14:13:40,226 - sentiment-classifier-logger - INFO - Labeling Data ...


In [9]:
import os
import sys
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from typing_extensions import Annotated

# Insert project folder into Python System
# NOTE(review): this cell's own import list does not include `load_dotenv` / `find_dotenv`;
# the call below only works when an earlier cell has already imported them from `dotenv`.
load_dotenv(find_dotenv())
sys.path.append(os.getenv("PROJECT_FOLDER"))
from src.utils import logger

class DataPreprocessor:
    """Turn labeled review text into normalized text ready for model training.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that contains the ``reviewText`` and ``sentiment`` columns.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords_en = stopwords.words("english")
        self.punctuations = string.punctuation
        # Set views of the two attributes above. Membership tests against the
        # stop-word list scan it linearly, and `token in <str>` is a substring
        # test rather than a per-character one, so both are converted once here.
        self._stopword_set = frozenset(self.stopwords_en)
        self._punctuation_chars = frozenset(self.punctuations)

    def _is_punctuation(self, token: str) -> bool:
        """Return True when every character of ``token`` is a punctuation character."""
        return all(char in self._punctuation_chars for char in token)

    def preprocess_text(self, text: str) -> str:
        """Lower-case, tokenize, filter and lemmatize a single review.

        Stop words and tokens made up solely of punctuation characters
        (for example ``"!"``, ``"..."`` or ``"--"``) are removed; the remaining
        tokens are lemmatized and joined with single spaces. An empty string is
        returned when nothing survives the filtering.
        """
        tokens = word_tokenize(text.lower())  # normalize case, then tokenize
        filtered_tokens = [
            token
            for token in tokens
            if token not in self._stopword_set and not self._is_punctuation(token)
        ]
        lemmatized_tokens = [self.lemmatizer.lemmatize(token) for token in filtered_tokens]
        return " ".join(lemmatized_tokens)

    def run(self) -> pd.DataFrame:
        """Return a frame of ``preprocessed_review_text`` and ``sentiment``.

        Rows whose preprocessed text is empty are dropped; the surviving rows
        keep their original index labels. The frame given to the constructor is
        not modified. The result is also stored on ``self.df``.

        Raises
        ------
        KeyError
            If ``reviewText`` or ``sentiment`` is missing from the frame.
        """
        logger.info("Preprocessing Data ...")
        processed = self.df["reviewText"].apply(self.preprocess_text)
        non_empty = processed.ne("")  # preprocess_text always returns a str
        # `assign` builds a new frame, so the caller's frame does not gain a column.
        self.df = self.df.assign(preprocessed_review_text=processed).loc[
            non_empty, ["preprocessed_review_text", "sentiment"]
        ]
        return self.df

# Run the full pipeline from the CSV: ingest -> clean -> label -> preprocess.
# `df` is rebound to each stage's output, so only the final frame remains bound to it.
df = DataIngester("../data/raw/sampled_dataset.csv").run()
df = DataCleaner(df).run()
df = DataLabeler(df).run()
df = DataPreprocessor(df).run()

2024-03-06 14:14:16,080 - sentiment-classifier-logger - INFO - Ingest ../data/raw/sampled_dataset.csv
2024-03-06 14:14:16,254 - sentiment-classifier-logger - INFO - Cleaning Data ...
2024-03-06 14:14:16,315 - sentiment-classifier-logger - INFO - Labeling Data ...
2024-03-06 14:14:16,329 - sentiment-classifier-logger - INFO - Preprocessing Data ...


In [11]:
# Move the working directory one level up, then display it.
# NOTE(review): the change is relative, so each re-execution of this cell climbs another level,
# and the relative "../data/raw/..." paths used above resolve against the new directory afterwards.
os.chdir("..")
%pwd

'd:\\Documents\\GitHub\\customer-product-reviews-sentiment-classifier'