# Text processing

## Environment setup

In [1]:
# Mount Google Drive into the Colab filesystem; the project folder used below lives under it
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
# Project folder on the mounted Drive; it becomes the working directory and is added to sys.path below
WORKING_PATH = '/content/drive/MyDrive/KeepCoding/NLP/exercise'


In [3]:
# Work from the project folder so the relative paths used below (requirements.txt, the dataset, cache/) resolve
%cd {WORKING_PATH}

/content/drive/MyDrive/KeepCoding/NLP/exercise


In [None]:
!pip install -r requirements.txt

In [8]:
import sys
import pandas as pd
import numpy as np
import os
import re
from wordcloud import WordCloud, STOPWORDS
import unicodedata
from num2words import num2words
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
import pickle
import nltk
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet, stopwords
import string

# Fetch the NLTK resources needed by the tools imported above:
# tokenizer models (word_tokenize), WordNet (WordNetLemmatizer),
# the perceptron POS tagger (pos_tag) and the stopword lists (stopwords)
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
# Make Python modules stored in the project folder importable (custom libraries)
sys.path.append(WORKING_PATH)

# load custom libraries

### Read data

In [6]:
# Read data: the file is line-delimited JSON (one record per line); the last
# expression displays the (rows, columns) shape of the loaded frame
data = pd.read_json('reviews_Patio_Lawn_and_Garden_5.json', lines=True)
data.shape

(13272, 9)

## Function for normalizing texts

This pipeline normalizes and lemmatizes review text in preparation for natural language processing tasks. It begins by converting the input text to lowercase and expanding negative contractions (e.g., "don't" becomes "do not"). Punctuation marks are then replaced with spaces, so that they never remain attached to a word during tokenization. The text is tokenized into individual words, and the negation words "not" and "nor" are unified under the single token "no" so that downstream models see negation consistently. Any purely numeric token is converted to its word form (e.g., "3" becomes "three"). Part-of-speech (POS) tags are assigned to each token, and these tags are mapped to WordNet-compatible POS categories. Finally, the tokens are lemmatized (reduced to their base forms) using this POS context, while stopwords and punctuation are filtered out; the only exception is the token "no", which is retained because of its significance for sentiment and negation detection. The resulting list of normalized, lemmatized tokens is returned for further analysis or modeling.

In [11]:
# Normalize texts
# Lookup sets used by the functions below: every character of string.punctuation,
# and NLTK's English stopword list (sets give constant-time membership tests)
PUNCTUATION = set(string.punctuation)
STOP_WORDS = set(stopwords.words('english'))

def get_wordnet_pos(treebank_tag):
  """
  Map a POS tag onto the matching WordNet POS constant.

  Only the beginning of the tag is inspected: tags starting with 'J' map to
  adjective, 'V' to verb, 'N' to noun and 'R' to adverb. Any other tag is
  treated as a noun.
  """
  prefix_to_pos = (
      ('J', wordnet.ADJ),
      ('V', wordnet.VERB),
      ('N', wordnet.NOUN),
      ('R', wordnet.ADV),
  )
  for prefix, wordnet_pos in prefix_to_pos:
    if treebank_tag.startswith(prefix):
      return wordnet_pos
  # Unrecognised tags fall back to noun
  return wordnet.NOUN


def lemmatize_text(pos_tags):
  """
  Turn (word, POS tag) pairs into a list of lemmas.

  Words found in STOP_WORDS or PUNCTUATION are discarded, except the
  negation marker 'no', which is always kept. Each surviving word is
  lemmatized with the WordNet POS derived from its tag, and the lemmas are
  returned in input order.
  """
  def is_kept(token):
    return token == 'no' or not (token in STOP_WORDS or token in PUNCTUATION)

  wn_lemmatizer = WordNetLemmatizer()
  return [
      wn_lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag))
      for token, tag in pos_tags
      if is_kept(token)
  ]

# Irregular negative contractions: cutting "n't" off these leaves a non-word
# stem ("ca", "wo", "sha"), so they are expanded explicitly first.
_IRREGULAR_NEGATIONS = (
    (re.compile(r"\bcan't\b"), "can not"),
    (re.compile(r"\bwon't\b"), "will not"),
    (re.compile(r"\bshan't\b"), "shall not"),
)
# Typographic apostrophes; they are not part of string.punctuation, so they
# are straightened before the contraction and punctuation steps.
_CURLY_APOSTROPHES = re.compile("[\u2018\u2019\u02bc]")
# Any single ASCII punctuation character (same characters as PUNCTUATION)
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")
# Negation words that are unified under the token 'no'
_NEGATION_WORDS = frozenset({'not', 'nor'})


def _number_tokens(token):
  """Spell out a decimal-digit token as a list of lowercase word tokens."""
  try:
    spelled = num2words(int(token), ordinal=False)
  except (OverflowError, ValueError):
    # Number too large to spell out: keep the original token
    return [token]
  # num2words joins words with spaces, hyphens and commas
  # ("one thousand, two hundred and thirty-four"); keep only the words
  return re.findall(r"[a-z]+", spelled) or [token]


def review2words(text):
  """
  Normalize a review into a list of lemmatized tokens.

  Steps:
    1. Case normalization (lowercase)
    2. Contraction expansion for negations ("don't" -> "do not",
       "can't" -> "can not", "won't" -> "will not"), including contractions
       written with a typographic apostrophe
    3. Punctuation removal (each punctuation character becomes a space)
    4. Tokenization
    5. Negation unification ('not' and 'nor' become 'no')
    6. Number-to-word conversion; a number that spells out as several words
       yields one token per word ("25" -> 'twenty', 'five')
    7. POS-aware lemmatization (stopwords and punctuation are dropped there)

  Parameters:
    text (str): raw review text.

  Returns:
    list of str: the normalized, lemmatized tokens in reading order.
  """
  # text to lowercase
  text = text.lower()
  # Straighten typographic apostrophes so the contraction rules below apply
  text = _CURLY_APOSTROPHES.sub("'", text)
  # Transform negative contractions (don't --> do not), irregular ones first
  for contraction, expansion in _IRREGULAR_NEGATIONS:
    text = contraction.sub(expansion, text)
  text = text.replace("n't", " not")
  # Replace punctuation marks with whitespace so they never stick to a word
  text = _PUNCTUATION_RE.sub(" ", text)
  # Tokenize and normalize negative words (not, nor)
  tokens = ['no' if token in _NEGATION_WORDS else token
            for token in word_tokenize(text)]
  # Transform digits to words. isdecimal() is used instead of isdigit():
  # isdigit() also accepts characters such as superscripts ("5²"), which
  # cannot be converted to a number.
  worded = []
  for token in tokens:
    if token.isdecimal():
      worded.extend(_number_tokens(token))
    else:
      worded.append(token)
  # POS tag
  pos_tags = pos_tag(worded)
  # Lemmatize with POS context
  lemmatized = lemmatize_text(pos_tags)
  return lemmatized

def overall2label(overall):
  """
  Binarize a numerical 'overall' rating.

  Returns 0 (negative) when the rating is strictly below 4 and
  1 (positive) otherwise.
  """
  return 0 if overall < 4 else 1

In [12]:
# Check processing: print one review (row label 201) before and after normalization
print(f'Before:\n{data.loc[201,"reviewText"]}')
print(f'After:\n{review2words(data.loc[201,"reviewText"])}')

Before:
This chain fit great for my Poulan Pro electric saw. This is also an older company that has a great website to help identify exactly what model your specific saw requires. It is not directional, so you don't have to worry about any arrows facing the right way. I just used it and I forgot how powerful my saw was. Getting old ones sharpened can be questionable when new ones are this cheap.
After:
['chain', 'fit', 'great', 'poulan', 'pro', 'electric', 'saw', 'also', 'old', 'company', 'great', 'website', 'help', 'identify', 'exactly', 'model', 'specific', 'saw', 'require', 'no', 'directional', 'no', 'worry', 'arrow', 'face', 'right', 'way', 'use', 'forget', 'powerful', 'saw', 'get', 'old', 'one', 'sharpen', 'questionable', 'new', 'one', 'cheap']


## Preprocess data

In this step, the data is prepared so that it can be fed directly to a model. First, the binary target variable ('label') is created from the 'overall' rating. Then the dataset is split into train and test sets, and text normalization is applied to both. The result is cached to disk so that later runs can reuse it instead of recomputing it.

In [17]:
cache_dir = "cache"
os.makedirs(cache_dir, exist_ok=True)

# Keys present in every cache payload written by preprocess_data
_CACHE_KEYS = ("words_train", "words_test", "labels_train", "labels_test")


def _load_cache(cache_path):
    """Return the unpickled payload at cache_path, or None if it is missing or unreadable."""
    try:
        with open(cache_path, "rb") as f:
            # NOTE(security): unpickling can execute arbitrary code; only
            # point this at cache files written by this notebook.
            return pickle.load(f)
    except FileNotFoundError:
        # Plain cache miss
        return None
    except Exception as exc:
        # Corrupt or unreadable cache: report it and fall back to recomputing
        # (KeyboardInterrupt / SystemExit are deliberately not caught)
        print("Ignoring unreadable cache file:", cache_path, f"({exc!r})")
        return None


def _cache_matches(cache_data, data_train, data_test):
    """Tell whether a payload has the expected keys and the same split sizes as the inputs."""
    if not isinstance(cache_data, dict):
        return False
    if any(key not in cache_data for key in _CACHE_KEYS):
        return False
    try:
        return (len(cache_data["words_train"]) == len(data_train)
                and len(cache_data["words_test"]) == len(data_test))
    except TypeError:
        # Sizes cannot be compared (e.g. generator inputs): accept the cache
        return True


def preprocess_data(data_train, data_test, labels_train, labels_test,
                    cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """
    Tokenize the training and test reviews, with an on-disk cache.

    If the cache file exists, is readable, and holds splits of the same sizes
    as the inputs, its content is returned and nothing is recomputed (the
    returned labels are then the cached ones). Otherwise every review is
    passed through review2words and, unless caching is disabled, the result
    is written to the cache file.

    The cache is only validated by split size: after changing review2words
    or the data itself, delete the cache file to force a recomputation.

    Parameters:
        data_train, data_test: iterables of raw review texts.
        labels_train, labels_test: labels matching the two splits; returned
            unchanged when the data is recomputed.
        cache_dir (str): directory holding the cache file; created on write
            if it does not exist.
        cache_file (str or None): cache file name; None disables caching.

    Returns:
        tuple: (words_train, words_test, labels_train, labels_test), where
        the words_* items are lists with one token list per review.
    """
    cache_path = None if cache_file is None else os.path.join(cache_dir, cache_file)

    cache_data = None
    if cache_path is not None:
        cache_data = _load_cache(cache_path)
        if cache_data is not None and not _cache_matches(cache_data, data_train, data_test):
            print("Cache file does not match the input data, recomputing:", cache_file)
            cache_data = None

    if cache_data is not None:
        print("Read preprocessed data from cache file:", cache_file)
        return (cache_data['words_train'], cache_data['words_test'],
                cache_data['labels_train'], cache_data['labels_test'])

    words_train = list(map(review2words, data_train))
    words_test = list(map(review2words, data_test))

    if cache_path is not None:
        cache_data = dict(words_train=words_train, words_test=words_test,
                          labels_train=labels_train, labels_test=labels_test)
        if cache_dir:
            # A caller-supplied directory may not exist yet
            os.makedirs(cache_dir, exist_ok=True)
        with open(cache_path, "wb") as f:
            pickle.dump(cache_data, f)
        print("Wrote preprocessed data to cache file:", cache_file)

    return words_train, words_test, labels_train, labels_test

In [28]:
# Derive the binary 'label' column from the 'overall' rating
data['label'] = data['overall'].map(overall2label)

# Stratified 75/25 split of the review texts and their labels (fixed seed)
reviewText_train, reviewText_test, overall_train, overall_test = train_test_split(
    data['reviewText'],
    data['label'],
    train_size=0.75,
    test_size=0.25,
    random_state=42,
    shuffle=True,
    stratify=data['label'],
)

# Normalize both splits, going through the on-disk cache
words_train, words_test, labels_train, labels_test = preprocess_data(
    reviewText_train, reviewText_test, overall_train, overall_test, cache_dir
)

Wrote preprocessed data to cache file: preprocessed_data.pkl
