### Preprocessing Goals

- This notebook builds a robust text preprocessing pipeline for sentiment classification.
  The output is a clean text column, `normalized_review`, optimized for TF-IDF and linear models (in this project: Logistic Regression).

- Input column: `review_body`
- Output column: `normalized_review`

- The pipeline keeps sentiment-relevant signals (e.g., negation and strong punctuation) and removes noisy artifacts.



In [1]:
import html
import re
import warnings
from typing import Iterable

import contractions
import spacy
from spacy.lang.en.stop_words import STOP_WORDS


In [2]:
# Main pipeline: later cells use it for tokenization and lemmas. The "ner" and
# "textcat" components are disabled for this instance.
nlp = spacy.load("en_core_web_md", disable=["ner", "textcat"])

In [3]:
def load_nlp_ner():
    """Load the spaCy pipeline used for ORG masking, or return ``None`` on failure.

    The "parser" and "textcat" components are disabled for this instance.
    Loading stays best-effort: any failure is reported through a
    ``RuntimeWarning`` and ``None`` is returned, which
    ``mask_org_entities_batch`` treats as "skip masking".

    Returns:
        The loaded pipeline, or ``None`` when it could not be loaded.
    """
    try:
        return spacy.load("en_core_web_md", disable=["parser", "textcat"])
    except Exception as exc:
        # Keep the fallback, but make it visible: silently returning None would
        # change the produced features (no tok_org markers) without any hint.
        warnings.warn(
            f"Could not load 'en_core_web_md' for NER ({exc!r}); ORG masking is disabled.",
            RuntimeWarning,
            stacklevel=2,
        )
        return None


In [4]:
# May be None when the model could not be loaded; mask_org_entities_batch then
# returns the texts unchanged.
nlp_ner = load_nlp_ner()

In [None]:
# NOTE(review): this import lives outside the notebook's import cell; later
# cells (e.g. the pd.DataFrame annotations) rely on it having been run.
import pandas as pd

# Tab-separated raw review file, addressed relative to the working directory.
file_path = "../data/raw/amazon_reviews_us_Digital_Software_v1_00.tsv"

# Only the first NROWS rows of the file are read.
NROWS = 10000
# NOTE(review): parsed with pandas' default quoting rules — confirm that quote
# characters inside review text do not merge or truncate rows.
loaded_data = pd.read_csv(file_path, sep="\t", encoding="utf-8", nrows=NROWS)

In [6]:
# Review text only; missing reviews are replaced by empty strings.
data = loaded_data["review_body"].fillna("")

In [None]:
# NOTE(review): `data` was already built with fillna(""), so this count is
# always 0; check loaded_data["review_body"] to see the real number of gaps.
data.isnull().sum()

In [None]:
data.describe()

In [None]:
data.shape

---

#### Regular Expression Patterns

In [9]:
# Patterns used by normalize_text (except _RE_INT, which map_sentiment_number uses).

# http(s):// URLs and bare "www." hosts, up to the next whitespace.
_REGEX_URL = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)  # matched case-insensitively
# local-part@domain.tld style e-mail addresses.
_REGEX_EMAIL = re.compile(r"\b[\w.\-+]+@[\w.\-]+\.\w+\b")
# "@" followed by ASCII letters, digits or underscores.
_REGEX_USER = re.compile(r"@[A-Za-z0-9_]+")
# "#" followed by ASCII letters, digits or underscores; group 1 is the tag text.
# NOTE(review): this also matches the "#34" inside an HTML character reference
# such as "&#34;".
_REGEX_HASHTAG = re.compile(r"#([A-Za-z0-9_]+)")
# Anything between "<" and the next ">".
_REGEX_HTML_TAG = re.compile(r"<[^>]+>")
# Runs of carriage returns, newlines and tabs.
_REGEX_CONTROL = re.compile(r"[\r\n\t]+")
# Runs of any whitespace.
_REGEX_MULTI_WS = re.compile(r"\s+")
# The same ASCII letter (same case) three or more times in a row; group 1 is the letter.
_REGEX_REPEAT_CHAR = re.compile(r"([A-Za-z])\1{2,}")
# Three or more consecutive dots.
_REGEX_DOTS = re.compile(r"\.{3,}")
# A string consisting only of digits.
_RE_INT = re.compile(r"^\d+$")


#### Special Tokens

These tokens are intentionally preserved because they carry sentiment information.

The pipeline uses structured marker tokens:

In [10]:
# Marker tokens that normalize_token returns unchanged; the membership test
# runs before the numeric, punctuation and lemma rules.
_SPECIAL_TOKENS = {
    "tok_url", "tok_email", "tok_user", "tok_hashtag",
    "tok_num", "tok_num_1", "tok_num_2", "tok_num_3", "tok_num_4", "tok_num_5",
    "tok_excl", "tok_excl_multi",
    "tok_q", "tok_q_multi",
    "tok_qex",
    "tok_org"
}


In [11]:
# Words that open a negation scope in apply_negation_scope and that
# normalize_token keeps even when stopword removal is enabled.
# NOTE(review): membership is tested on already-normalized tokens, after
# contractions were expanded — confirm that apostrophe forms such as "can't"
# can still occur at that point.
_NEGATIONS = {
    "not", "no", "never", "none", "cannot", "can't", "dont", "nothing", "neither"
}

In [12]:
# Tokens exempt from normalize_token's "drop tokens shorter than 2 characters" rule.
# NOTE(review): only "i" is shorter than 2 characters, so the other entries
# never change the outcome of that check.
_EXCEPTIONS = {
    "i", "you", "we", "it", "they"
}

In [13]:
# Punctuation markers that end a negation scope; this is the set that
# apply_negation_scope consults.
_SCOPE_BREAKERS = {"tok_excl", "tok_excl_multi", "tok_q", "tok_q_multi", "tok_qex"}

In [14]:
# NOTE(review): superset of _SCOPE_BREAKERS that additionally lists the
# conjunctions "but", "however" and "though". No visible cell reads this name —
# apply_negation_scope uses _SCOPE_BREAKERS — so these words do not currently
# end a negation scope. Decide which set is intended and remove the other.
SCOPE_BREAKERS = {
    "tok_excl", "tok_excl_multi",
    "tok_q", "tok_q_multi",
    "tok_qex",
    "but", "however", "though"
}

---

#### NER

Organization names detected by spaCy NER (entities labeled `ORG`) are replaced with `tok_org`.

In [15]:
def mask_org_entities_batch(texts, nlp_ner, batch_size=1024):
    """Replace every entity labeled ``ORG`` in each text with `` tok_org ``.

    Args:
        texts: Iterable of strings. It is materialized into a list first, so
            one-shot iterables such as generators are supported.
        nlp_ner: Object exposing ``pipe(texts, batch_size=...)`` that yields one
            doc per text; each doc provides ``ents`` whose items carry
            ``label_``, ``start_char`` and ``end_char``. ``None`` disables
            masking.
        batch_size: Forwarded to ``nlp_ner.pipe``.

    Returns:
        A list with one string per input text, in input order. Texts without
        ORG entities are returned unchanged.
    """
    # The texts are needed twice: once by the pipeline and once to pair every
    # text with its doc. With a generator, the pipeline and zip() would pull
    # from the same iterator and the pairs would be misaligned or dropped.
    texts = list(texts)
    if nlp_ner is None:
        return texts

    out = []
    for text, doc in zip(texts, nlp_ner.pipe(texts, batch_size=batch_size)):
        parts, last = [], 0
        for ent in doc.ents:
            if ent.label_ != "ORG":
                continue
            # Keep the text before the entity, then emit the marker in its place.
            parts.append(text[last:ent.start_char])
            parts.append(" tok_org ")
            last = ent.end_char
        # Tail after the last masked entity (the whole text if nothing was masked).
        parts.append(text[last:])
        out.append("".join(parts))

    return out


#### Convert contractions

Expands common English contractions into their full forms (via the `contractions` package).

In [16]:
def expand_contractions(text: str) -> str:
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

#### Text normalization

URLs, emails, mentions, hashtags, and punctuation markers are normalized to `tok_*` tokens.

In [None]:
def normalize_text(text: str) -> str:
    """Clean one raw review and replace noisy spans with ``tok_*`` markers.

    Steps, in order: decode HTML character references, expand contractions,
    strip HTML tags, replace URLs / e-mail addresses / @mentions / #hashtags
    with markers, flatten control whitespace, drop runs of three or more dots,
    squeeze letters repeated three or more times down to two, turn ``!`` / ``?``
    runs into markers, lowercase, and collapse whitespace.

    Args:
        text: Raw text. ``None`` yields ``""``; any other non-string value is
            converted with ``str`` first.

    Returns:
        The lowercased, whitespace-collapsed text.
    """
    if text is None:
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Decode character references first (&#34; -> ", &#39; -> ', &amp; -> &).
    # Left encoded, the "#34" inside "&#34;" is matched by the hashtag pattern
    # and escaped apostrophes hide contractions from expand_contractions.
    text = html.unescape(text)
    text = expand_contractions(text)

    text = _REGEX_HTML_TAG.sub(" ", text)
    # URLs before e-mails before mentions: each later pattern could otherwise
    # match a fragment of what an earlier one replaces as a whole.
    text = _REGEX_URL.sub(" tok_url ", text)
    text = _REGEX_EMAIL.sub(" tok_email ", text)
    text = _REGEX_USER.sub(" tok_user ", text)
    text = _REGEX_HASHTAG.sub(r" tok_hashtag \1 ", text)

    # A single pass is enough: none of the following substitutions can
    # introduce new control characters.
    text = _REGEX_CONTROL.sub(" ", text)
    text = _REGEX_DOTS.sub(" ", text)
    text = _REGEX_REPEAT_CHAR.sub(r"\1\1", text)

    # keep sentiment punctuation as explicit markers before dropping punctuation tokens
    # (mixed "?!" / "!?" first, then runs, then single marks)
    text = re.sub(r"\?\!|\!\?", " tok_qex ", text)
    text = re.sub(r"\!{2,}", " tok_excl_multi ", text)
    text = re.sub(r"\?{2,}", " tok_q_multi ", text)
    text = re.sub(r"\!", " tok_excl ", text)
    text = re.sub(r"\?", " tok_q ", text)

    text = text.lower()
    text = _REGEX_MULTI_WS.sub(" ", text).strip()
    return text

In [18]:

# NOTE(review): despite its name, `normalized_reviews` holds the raw review
# strings here; the normalized text is only produced by the comprehension below.
normalized_reviews = data.astype(str).to_list()
# Regex-level normalization only; this writes a new column into loaded_data in place.
loaded_data["normalized_review"] = [normalize_text(x) for x in normalized_reviews]


In [None]:
loaded_data[["review_body", "normalized_review"]].head(10)

In [None]:
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", 10)

In [None]:
loaded_data[["review_body", "normalized_review"]].tail(10)

In [None]:
loaded_data[["review_body", "normalized_review"]].iloc[10:31]

In [None]:
loaded_data[["review_body", "normalized_review"]].sample(20)

#### Map sentiment numbers

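Maps numeric tokens to marker tokens: the integers 1–5 become `tok_num_1` … `tok_num_5` (kept distinct because small integers in reviews are often star ratings), and every other number becomes `tok_num`.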

In [24]:
def map_sentiment_number(token) -> str:
    """Return the numeric marker for ``token``, or ``""`` if it is not a number.

    All-digit tokens whose value lies in 1..5 map to ``tok_num_<value>``; any
    other all-digit token, and any token flagged ``like_num``, maps to
    ``tok_num``.
    """
    digits = token.text.strip()

    if _RE_INT.match(digits) is None:
        # Not a plain integer: fall back to the token's own number flag.
        return "tok_num" if token.like_num else ""

    number = int(digits)
    return f"tok_num_{number}" if 1 <= number <= 5 else "tok_num"

#### Token normalization

Lemmatization + numeric mapping (`tok_num`, `tok_num_1` … `tok_num_5`) + optional stopword removal.

In [25]:

def normalize_token(token, use_lemma: bool = True, remove_stopwords: bool = False) -> str:
    """Turn one token into its normalized string, or ``""`` to drop it.

    Args:
        token: Token object providing ``text``, ``lemma_``, ``lower_``,
            ``is_punct``, ``is_space`` and ``like_num``.
        use_lemma: Use the lowercased lemma when one is available; otherwise
            the lowercased surface form is used.
        remove_stopwords: Drop words found in ``STOP_WORDS`` unless they are
            listed in ``_NEGATIONS``.

    Returns:
        A marker token, a numeric marker, the normalized word, or ``""``.
    """
    surface = token.text.strip()
    if not surface:
        return ""

    # Marker tokens pass through untouched.
    if surface in _SPECIAL_TOKENS:
        return surface

    # Numbers are mapped before the punctuation / length filters apply.
    number_marker = map_sentiment_number(token)
    if number_marker:
        return number_marker

    if token.is_punct or token.is_space:
        return ""

    if use_lemma and token.lemma_ and token.lemma_ != "-PRON-":
        word = token.lemma_.lower()
    else:
        word = token.lower_
    # Trim punctuation left attached to either end of the word.
    word = word.strip("`'\".,;:()[]{}<>|\\/")

    too_short = len(word) < 2 and word not in _EXCEPTIONS
    if not word or too_short:
        return ""

    is_droppable_stopword = word in STOP_WORDS and word not in _NEGATIONS
    if remove_stopwords and is_droppable_stopword:
        return ""

    return word


In [None]:

# Run the regex-normalized reviews through the main pipeline in batches.
docs = nlp.pipe(
    loaded_data["normalized_review"].fillna("").astype(str).tolist(),
    batch_size=1024
)

# Rebuild each review from the tokens normalize_token keeps (default options);
# tokens mapped to "" are dropped.
normalized_reviews = [
    " ".join(tok for tok in (normalize_token(t) for t in doc) if tok)
    for doc in docs
]


In [None]:

loaded_data["normalized_review"] = normalized_reviews
loaded_data[["review_body", "normalized_review"]].head(20)


In [None]:
loaded_data[["review_body", "normalized_review"]].sample(20)  # random

In [None]:

# NOTE(review): this cell repeats the earlier token-normalization pass. By now
# loaded_data["normalized_review"] already holds that pass's output, so the
# text is tokenized and normalized a second time and re-running the cell keeps
# transforming its own result. Looks like a leftover copy — confirm and remove.
docs = nlp.pipe(
    loaded_data["normalized_review"].fillna("").astype(str).tolist(),
    batch_size=1024
)

normalized_reviews = [
    " ".join(
        tok for tok in (normalize_token(t) for t in doc) if tok
    )
    for doc in docs
]


In [None]:
loaded_data["normalized_review"] = normalized_reviews
loaded_data[["review_body", "normalized_review"]].head(20)

#### Negation scope

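Negation handling: after each negation word, up to `window` of the following tokens are additionally emitted with a `neg_` prefix (the original tokens are kept). Marker tokens (`tok_*`) and other negation words are skipped, and a punctuation marker ends the scope.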

In [37]:

def apply_negation_scope(tokens: list[str], window: int = 3) -> list[str]:
    """Add ``neg_``-prefixed copies of the tokens that follow a negation word.

    Every input token is kept in order. Directly after each token found in
    ``_NEGATIONS``, up to ``window`` of the following tokens are appended with
    a ``neg_`` prefix. Empty tokens, ``tok_*`` markers and other negation words
    are skipped without using up the window; a token in ``_SCOPE_BREAKERS``
    ends the scope.

    Args:
        tokens: Normalized tokens of one text.
        window: Maximum number of ``neg_`` tokens emitted per negation word.

    Returns:
        A new list containing the original tokens plus the ``neg_`` tokens.
    """
    result: list[str] = []
    total = len(tokens)

    for position, current in enumerate(tokens):
        result.append(current)
        if current not in _NEGATIONS:
            continue

        marked = 0
        for lookahead in range(position + 1, total):
            follower = tokens[lookahead]
            if marked >= window or follower in _SCOPE_BREAKERS:
                break
            if not follower or follower.startswith("tok_") or follower in _NEGATIONS:
                continue
            result.append(f"neg_{follower}")
            marked += 1

    return result


#### Core Preprocessing Functions

**`preprocess_one`**

Preprocesses a single text sample end-to-end.
It applies text normalization, token normalization, and negation-scope expansion.
Useful for debugging, unit checks, and single-input inference tests.

**`preprocess_series`**

Preprocesses a collection of texts in batch mode.
It runs ORG masking with `nlp_ner.pipe`, then normalization and token processing with `nlp.pipe`.
Designed for efficient large-scale processing and consistent output formatting.

**`preprocess_dataframe`**

Preprocesses a full DataFrame and writes the cleaned output to a target column.
It validates input columns, runs the batched preprocessing pipeline, and returns a copied DataFrame.
This is the main entry point used in this notebook for final dataset generation.


In [38]:
def preprocess_one(text: str, nlp, use_lemma: bool = True, remove_stopwords: bool = False, negation_window: int = 3, nlp_ner=None) -> str:
    """Preprocess a single text end-to-end.

    Args:
        text: Raw text; ``None`` yields ``""``.
        nlp: Pipeline called on the normalized text to produce tokens.
        use_lemma: Forwarded to ``normalize_token``.
        remove_stopwords: Forwarded to ``normalize_token``.
        negation_window: Forwarded to ``apply_negation_scope`` as ``window``.
        nlp_ner: Optional NER pipeline. When given, ORG entities are masked on
            the raw text before normalization, in the same order
            ``preprocess_series`` applies its steps, so single-text output can
            match batch output. The default ``None`` performs no masking.

    Returns:
        The space-joined tokens, including ``neg_`` features.
    """
    if text is None:
        return ""
    if nlp_ner is not None:
        # Mask on the raw text: entity offsets refer to the text that the NER
        # pipeline was given.
        text = mask_org_entities_batch([str(text)], nlp_ner, batch_size=1)[0]

    clean = normalize_text(text)
    if not clean:
        return ""
    doc = nlp(clean)
    tokens = [normalize_token(tok, use_lemma=use_lemma, remove_stopwords=remove_stopwords) for tok in doc]
    tokens = [t for t in tokens if t]
    tokens = apply_negation_scope(tokens, window=negation_window)
    return " ".join(tokens)


def preprocess_series(
    texts: Iterable[str],
    nlp,
    nlp_ner=None,
    batch_size: int = 1024,
    use_lemma: bool = True,
    remove_stopwords: bool = False,
    negation_window: int = 3,
) -> list[str]:
    """Preprocess a collection of texts in batch mode.

    Args:
        texts: Iterable of raw texts. ``None`` and float NaN entries are
            treated as empty text; other values are converted with ``str``.
        nlp: Pipeline whose ``pipe`` produces the tokens.
        nlp_ner: Optional NER pipeline used for ORG masking; ``None`` skips it.
        batch_size: Forwarded to both pipelines.
        use_lemma: Forwarded to ``normalize_token``.
        remove_stopwords: Forwarded to ``normalize_token``.
        negation_window: Forwarded to ``apply_negation_scope`` as ``window``.

    Returns:
        One space-joined token string per input text, in input order.
    """
    # str(None) / str(nan) would inject the words "None" / "nan" as features;
    # map missing values to "" instead, as normalize_text does for None.
    raw_texts = [
        "" if item is None or (isinstance(item, float) and item != item) else str(item)
        for item in texts
    ]

    # 1) NER masking (org)
    masked_texts = mask_org_entities_batch(raw_texts, nlp_ner, batch_size=batch_size)

    # 2) regex normalization
    normalized_texts = (normalize_text(t) for t in masked_texts)

    # 3) token pipeline
    docs = nlp.pipe(normalized_texts, batch_size=batch_size)

    out: list[str] = []
    for doc in docs:
        tokens = [normalize_token(tok, use_lemma=use_lemma, remove_stopwords=remove_stopwords) for tok in doc]
        tokens = [t for t in tokens if t]
        tokens = apply_negation_scope(tokens, window=negation_window)
        out.append(" ".join(tokens))
    return out

In [39]:

def preprocess_dataframe(
    df: pd.DataFrame,
    text_col: str = "review_body",
    output_col: str = "normalized_review",
    use_lemma: bool = True,
    remove_stopwords: bool = False,
    negation_window: int = 3,
    batch_size: int = 1024,
) -> pd.DataFrame:
    """Return a copy of ``df`` with the preprocessed text stored in ``output_col``.

    The pipelines are taken from the module-level names ``nlp`` and ``nlp_ner``.

    Args:
        df: Input frame; it is not modified.
        text_col: Column holding the raw text; missing values become ``""``.
        output_col: Column that receives the processed text.
        use_lemma: Forwarded to ``preprocess_series``.
        remove_stopwords: Forwarded to ``preprocess_series``.
        negation_window: Forwarded to ``preprocess_series``.
        batch_size: Forwarded to ``preprocess_series``.

    Returns:
        A copy of ``df`` including ``output_col``.

    Raises:
        ValueError: If ``text_col`` is not a column of ``df``.
    """
    if text_col not in df.columns:
        raise ValueError(f"Column '{text_col}' not found.")

    source_texts = df[text_col].fillna("").astype(str).values
    options = dict(
        batch_size=batch_size,
        use_lemma=use_lemma,
        remove_stopwords=remove_stopwords,
        negation_window=negation_window,
    )
    cleaned = preprocess_series(source_texts, nlp=nlp, nlp_ner=nlp_ner, **options)

    # Copy only after processing succeeded; the caller's frame stays untouched.
    result = df.copy()
    result[output_col] = cleaned
    return result


The processed text is saved to the `normalized_review` column.

In [None]:
# Final run: normalized_review is rebuilt from the raw review_body column,
# replacing whatever the exploratory cells above wrote into that column.
# Unlike those cells, stopword removal is enabled here.
out_df = preprocess_dataframe(
    df=loaded_data,
    text_col="review_body",
    output_col="normalized_review",
    use_lemma=True,
    remove_stopwords=True,
    negation_window=3,
    batch_size=1024
)

# preprocess_dataframe returns a copy, so rebind the working frame to it.
loaded_data = out_df
loaded_data[["review_body", "normalized_review"]].head(20)


In [None]:
loaded_data[["review_body", "normalized_review"]].sample(20)