# Text Normalization

by Michael Hunziker

## Summary
In this notebook we tokenize, stem and lemmatize our data in order to prepare it as training data for a downstream NLP task.
Then we visualize the data in order to decide upon reasonable stopwords and identify malformed and outlier data that should be removed.
And we optimize for tweets.


*   Tokenize, stem and lemmatize our data
*   Identify reasonable stopwords
*   Optimize for tweets
*   Save the normalized version

<br/>

<a href="https://colab.research.google.com/github/miam-bonbon/assignment-adv-nlp/blob/main/adv_nlp_assignment_mh_02_text_normalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture

!pip install 'fhnw-nlp-utils>=0.8.0,<0.9.0'

from fhnw.nlp.utils.processing import parallelize_dataframe
from fhnw.nlp.utils.processing import is_iterable
from fhnw.nlp.utils.storage import download
from fhnw.nlp.utils.storage import save_dataframe
from fhnw.nlp.utils.storage import load_dataframe
from fhnw.nlp.utils.text import join_tokens

import pandas as pd
import numpy as np

In [None]:
from fhnw.nlp.utils.system import set_log_level
from fhnw.nlp.utils.system import system_info

set_log_level()
print(system_info())

OS name: posix
Platform name: Linux
Platform release: 6.1.85+
Python version: 3.10.12
CPU brand: AMD EPYC 7B12
CPU cores: 4
RAM: 50.99GB total and 49.58GB available
Tensorflow version: 2.17.1
GPU is NOT AVAILABLE


In [None]:
# Development switch used by later cells: when True, the save cells mount Google Drive
# and persist the intermediate DataFrames, and the reload cell re-downloads the
# tokenized data even if `data` is already defined.
DEV = False

In [None]:
%%time

!rm "./data/COVIDSenti_cleaned.parq"

download("https://github.com/miam-bonbon/assignment-adv-nlp/raw/refs/heads/main/data/COVIDSenti_cleaned.parq", "data/COVIDSenti_cleaned.parq")
data = load_dataframe("data/COVIDSenti_cleaned.parq")
data.shape

rm: cannot remove './data/COVIDSenti_cleaned.parq': No such file or directory
CPU times: user 452 ms, sys: 62.5 ms, total: 515 ms
Wall time: 2.46 s


(22516, 4)

In [None]:
data.head(3)

Unnamed: 0,tweet,label,cleaned_tweet,lang
2,"TLDR: Not SARS, possibly new coronavirus. Diff...",neg,"TLDR: Not SARS, possibly new coronavirus. Diff...",en
8,@tezuma75 Why #CCP keep on saying unknown caus...,neg,Why #CCP keep on saying unknown cause of pneu...,en
11,I always feel weird hoping for another coronav...,neg,I always feel weird hoping for another coronav...,en


Define our normalization functions (could we use the ones from the library? We'll check later)

In [None]:
def _tokenize(text, stopwords):
    """Split a text into lowercased tokens and drop every stopword

    Parameters
    ----------
    text : str, iterable
        Raw text (tokenized with NLTK's word_tokenize) or an iterable of
        ready-made tokens (used as is, no tokenization is applied)
    stopwords : set
        Tokens to drop; they are compared against the lowercased tokens

    Returns
    -------
    list
        The lowercased tokens that are not stopwords, in their original order

    Raises
    ------
    TypeError
        If text is neither a string nor an iterable
    """
    from fhnw.nlp.utils.processing import is_iterable

    if isinstance(text, str):
        from nltk.tokenize import word_tokenize
        candidates = word_tokenize(text)
    elif is_iterable(text):
        candidates = text
    else:
        raise TypeError("Only string or iterable (e.g. list) is supported. Received a "+ str(type(text)))

    kept = []
    for candidate in candidates:
        lowered = candidate.lower()
        if lowered not in stopwords:
            kept.append(lowered)
    return kept


def _tokenize_stem(text, stopwords, stemmer):
    """Tokenizes, lowercases and stems a text and removes stopwords

    Parameters
    ----------
    text : str, iterable
        The text either as string or iterable of tokens (in this case tokenization is not applied)
    stopwords : set
        A set of stopword to remove from the tokens (compared against the lowercased, unstemmed token)
    stemmer: stemmer, callable
        The stemmer to use: either an object with a ``stem`` method (e.g. SnowballStemmer)
        or a plain callable mapping a token to its stem

    Returns
    -------
    list
        The tokenized and stemmed text

    Raises
    ------
    TypeError
        If text is neither a string nor an iterable
    """
    from fhnw.nlp.utils.processing import is_iterable

    if isinstance(text, str):
        from nltk.tokenize import word_tokenize
        word_tokens = word_tokenize(text)
    elif is_iterable(text):
        word_tokens = text
    else:
        raise TypeError("Only string or iterable (e.g. list) is supported. Received a "+ str(type(text)))

    # Use the object's ``stem`` method when it has one, otherwise treat the
    # argument itself as the stemming function.
    stem = getattr(stemmer, "stem", stemmer)

    stemmed = []
    for word in word_tokens:
        lowered = word.lower()  # lowercase once, reuse for filter and stemming
        if lowered not in stopwords:
            stemmed.append(stem(lowered))
    return stemmed


def _tokenize_lemma(text, stopwords, lemmanizer, keep_ners=False):
    """Tokenizes, lowercases and lemmatizes a text and removes stopwords

    Only alphabetic, non-punctuation tokens are kept, and a token is dropped when
    either its lowercased text or its lowercased lemma is a stopword. Named
    entities (when kept) are not filtered against the stopwords.

    Parameters
    ----------
    text : str, iterable
        The text either as string or iterable of tokens (tokens are joined back into one string first)
    stopwords : set
        A set of stopword to remove from the tokens
    lemmanizer: spacy nlp pipeline
        The lemmanizer to use (must be spacy nlp pipeline)
    keep_ners: bool
        Defines if named entities (NERs) should be kept in one token

    Returns
    -------
    list
        The tokenized and lemmatized text

    Raises
    ------
    TypeError
        If text is neither a string nor an iterable
    """
    from fhnw.nlp.utils.processing import is_iterable

    if not isinstance(text, str):
        if not is_iterable(text):
            raise TypeError("Only string or iterable (e.g. list) is supported. Received a "+ str(type(text)))
        from fhnw.nlp.utils.text import join_tokens
        text = join_tokens(text, set())

    def _is_kept(tok):
        # shared filter for both code paths
        return (tok.is_alpha and not tok.is_punct
                and tok.text.lower() not in stopwords
                and tok.lemma_.lower() not in stopwords)

    # NOTE(review): spaCy v3 rule-based lemmatizers normally rely on POS tags; with
    # 'tagger' disabled the lemmas may degrade to (lowercased) surface forms - confirm.
    if not keep_ners:
        doc = lemmanizer(text, disable=['tagger', 'parser', 'ner'])
        return [tok.lemma_.lower() for tok in doc if _is_kept(tok)]

    # HanoverTagger could be an alternative but takes longer
    # see: https://textmining.wp.hs-hannover.de/Preprocessing.html#Lemmatisierung-und-Wortarterkennung
    doc = lemmanizer(text, disable=['tagger', 'parser'])

    # Read the entities once instead of on every loop iteration.
    ents = doc.ents
    n_tokens = len(doc)
    n_ents = len(ents)

    tokens = list()
    ner_idx = 0
    tok_idx = 0

    # Walk tokens and entities in parallel (both are ordered by character offset)
    # and emit each entity as one single token.
    while tok_idx < n_tokens:
        tok = doc[tok_idx]
        if ner_idx >= n_ents or tok.idx < ents[ner_idx].start_char:
            # ordinary token located before the next entity (or no entity left)
            if _is_kept(tok):
                tokens.append(tok.lemma_.lower())
            tok_idx += 1
        else:
            ent = ents[ner_idx]
            tokens.append(ent.lemma_.lower())

            # skip every token that lies inside the entity span
            tok_idx += 1
            while tok_idx < n_tokens and doc[tok_idx].idx < ent.end_char:
                tok_idx += 1

            ner_idx += 1

    return tokens


def _normalize(text, stopwords, stemmer=None, lemmanizer=None, lemma_with_ner=False):
    """Normalize a text by dispatching to the lemmatizing, stemming or plain tokenizer

    A lemmanizer takes precedence over a stemmer when both are given; without
    either, the text is only tokenized, lowercased and stripped of stopwords.

    Parameters
    ----------
    text : str, iterable
        The text either as string or iterable of tokens
    stopwords : set
        A set of stopwords to remove from the tokens
    stemmer: stemmer
        The stemmer to use (e.g. SnowballStemmer) or None to disable stemming
    lemmanizer: spacy nlp pipeline
        The lemmanizer to use (must be spacy nlp pipeline) or None to disable lemmatization
    lemma_with_ner: bool
        Defines if named entities (NERs) should be kept in one token (lemmatization only)

    Returns
    -------
    list
        The normalized tokens
    """
    use_lemma = lemmanizer is not None
    use_stem = stemmer is not None

    if use_lemma:
        return _tokenize_lemma(text, stopwords, lemmanizer, keep_ners=lemma_with_ner)
    if use_stem:
        return _tokenize_stem(text, stopwords, stemmer)
    return _tokenize(text, stopwords)

Let's use a spacy model (https://spacy.io/models/en#en_core_web_md)

In [None]:
%%capture

!pip install 'spacy>=3.0.5'
!pip install nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

import spacy
!python3 -m spacy download en_core_web_md

nlp = spacy.load("en_core_web_md")

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

stemmer = SnowballStemmer("english")
stopwords = set(stopwords.words("english"))
#stopwords = set(nlp.Defaults.stop_words)
empty_stopwords = set()

In [None]:
print(stopwords)

{'by', 'both', 'wouldn', 't', 'some', 'not', 'the', 're', 'won', "shan't", 'to', 'below', 'any', "weren't", 'more', 'should', 'under', 'd', 'weren', 'itself', 'be', "hadn't", 'hers', 'when', "haven't", 'into', 'being', 'further', "wouldn't", 'doesn', 'no', 'than', 'just', 'against', 'yourselves', 'you', 'him', 'having', 'after', 'yourself', 'whom', 'isn', 'will', "needn't", 'a', 'can', 'most', 'don', 'what', "doesn't", 'it', 'them', 'there', 'as', 'while', "that'll", 'shan', 'because', 's', 'so', "it's", 'at', 'then', 'nor', 'down', 'out', "couldn't", 'same', 'didn', 'have', 'had', 'of', 'ours', 'am', 'very', 'each', 'wasn', 'she', 'hasn', 'do', 'but', 'ourselves', "you're", "aren't", "you'll", 'y', 'themselves', 'their', 'once', 'doing', "shouldn't", "you've", 'between', "should've", 'only', 'ma', 'that', 'over', 'needn', 'me', 'these', 'm', 'll', 'for', "you'd", 'they', 'its', 'o', 'own', 'again', "mustn't", 'on', 'now', 'how', 'our', "don't", 'hadn', 'haven', "hasn't", 'or', 'until'

Let's test some examples:

In [None]:
# Smoke test on two sample sentences: plain tokenization first, then tokenization with stemming.
# String input goes through NLTK's word_tokenize inside both functions.
print(_tokenize("Coronavirus is that when youve had too many #Coronas ", stopwords=stopwords))
print(_tokenize_stem("The Frieman Scary Scale makes its debut. Wuhan Coronavirus scores  out of  N masks.", stopwords=stopwords, stemmer=stemmer))

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
%%time
data = data[data.columns.drop(list(data.filter(regex='token_clean')))] # we always append
data = parallelize_dataframe(data, _normalize, field_read="cleaned_tweet", field_write="token_clean", stopwords=stopwords, stemmer=None, lemmanizer=None, lemma_with_ner=False)

Now let's use the library

In [None]:
%%time

from fhnw.nlp.utils.normalize import normalize, tokenize, tokenize_lemma, tokenize_stem

data = data[data.columns.drop(list(data.filter(regex='token_clean')))] # we always append
data = parallelize_dataframe(data, normalize, field_read="cleaned_tweet", field_write="token_clean", stopwords=stopwords, stemmer=None, lemmanizer=None, lemma_with_ner=False)

yep, also working

Also get the (lowercased) tokens without removing any stopwords, in case we have to come up with a reasonable set of stopwords ourselves.

In [None]:
%%time
data = parallelize_dataframe(data, normalize, field_read="cleaned_tweet", field_write="token_no_stopwords_clean", stopwords=empty_stopwords, stemmer=None, lemmanizer=None, lemma_with_ner=False)

We also later need text instead of tokens

In [None]:
%%time
data = parallelize_dataframe(data, join_tokens, field_read="token_clean", field_write="text_clean", stopwords=empty_stopwords)

In [None]:
data.head(3)

Store the work so far (so we could resume if needed).

In [None]:
%%time

import pandas as pd
from google.colab import drive

if (DEV):
  # Mount Google Drive
  drive.mount('/content/drive')
  output_file_path = "/content/drive/MyDrive/COVIDSenti_tokenized_01.parq"  # save to github

  # Save the DataFrame to Parquet format
  data.to_parquet(output_file_path)

  save_dataframe(data, "data/COVIDSenti_tokenized_01.parq")

If we resume the work from this point (e.g. in a fresh runtime), we reload the tokenized data here.

In [None]:
# Reload when `data` is not defined in the notebook namespace (fresh start from this
# cell) or when DEV is set. Note: only the existence of `data` is checked, not whether it is empty.
if 'data' not in locals() or DEV:
  print("Data is not defined or DEV is True")
  # Load from github
  download("https://github.com/miam-bonbon/assignment-adv-nlp/raw/refs/heads/main/data/COVIDSenti_tokenized_01.parq", "data/COVIDSenti_tokenized_01.parq")
  data = load_dataframe("data/COVIDSenti_tokenized_01.parq")
  print(data.shape)

In [None]:
data.head()

Let's see how frequent the words (and their n-grams) are

In [None]:
from collections import Counter

def plot_ngram_counts(counter, n_most_common, title="Term frequencies"):
    """Plots the counts of the most common n-grams as a bar chart

    Each bar is annotated with its count: for the first bars the label is drawn
    in white inside the bar, for the remaining ones in black above the bar.

    Parameters
    ----------
    counter : Counter
        The counter of the n-grams
    n_most_common : int
        The n most common n-grams to plot
    title : str
        The title of the plot
    """

    import matplotlib.pyplot as plt

    # Number of leading bars whose count label is placed inside the bar
    labels_inside = 10

    # Query the counter once and derive both axes from the same result
    top = counter.most_common(n_most_common)
    x = [tag for tag, _ in top]
    y = [count for _, count in top]

    plt.bar(x, y)
    plt.title(title)
    plt.ylabel("Frequency")
    #plt.yscale('log') # set log scale for y-axis
    plt.xticks(rotation=90)
    for i, count in enumerate(y):
        inside = i < labels_inside
        plt.text(i, count, f' {count} ', rotation=90, ha='center', va='top' if inside else 'bottom', color='white' if inside else 'black')
    plt.xlim(-0.6, len(x)-0.4) # set tighter x lims
    plt.tight_layout() # change the whitespace such that all labels fit nicely
    plt.show()

In [None]:
from fhnw.nlp.utils.text import create_ngram_counts

# One unigram and one bigram frequency plot per sentiment label
for label in data["label"].unique():
  # select the rows of the current label once and reuse them for both n-gram sizes
  label_rows = data.loc[data["label"] == label]
  unigram_counts = create_ngram_counts(label_rows, 1, "token_clean")
  bigram_counts = create_ngram_counts(label_rows, 2, "token_clean")

  plot_ngram_counts(unigram_counts, 20, "Unigram Term frequencies of "+label)
  plot_ngram_counts(bigram_counts, 20, "Bigram Term frequencies of "+label)

Stopwords look alright

In [None]:
# # Extend the stopwords list
# additional_stopwords = {"example", "word", "anotherword"}  # Add your specific words here
# stopwords.update(additional_stopwords)

# # Optionally, remove words from the stopword list that are helpful for distinguishing classes
# words_to_remove = {"distinguishing", "word1", "word2"}  # Add words to remove
# stopwords -= words_to_remove

But let's optimize for tweets: NLTK has a dedicated tweet tokenizer, and we keep punctuation for now.

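If we resumed from the reload cell above, the stopwords (and the rest of the NLP setup) are not defined yet, so we set them up again here.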

In [None]:
%%capture

if ('stopwords' not in locals()):
  !pip install 'spacy>=3.0.5'
  !pip install nltk
  from nltk.corpus import stopwords
  from nltk.stem.snowball import SnowballStemmer

  import spacy
  !python3 -m spacy download en_core_web_md

  nlp = spacy.load("en_core_web_md")

  import nltk
  nltk.download('punkt')
  nltk.download('stopwords')

  stemmer = SnowballStemmer("english")
  stopwords = set(stopwords.words("english"))
  #stopwords = set(nlp.Defaults.stop_words)
  empty_stopwords = set()

In [None]:
import nltk

# Download additional NLTK resources.
# NOTE(review): none of the cells visible in this notebook reference 'twitter_samples' or
# 'averaged_perceptron_tagger' - confirm whether these downloads are still needed.
nltk.download('twitter_samples')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [None]:
from nltk.tokenize import TweetTokenizer
# Shared tokenizer instance (default settings); tweet_tokenize and tweet_tokenize_stem
# call its tokenize() method for string input.
tweet_tokenizer = TweetTokenizer()

def tweet_tokenize(text, stopwords):
    """Split a tweet into lowercased tokens and drop every stopword

    Parameters
    ----------
    text : str, iterable
        Raw text (tokenized with the module-level NLTK tweet_tokenizer) or an
        iterable of ready-made tokens (used as is, no tokenization is applied)
    stopwords : set
        Tokens to drop; they are compared against the lowercased tokens

    Returns
    -------
    list
        The lowercased tokens that are not stopwords, in their original order

    Raises
    ------
    TypeError
        If text is neither a string nor an iterable
    """
    from fhnw.nlp.utils.processing import is_iterable

    if isinstance(text, str):
        raw_tokens = tweet_tokenizer.tokenize(text)
    elif is_iterable(text):
        raw_tokens = text
    else:
        raise TypeError("Only string or iterable (e.g. list) is supported. Received a "+ str(type(text)))

    lowered_tokens = (token.lower() for token in raw_tokens)
    return [token for token in lowered_tokens if token not in stopwords]


def tweet_tokenize_stem(text, stopwords, stemmer):
    """Tokenizes, lowercases and stems a tweet and removes stopwords

    Parameters
    ----------
    text : str, iterable
        The text either as string (tokenized with the module-level NLTK tweet_tokenizer)
        or iterable of tokens (in this case tokenization is not applied)
    stopwords : set
        A set of stopword to remove from the tokens (compared against the lowercased, unstemmed token)
    stemmer: stemmer, callable
        The stemmer to use: either an object with a ``stem`` method (e.g. SnowballStemmer)
        or a plain callable mapping a token to its stem

    Returns
    -------
    list
        The tokenized and stemmed text

    Raises
    ------
    TypeError
        If text is neither a string nor an iterable
    """
    from fhnw.nlp.utils.processing import is_iterable

    if isinstance(text, str):
        word_tokens = tweet_tokenizer.tokenize(text)
    elif is_iterable(text):
        word_tokens = text
    else:
        raise TypeError("Only string or iterable (e.g. list) is supported. Received a "+ str(type(text)))

    # Use the object's ``stem`` method when it has one, otherwise treat the
    # argument itself as the stemming function.
    stem = getattr(stemmer, "stem", stemmer)

    stemmed = []
    for word in word_tokens:
        lowered = word.lower()  # lowercase once, reuse for filter and stemming
        if lowered not in stopwords:
            stemmed.append(stem(lowered))
    return stemmed


def tweet_tokenize_lemma(text, stopwords, lemmanizer, keep_ners=False):
    """Tokenizes, lowercases and lemmatizes a text and removes stopwords

    Despite its name this variant does not use the NLTK tweet tokenizer:
    tokenization is done by the spaCy pipeline passed as lemmanizer.
    Only alphabetic, non-punctuation tokens are kept, and a token is dropped when
    either its lowercased text or its lowercased lemma is a stopword. Named
    entities (when kept) are not filtered against the stopwords.

    Parameters
    ----------
    text : str, iterable
        The text either as string or iterable of tokens (tokens are joined back into one string first)
    stopwords : set
        A set of stopword to remove from the tokens
    lemmanizer: spacy nlp pipeline
        The lemmanizer to use (must be spacy nlp pipeline)
    keep_ners: bool
        Defines if named entities (NERs) should be kept in one token

    Returns
    -------
    list
        The tokenized and lemmatized text

    Raises
    ------
    TypeError
        If text is neither a string nor an iterable
    """
    from fhnw.nlp.utils.processing import is_iterable

    if not isinstance(text, str):
        if not is_iterable(text):
            raise TypeError("Only string or iterable (e.g. list) is supported. Received a "+ str(type(text)))
        from fhnw.nlp.utils.text import join_tokens
        text = join_tokens(text, set())

    def _is_kept(tok):
        # shared filter for both code paths
        return (tok.is_alpha and not tok.is_punct
                and tok.text.lower() not in stopwords
                and tok.lemma_.lower() not in stopwords)

    # NOTE(review): spaCy v3 rule-based lemmatizers normally rely on POS tags; with
    # 'tagger' disabled the lemmas may degrade to (lowercased) surface forms - confirm.
    if not keep_ners:
        doc = lemmanizer(text, disable=['tagger', 'parser', 'ner'])
        return [tok.lemma_.lower() for tok in doc if _is_kept(tok)]

    # HanoverTagger could be an alternative but takes longer
    # see: https://textmining.wp.hs-hannover.de/Preprocessing.html#Lemmatisierung-und-Wortarterkennung
    doc = lemmanizer(text, disable=['tagger', 'parser'])

    # Read the entities once instead of on every loop iteration.
    ents = doc.ents
    n_tokens = len(doc)
    n_ents = len(ents)

    tokens = list()
    ner_idx = 0
    tok_idx = 0

    # Walk tokens and entities in parallel (both are ordered by character offset)
    # and emit each entity as one single token.
    while tok_idx < n_tokens:
        tok = doc[tok_idx]
        if ner_idx >= n_ents or tok.idx < ents[ner_idx].start_char:
            # ordinary token located before the next entity (or no entity left)
            if _is_kept(tok):
                tokens.append(tok.lemma_.lower())
            tok_idx += 1
        else:
            ent = ents[ner_idx]
            tokens.append(ent.lemma_.lower())

            # skip every token that lies inside the entity span
            tok_idx += 1
            while tok_idx < n_tokens and doc[tok_idx].idx < ent.end_char:
                tok_idx += 1

            ner_idx += 1

    return tokens


def _tweet_normalize(text, stopwords, stemmer=None, lemmanizer=None, lemma_with_ner=False):
    """Normalize a tweet by dispatching to the tweet-specific lemmatizing, stemming or plain tokenizer

    A lemmanizer takes precedence over a stemmer when both are given; without
    either, the text is only tokenized, lowercased and stripped of stopwords.

    Parameters
    ----------
    text : str, iterable
        The text either as string or iterable of tokens
    stopwords : set
        A set of stopwords to remove from the tokens
    stemmer: stemmer
        The stemmer to use (e.g. SnowballStemmer) or None to disable stemming
    lemmanizer: spacy nlp pipeline
        The lemmanizer to use (must be spacy nlp pipeline) or None to disable lemmatization
    lemma_with_ner: bool
        Defines if named entities (NERs) should be kept in one token (lemmatization only)

    Returns
    -------
    list
        The normalized tokens
    """
    use_lemma = lemmanizer is not None
    use_stem = stemmer is not None

    if use_lemma:
        return tweet_tokenize_lemma(text, stopwords, lemmanizer, keep_ners=lemma_with_ner)
    if use_stem:
        return tweet_tokenize_stem(text, stopwords, stemmer)
    return tweet_tokenize(text, stopwords)

Now let's use the tweet tokenizer (`tweet_tokenize`)

In [None]:
%%time
data = data[data.columns.drop(list(data.filter(regex='token_clean_tweet_tokenize')))] # we always append
data = parallelize_dataframe(data, _tweet_normalize, field_read="cleaned_tweet", field_write="token_clean_tweet_tokenize", stopwords=stopwords, stemmer=None, lemmanizer=None, lemma_with_ner=False)

In [None]:
data.head(3)

Check the n-grams for tweet optimized

In [None]:
from fhnw.nlp.utils.text import create_ngram_counts

# One unigram and one bigram frequency plot per sentiment label, based on the tweet-tokenized column
for label in data["label"].unique():
  # select the rows of the current label once and reuse them for both n-gram sizes
  label_rows = data.loc[data["label"] == label]
  unigram_counts = create_ngram_counts(label_rows, 1, "token_clean_tweet_tokenize")
  bigram_counts = create_ngram_counts(label_rows, 2, "token_clean_tweet_tokenize")

  plot_ngram_counts(unigram_counts, 20, "Unigram Term frequencies of tweet optimized "+label)
  plot_ngram_counts(bigram_counts, 20, "Bigram Term frequencies of tweet optimized "+label)

Nice! We keep our hashtags.

In [None]:
%%time

import pandas as pd
from google.colab import drive

if (DEV):
  # Mount Google Drive
  drive.mount('/content/drive')
  output_file_path = "/content/drive/MyDrive/COVIDSenti_tokenized_02.parq"  # save to github

  # Save the DataFrame to Parquet format
  data.to_parquet(output_file_path)

  save_dataframe(data, "data/COVIDSenti_tokenized_02.parq")