#### Lemmatization of COPOM Minutes

This notebook prepares textual data for Natural Language Processing (NLP) by performing lemmatization.
 * The script lowercases and tokenizes the raw text, removes common English stop words as well as non-alphabetic and single-character tokens, and reduces the remaining words to their base or dictionary form (lemmatization).
 * It reads text files from the data/processed/copom_minutes_txt folder and saves the processed, lemmatized text into the data/processed/copom_minutes_lemmatized folder.

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import glob
import os

In [2]:
# Input folder: the *.txt files found here are read and lemmatized.
FOLDER_MINUTES_TXT = "./data/processed/copom_minutes_txt"
# Output folder: each lemmatized file is written here under its original file name.
FOLDER_MINUTES_LEMMATIZED = "./data/processed/copom_minutes_lemmatized"

In [3]:
def get_wordnet_pos(treebank_tag):
    """
    Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    The WordNet lemmatizer needs a WordNet-style part of speech to pick the
    right base form, while nltk.pos_tag() emits Penn Treebank tags.

    Parameters:
    - treebank_tag (str): The POS tag from nltk.pos_tag().

    Returns:
    - str: The WordNet POS constant for adjectives (tags starting with 'J'),
      verbs ('V'), nouns ('N') or adverbs ('R'). Any tag that is not
      recognized falls back to the noun constant.
    """
    wordnet = nltk.corpus.wordnet

    # Only the leading letter of the Treebank tag decides the WordNet category.
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos

    # Unrecognized tags are treated as nouns.
    return wordnet.NOUN

In [4]:
def preprocess_text(text):
    """
    Lowercase, tokenize, POS-tag, filter and lemmatize a string of text.

    The text is lowercased and tokenized, and the full token sequence is
    POS-tagged. Only afterwards are tokens discarded: English stop words,
    tokens that are not purely alphabetic, and single-character tokens are
    dropped. Each surviving token is lemmatized using the WordNet POS derived
    from its tag.

    Parameters:
    - text (str): The raw text to be processed.

    Returns:
    - List[str]: The lemmas of the kept tokens, in their original order.
    """
    lowered = text.lower()

    words = word_tokenize(lowered)
    english_stops = set(stopwords.words('english'))
    # Tagging runs on the unfiltered sequence, so stop words and punctuation
    # are still present as context when each tag is assigned.
    tagged_words = nltk.pos_tag(words)

    wn_lemmatizer = WordNetLemmatizer()

    def is_content_word(token):
        # Keep alphabetic tokens longer than one character that are not stop words.
        return len(token) > 1 and token.isalpha() and token not in english_stops

    return [
        wn_lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag))
        for token, tag in tagged_words
        if is_content_word(token)
    ]

In [5]:
# Lemmatize every .txt file in the input folder and write the result, one
# paragraph per line, to a file of the same name in the output folder.
filepaths = glob.glob(f"{FOLDER_MINUTES_TXT}/*.txt")

# Create the output folder up front; otherwise the first open(..., 'w') below
# raises FileNotFoundError when the folder does not exist yet.
os.makedirs(FOLDER_MINUTES_LEMMATIZED, exist_ok=True)

for path in filepaths:
    filename = os.path.basename(path)

    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Paragraphs are separated by blank lines; whitespace-only chunks are dropped.
    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]

    lemm_texts = [preprocess_text(p) for p in paragraphs]

    with open(f"{FOLDER_MINUTES_LEMMATIZED}/{filename}", 'w', encoding='utf-8') as f:
        # One output line per input paragraph. A paragraph with no surviving
        # tokens still produces an (empty) line, so line numbers stay aligned
        # with paragraph positions.
        for paragraph in lemm_texts:
            line = ' '.join(paragraph)
            f.write(line + '\n')