In [1]:
from num2words.lang_PL import suffixes

default_installations = True
if default_installations:
    !pip install -q num2words autocorrect
else:
    import requests

    text_file_path = "requirements__Ch4_Preprocessing_Pipeline.txt"
    url = "https://raw.githubusercontent.com/PacktPublishing/Mastering-NLP-from-Foundations-to-LLMs/main/Chapter4_notebooks/" + text_file_path
    res = requests.get(url)
    with open(text_file_path, "w") as f:
        f.write(res.text)

    !pip install -r requirements__Ch4_Preprocessing_Pipeline.txt

In [3]:
import re
from num2words import num2words
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from autocorrect import Speller

# Fetch the NLTK data packages for this notebook. The log recorded under this
# cell shows packages that are already present being reported as up-to-date.
nltk.download('punkt')      # not referenced by name in the visible cells — TODO confirm it is still needed
nltk.download('stopwords')  # corpus read via stopwords.words('english') in remove_stop_words
nltk.download('wordnet')    # presumably the data behind WordNetLemmatizer used in lemmatizing — confirm

[nltk_data] Downloading package punkt to /Users/llouis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/llouis/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /Users/llouis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
def decode(text):
    """Extract the subject and body from a tagged message as one plain string.

    Newlines, carriage returns, tabs and hyphens are first replaced by single
    spaces. The first ``<SUBJECT LINE>...<END>`` span and the first
    ``<BODY TEXT>...<END>`` span are then looked up in the flattened text.

    Args:
        text: Raw message string containing the tag markers.

    Returns:
        The subject followed by ". " (when found) concatenated with the body
        followed by "." (when found). An empty string if neither span exists.
    """
    flattened = re.sub("\\n|\\r|\\t|-", " ", text)
    pieces = []

    subject_hit = re.search(r"<SUBJECT LINE>(.*?)<END>", flattened, flags=re.S)
    if subject_hit is not None:
        # Trailing space separates the subject sentence from the body.
        pieces.append(subject_hit.group(1) + ". ")

    body_hit = re.search(r"<BODY TEXT>(.*?)<END>", flattened, flags=re.S)
    if body_hit is not None:
        pieces.append(body_hit.group(1) + ".")

    return "".join(pieces)


def digits_to_words(match):
    """Replacement callback that spells out a matched number in words.

    Args:
        match: Object whose item ``0`` is the matched text (e.g. an ``re``
            match), such as "2", "1st" or "3RD".

    Returns:
        Whatever ``num2words`` returns for the digits: requested as an ordinal
        when the lower-cased text ends in st/nd/rd/th (the two-character ending
        is stripped first), otherwise as a cardinal.
    """
    token = match[0].lower()
    ordinal_endings = ('st', 'nd', 'rd', 'th')

    if token[-2:] not in ordinal_endings:
        return num2words(token, to='cardinal')

    # Drop the two-letter ending so only the digits are passed on.
    return num2words(token[:-2], to='ordinal')


def spelling_correction(text):
    """Run every whitespace-separated token of ``text`` through a Speller.

    Args:
        text: Input string; it is split on whitespace.

    Returns:
        The corrected tokens joined by single spaces.
    """
    fix_word = Speller()
    return " ".join(map(fix_word, text.split()))


def remove_stop_words(text):
    """Drop tokens that appear in NLTK's English stop-word list.

    Args:
        text: Input string; it is split on whitespace and compared to the list
            exactly as written (no case folding is done here).

    Returns:
        The remaining tokens joined by single spaces.
    """
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in text.split():
        if token in english_stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)


def stemming(text):
    """Apply a Porter stemmer to each whitespace-separated token of ``text``.

    Args:
        text: Input string; it is split on whitespace.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    porter = PorterStemmer()
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return " ".join(stems)


def lemmatizing(text):
    """Apply a WordNet lemmatizer to each whitespace-separated token.

    Each token is passed to ``lemmatize`` on its own, with no extra arguments.

    Args:
        text: Input string; it is split on whitespace.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    wordnet = WordNetLemmatizer()
    lemmas = (wordnet.lemmatize(token) for token in text.split())
    return " ".join(lemmas)

In [12]:
def preprocessing(input_text, printing=False):
    """Run the full text-cleaning pipeline on one tagged message.

    Stages, in order: tag decoding, lower-casing, digits to words, removal of
    non-alphanumeric characters, spelling correction, stop-word removal,
    stemming, lemmatizing.

    Args:
        input_text: Raw message containing ``<SUBJECT LINE>``/``<BODY TEXT>``
            spans terminated by ``<END>``.
        printing: When True, print the intermediate text after every stage.
            When False (the default) the function is silent. Previously the
            flag was ignored and every stage was always printed.

    Returns:
        The fully preprocessed text.
    """
    def _report(stage_label, stage_text):
        # Single place that honours the `printing` flag for all stages.
        if printing:
            print(stage_label, stage_text)

    # Stage 1: pull subject/body out of the tag markup.
    output = decode(input_text)
    _report("\n디코딩/인코딩 제거:\n        ", output)

    # Stage 2: lower-case.
    output = output.lower()
    _report("\n소문자로 변환:\n        ", output)

    # Stage 3: digits to words. At most ONE ordinal ending is accepted, and only
    # when it ends the word. The previous pattern `\d+(st)?(nd)?(rd)?(th)?`
    # matched stacked endings such as "1stnd" (handing "1st" to num2words) and
    # swallowed the start of a following word ("1stuff" -> "firstuff").
    output = re.sub(r'\d+(?:(?:st|nd|rd|th)\b)?', digits_to_words, output, flags=re.IGNORECASE)
    _report("\n숫자를 단어로 변환:\n        ", output)

    # Stage 4: collapse every run of non-alphanumeric characters to one space.
    output = re.sub('[^A-Za-z0-9]+', ' ', output)
    _report("\n구두점 및 기타 특수 문자 제거:\n        ", output)

    # Stage 5: spelling correction.
    output = spelling_correction(output)
    _report("\n철자 교정:\n        ", output)

    # Stage 6: stop-word removal.
    output = remove_stop_words(output)
    _report("\n불용어 제거:\n        ", output)

    # Stage 7: stemming.
    output = stemming(output)
    _report("\n어간 추출:\n        ", output)

    # Stage 8: lemmatizing (applied to the already-stemmed tokens).
    output = lemmatizing(output)
    _report("\n표제어 추출:\n        ", output)

    return output

In [13]:
# Demo input. The literal is not a raw string, so the `\n` inside it is a real
# newline, and the surrounding double quotes are part of the text itself.
raw_text_input = """
"<SUBJECT LINE> Employees details<END><BODY TEXT>Attached are 2 files,\n1st one is pairoll, 2nd is healtcare!<END>"
"""
print(f"This is the input raw text:\n{raw_text_input}")
# preprocessing() runs while the f-string below is being built, so its
# stage-by-stage printout appears BEFORE the "preprocessed text" header line.
print(f"\n----------------------------\nThis is the preprocessed text:\n        {preprocessing(raw_text_input, printing=True)}")

This is the input raw text:

"<SUBJECT LINE> Employees details<END><BODY TEXT>Attached are 2 files,
1st one is pairoll, 2nd is healtcare!<END>"


디코딩/인코딩 제거:
          Employees details. Attached are 2 files, 1st one is pairoll, 2nd is healtcare!.

소문자로 변환:
          employees details. attached are 2 files, 1st one is pairoll, 2nd is healtcare!.

숫자를 단어로 변환:
          employees details. attached are two files, first one is pairoll, second is healtcare!.

구두점 및 기타 특수 문자 제거:
          employees details attached are two files first one is pairoll second is healtcare 

철자 교정:
         employees details attached are two files first one is payroll second is healthcare

불용어 제거:
         employees details attached two files first one payroll second healthcare

어간 추출:
         employe detail attach two file first one payrol second healthcar

표제어 추출:
         employe detail attach two file first one payrol second healthcar

----------------------------
This is the preprocessed text:
        empl