# Import các thư viện cần thiết

In [65]:
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from tqdm import tqdm

# Fetch the NLTK resources this notebook relies on; packages that are already
# present are simply reported as up-to-date.
nltk.download('stopwords')  # word list read by stopwords.words('english')
nltk.download('punkt_tab')  # presumably the tokenizer data behind word_tokenize -- verify
nltk.download('wordnet')  # presumably the database behind WordNetLemmatizer -- verify
nltk.download('averaged_perceptron_tagger_eng')  # presumably the model behind pos_tag -- verify

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

# Khảo sát dữ liệu
Mục đích: Đánh giá sơ bộ dữ liệu để xác định pipeline tiền xử lý


In [117]:
# Small demo corpus: five English sentences that exercise contractions,
# digits and hyphens, capitalised words, repeated punctuation and symbols.
corpus = [
    "I can't wait for the new season of my favorite show!",
    "The COVID-19 pandemic has affected millions of people worldwide.",
    "US stocks fell on Friday after news of rising inflation.",
    "Python is a great programming language!!! ??",
    "Hello #Falcon%%%, I'm excited for the project launch."
]

# Pipeline tiền xử lý dữ liệu

## Loại bỏ các ký tự đặc biệt

In [118]:
def clean_text(text):
    """Strip special characters from *text* and collapse whitespace.

    Typographic single quotes (U+2018 and U+2019) are first converted to the
    ASCII apostrophe, so contractions typed with a curly apostrophe stay in
    one piece instead of being split in two by the character filter. Every
    character that is not an ASCII letter, a parenthesis or an apostrophe is
    then replaced by a space, runs of whitespace are collapsed to a single
    space, and leading/trailing whitespace is removed.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string; empty when no allowed character survives.
    """
    # Without this step curly apostrophes are wiped by the filter below and
    # the later contraction expansion can no longer recognise the word.
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    text = re.sub(r"[^A-Za-z()']", " ", text)
    return re.sub(r"\s+", " ", text).strip()

In [119]:
# Apply special-character removal to every raw document and show each result.
cleaned_corpus = list(map(clean_text, corpus))
for doc in cleaned_corpus:
    print(doc)

I can't wait for the new season of my favorite show
The COVID pandemic has affected millions of people worldwide
US stocks fell on Friday after news of rising inflation
Python is a great programming language
Hello Falcon I'm excited for the project launch


## Chuyển văn bản về chữ thường

In [120]:
def lowercaser(text):
    """Return *text* converted to lowercase."""
    lowered = text.lower()
    return lowered

In [121]:
# Lowercase every document produced by the previous step and show each result.
cleaned_corpus = list(map(lowercaser, cleaned_corpus))
for doc in cleaned_corpus:
    print(doc)

i can't wait for the new season of my favorite show
the covid pandemic has affected millions of people worldwide
us stocks fell on friday after news of rising inflation
python is a great programming language
hello falcon i'm excited for the project launch


## Chuẩn hóa từ viết tắt

In [122]:
# Lowercase English contractions mapped to their expanded form.
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

# Compiled once at import time instead of on every call.
# * Longest keys come first: regex alternation takes the first alternative
#   that matches, so "can't" would otherwise win over "can't've".
# * Lookarounds replace \b because \b in front of "'cause" demands a word
#   character before the apostrophe, which made that entry unmatchable at
#   the start of a word.
_CONTRACTION_PATTERN = re.compile(
    r"(?<!\w)(?:"
    + "|".join(
        re.escape(key)
        for key in sorted(_CONTRACTIONS, key=len, reverse=True)
    )
    + r")(?!\w)"
)


def decontracting_words(sentence):
    """Lowercase *sentence* and expand English contractions.

    Args:
        sentence: Input string; ASCII apostrophes are expected inside
            contractions.

    Returns:
        The lowercased sentence with every contraction listed in
        ``_CONTRACTIONS`` replaced by its expanded form. Text that is not a
        known contraction is left unchanged apart from the lowercasing.
    """
    sentence = sentence.lower()
    return _CONTRACTION_PATTERN.sub(
        lambda match: _CONTRACTIONS[match.group(0)], sentence
    )


In [123]:
# Expand contractions in every document and show each result.
cleaned_corpus = list(map(decontracting_words, cleaned_corpus))
for doc in cleaned_corpus:
    print(doc)

i cannot wait for the new season of my favorite show
the covid pandemic has affected millions of people worldwide
us stocks fell on friday after news of rising inflation
python is a great programming language
hello falcon i am excited for the project launch


## Tách từ

In [124]:
def tokenize(text):
    """Split *text* into tokens with NLTK's ``word_tokenize``."""
    return word_tokenize(text)

In [125]:
# Tokenize every document (each string becomes a list of tokens) and show it.
cleaned_corpus = list(map(tokenize, cleaned_corpus))
for doc in cleaned_corpus:
    print(doc)

['i', 'can', 'not', 'wait', 'for', 'the', 'new', 'season', 'of', 'my', 'favorite', 'show']
['the', 'covid', 'pandemic', 'has', 'affected', 'millions', 'of', 'people', 'worldwide']
['us', 'stocks', 'fell', 'on', 'friday', 'after', 'news', 'of', 'rising', 'inflation']
['python', 'is', 'a', 'great', 'programming', 'language']
['hello', 'falcon', 'i', 'am', 'excited', 'for', 'the', 'project', 'launch']


## Loại bỏ stopwords

In [126]:
stop_words = set(stopwords.words('english'))


def remove_stopwords(word_list):
    """Return the tokens of *word_list* that are not English stop words.

    The tokens 'not' and 'can' are always kept, even when they appear in the
    stop-word set (presumably to preserve negation such as "can not").
    Matching is case-sensitive.
    """
    kept = []
    for word in word_list:
        if word in stop_words and word not in ['not', 'can']:
            continue
        kept.append(word)
    return kept

In [127]:
# Drop stop words from every token list and show each result.
cleaned_corpus = list(map(remove_stopwords, cleaned_corpus))
for doc in cleaned_corpus:
    print(doc)

['can', 'not', 'wait', 'new', 'season', 'favorite', 'show']
['covid', 'pandemic', 'affected', 'millions', 'people', 'worldwide']
['us', 'stocks', 'fell', 'friday', 'news', 'rising', 'inflation']
['python', 'great', 'programming', 'language']
['hello', 'falcon', 'excited', 'project', 'launch']


## Đưa về từ gốc

In [128]:
lemmatizer = WordNetLemmatizer()


def get_wordnet_pos(tag):
    """Map a POS tag string to a WordNet part-of-speech constant.

    Tags starting with 'J', 'V' and 'R' map to adjective, verb and adverb
    respectively; every other tag (including those starting with 'N') maps
    to noun.
    """
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    # Nouns and all unrecognised tags share the same fallback.
    return wordnet.NOUN

def lemmatize(word_list):
    """Lemmatize each token of *word_list* using its POS tag.

    The whole list is tagged with ``pos_tag`` first, then every token is
    passed to the WordNet lemmatizer together with the WordNet category
    derived from its tag.

    NOTE(review): in this notebook's output the lowercased token 'us' comes
    back as 'u' -- confirm this is acceptable downstream.
    """
    lemmas = []
    for token, tag in pos_tag(word_list):
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return lemmas

In [129]:
# Lemmatize every token list and show each result.
cleaned_corpus = list(map(lemmatize, cleaned_corpus))
for doc in cleaned_corpus:
    print(doc)

['can', 'not', 'wait', 'new', 'season', 'favorite', 'show']
['covid', 'pandemic', 'affect', 'million', 'people', 'worldwide']
['u', 'stock', 'fell', 'friday', 'news', 'rise', 'inflation']
['python', 'great', 'programming', 'language']
['hello', 'falcon', 'excite', 'project', 'launch']


In [132]:
# Sanity check: 'running' is used as a verb here; the cell output shows it
# reduced to 'run'.
example = "He is running".split()
lemmatize(example)

['He', 'be', 'run']

In [133]:
# Counter-example: after the article 'a', 'running' is left unchanged in the
# cell output (presumably because it is tagged as a noun).
example = "He went for a running".split()
lemmatize(example)

['He', 'go', 'for', 'a', 'running']

## Tổng hợp pipeline tiền xử lý

In [134]:
def preprocessing(text):
    """Run the full preprocessing pipeline on one raw document.

    Steps, in order: special-character removal, lowercasing, contraction
    expansion, tokenization, stop-word removal and lemmatization.

    Returns:
        The value returned by ``lemmatize`` for the filtered tokens.
    """
    normalized = decontracting_words(lowercaser(clean_text(text)))
    tokens = tokenize(normalized)
    content_tokens = remove_stopwords(tokens)
    return lemmatize(content_tokens)

In [136]:
# Run the complete pipeline on the raw corpus and show each processed document.
cleaned_corpus = list(map(preprocessing, corpus))
for doc in cleaned_corpus:
    print(doc)

['can', 'not', 'wait', 'new', 'season', 'favorite', 'show']
['covid', 'pandemic', 'affect', 'million', 'people', 'worldwide']
['u', 'stock', 'fell', 'friday', 'news', 'rise', 'inflation']
['python', 'great', 'programming', 'language']
['hello', 'falcon', 'excite', 'project', 'launch']
