# Text preprocessing
- 클렌징 (Cleansing)
- 토큰화 (Tokenization)
- 필터링 (Filtering) / Stop word 제거 / 철자 수정
- 어간 추출 (Stemming)
- 표제어 추출 (Lemmatization)

## 토큰화 (Tokenization)

In [8]:
from nltk import sent_tokenize, word_tokenize
import nltk
# Fetch the 'punkt' resource before calling the NLTK tokenizers below.
nltk.download('punkt')

# Sample paragraph used by this cell and by the cells that follow.
text_sample = (
    'The matrix is everywhere its all around us, here even in my room. '
    'You can see it out your window or on your tv. '
    'you feel it when you go to work, or go to church or pay your taxes.'
)

# Sentence-level split: report the heading, the sentence count, the sentences
# themselves, and a 50-character separator, one item per line.
sentences = sent_tokenize(text=text_sample)
print('sentence tokenization', len(sentences), sentences, '=' * 50, sep='\n')

# Word-level split of the whole paragraph, reported in the same layout.
words = word_tokenize(text=text_sample)
print('words tokenization : ', len(words), words, '=' * 50, sep='\n')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
sentence tokenization
3
['The matrix is everywhere its all around us, here even in my room.', 'You can see it out your window or on your tv.', 'you feel it when you go to work, or go to church or pay your taxes.']
words tokenization : 
45
['The', 'matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'my', 'room', '.', 'You', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'tv', '.', 'you', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']


In [10]:
def tokenize_text(text):
    """Split *text* into sentences and word-tokenize each sentence.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list with one entry per sentence found by ``sent_tokenize``; each
        entry is the result of ``word_tokenize`` for that sentence.
    """
    tokens_per_sentence = []
    for sentence in sent_tokenize(text=text):
        tokens_per_sentence.append(word_tokenize(sentence))
    return tokens_per_sentence

# Tokenize the sample paragraph sentence by sentence and show the nested
# token lists; `result` is reused by the stop word removal cell.
result = tokenize_text(text_sample)
print(result)

[['The', 'matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'my', 'room', '.'], ['You', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'tv', '.'], ['you', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']]


## Stop word 제거


In [11]:
import nltk 
# Download the NLTK stop word corpus and load the English stop word list.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

# Build the lookup set once, outside the loops: testing membership against
# the list would rescan it for every single token.
stopword_set = set(stopwords)

# One filtered token list per sentence, in the same order as `result`.
all_tokens = []

for sentence in result:
    # Tokens are lower-cased before the comparison, and the lower-cased form
    # is what gets kept. Punctuation tokens are not in the stop word list in
    # the recorded run, so they pass through unchanged.
    filtered_words = [
        word
        for word in (token.lower() for token in sentence)
        if word not in stopword_set
    ]
    all_tokens.append(filtered_words)

print(all_tokens)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[['matrix', 'everywhere', 'around', 'us', ',', 'even', 'room', '.'], ['see', 'window', 'tv', '.'], ['feel', 'go', 'work', ',', 'go', 'church', 'pay', 'taxes', '.']]


## 어간 추출 (Stemming) / 표제어 추출 (Lemmatization)

In [24]:
from nltk.stem import LancasterStemmer, WordNetLemmatizer
# Build the stemmer and the lemmatizer, then fetch the 'wordnet' resource
# before any lemmatization call is made.
stemmer = LancasterStemmer()
lemmer = WordNetLemmatizer()
nltk.download('wordnet')

# Groups of inflected forms to compare.
words_list = [
    ['working', 'works', 'worked'],
    ['amusing', 'amuses', 'amused'],
    ['happier', 'happiest'],
    ['fancier', 'fanciest'],
]
# Second argument passed to lemmer.lemmatize for the group at the same
# position (presumably WordNet part-of-speech tags: 'v' verb, 'a' adjective
# -- confirm against the NLTK documentation).
word_t = ['v', 'v', 'a', 'a']

# Walk the groups and their tags in lockstep and print both results per group.
for words, pos in zip(words_list, word_t):
    stem = [stemmer.stem(word) for word in words]
    lemm = [lemmer.lemmatize(word, pos) for word in words]
    print('stem : ', stem)
    print('lemm : ', lemm)



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
stem :  ['work', 'work', 'work']
lemm :  ['work', 'work', 'work']
stem :  ['amus', 'amus', 'amus']
lemm :  ['amuse', 'amuse', 'amuse']
stem :  ['happy', 'happiest']
lemm :  ['happy', 'happy']
stem :  ['fant', 'fanciest']
lemm :  ['fancy', 'fancy']
