# 텍스트 사전 준비 작업 = 텍스트 정규화

## 토큰화

### 문장 토큰화
- 마침표, 개행 문자 등 문장의 끝을 뜻하는 기호를 기준으로 텍스트를 문장 단위로 분리하는 것
  nltk : https://www.nltk.org

In [2]:
!pip install nltk



In [3]:
from nltk import sent_tokenize
import nltk

# One-time setup: the sentence tokenizer needs the 'punkt' data package.
# Uncomment the next line the first time this runs on a fresh environment.
# nltk.download('punkt')

# Adjacent string literals are joined at compile time, so the sample stays one
# string with exactly one space between sentences. A backslash continuation
# placed inside the quotes would also copy the source indentation into the text.
text_sample = (
    'The Matrix is everywhere its all around us, here even in this room. '
    'You can see it out your window or on your television. '
    'You feel it when you go to work, or go to church or pay your taxes.'
)

# sent_tokenize returns a list with one string per detected sentence.
sentence = sent_tokenize(text_sample)
print(len(sentence))
print(sentence)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


3
['The Matrix is everywhere its all around us, here even in this room.', 'You can see it out your window or on your television.', 'You feel it when you go to work, or go to church or pay your taxes.']


### 단어 토큰화
- 문장을 단어로 토큰화 하는 것
- 일반적으로 문장 토큰화는 각 문장이 가지는 의미가 중요한 요소로 사용될 때 사용
- BoW(Bag of Words)와 같이 단어의 순서가 중요하지 않은 경우 단어 토큰화만 해도 충분하다.

In [6]:
from nltk import word_tokenize

# Split one sentence into word tokens; punctuation marks come out as
# separate tokens of their own.
sentence = 'The Matrix is everywhere its all around us, here even in this room.'
words = word_tokenize(sentence)

# Token count on the first line, the tokens themselves on the second.
print(len(words), words, sep='\n')

15
['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.']


In [9]:
from nltk import sent_tokenize, word_tokenize
def tokenize_text(text):
    """Tokenize *text* sentence by sentence.

    The text is first split into sentences, then every sentence is split
    into word tokens.

    Returns:
        A list containing one list of token strings per sentence.
    """
    return list(map(word_tokenize, sent_tokenize(text)))


# Tokenize the three-sentence sample and show the nested token lists.
word_tokens = tokenize_text(text_sample)
print(word_tokens)

[['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.'], ['You', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'television', '.'], ['You', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']]


## stopwords 제거
- 분석에 큰 의미가 없는 단어를 지칭한다
- is, the, a, will 등 문맥적으로 큰 의미가 없는 단어가 이에 해당한다.

In [10]:
import nltk
# Fetch NLTK's stop-word lists; the call reports success by returning True.
nltk.download('stopwords') # download the stop-word lists

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [13]:
# List the languages for which the stop-word corpus ships a word list.
print(nltk.corpus.stopwords.fileids())

['arabic', 'azerbaijani', 'basque', 'bengali', 'catalan', 'chinese', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hebrew', 'hinglish', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'turkish']


In [12]:
# Load the English stop-word list once and reuse it for both lines of output.
english_stopwords = nltk.corpus.stopwords.words('english')

# Total number of English stop words, followed by the first 20 as a preview.
print('영어 불용어 개수:', len(english_stopwords))
print(english_stopwords[:20])

영어 불용어 개수: 179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


# 구두점 목록

In [19]:
import string
# string.punctuation is a single str holding the ASCII punctuation characters.
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [20]:
import string

import nltk

# English stop words, kept as a list under the original name.
stopwords = nltk.corpus.stopwords.words('english')

# Sets give constant-time membership tests inside the nested loop below.
stopword_set = set(stopwords)
punctuation_chars = set(string.punctuation)

# all_tokens mirrors word_tokens: one list of kept tokens per sentence.
all_tokens = []
print('[원본단어]')
print(word_tokens)
for sentence in word_tokens:
    filtered_words = []
    for word in sentence:
        # The stop-word list is lowercase, so normalise before comparing.
        word = word.lower()
        if word in stopword_set:
            continue
        # Drop tokens made up entirely of punctuation. Testing every character
        # also catches multi-character tokens such as '...' or "''", which a
        # substring test against string.punctuation would let through.
        if all(ch in punctuation_chars for ch in word):
            continue
        filtered_words.append(word)
    all_tokens.append(filtered_words)
print('[불용어 제거 단어]')
print(all_tokens)

[원본단어]
[['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.'], ['You', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'television', '.'], ['You', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']]
[불용어 제거 단어]
[['matrix', 'everywhere', 'around', 'us', 'even', 'room'], ['see', 'window', 'television'], ['feel', 'go', 'work', 'go', 'church', 'pay', 'taxes']]


## 어간 추출(Stemming)과 표제어 추출(Lemmatization)
- 문법적 또는 의미적으로 변화하는 `단어의 원형을 찾는다.`

In [23]:
from nltk.stem import LancasterStemmer

stemmer = LancasterStemmer()

# Each tuple holds inflected forms of one word; print their stems side by side
# so the forms can be compared on a single line.
for word_forms in (
    ('working', 'works', 'worked'),
    ('amusing', 'amuses', 'amused'),
    ('happier', 'happiest'),
):
    print(*(stemmer.stem(form) for form in word_forms))

work work work
amus amus amus
happy happiest


In [26]:
from nltk.stem import WordNetLemmatizer
import nltk

# One-time setup: uncomment to fetch the WordNet data used by the lemmatizer.
# nltk.download('wordnet')

lemma = WordNetLemmatizer()

# The second argument to lemmatize() is the part of speech:
# 'v' verb, 'n' noun, 'a' adjective, 'r' adverb.
for pos_tag, word_forms in (
    ('v', ('amusing', 'amused')),
    ('a', ('happier', 'happiest')),
):
    print(*(lemma.lemmatize(form, pos_tag) for form in word_forms))

amuse amuse
happy happy


# Bag of Words (BoW)

## DTM(Document Term Matrix, 문서 단어 행렬)

### CountVectorizer 사용

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['you know I want your love, because I love you.']
vector = CountVectorizer()

# Learn the vocabulary and count term occurrences in a single pass.
bow_matrix = vector.fit_transform(corpus)
print('bag of words vector:', bow_matrix.toarray())

# 'I' is missing from the vocabulary: by default CountVectorizer only treats
# words of two or more characters as tokens.
# Vocabulary terms are unique, so sorting the (term, index) pairs orders them
# alphabetically by term.
print('vocabulary: ', sorted(vector.vocabulary_.items()))

bag of words vector: [[1 1 2 1 2 1]]
vocabulary:  [('because', 0), ('know', 1), ('love', 2), ('want', 3), ('you', 4), ('your', 5)]


- 불용어를 제거한 BoW 만들기

In [None]:
1. 사용자 정의 불용어 사용

In [34]:
# Build a BoW using a hand-picked stop-word list.
text = ["Family is not an important thing. It's everything"]
custom_stop_words = ['the', 'an', 'a', 'is', 'not']
vector = CountVectorizer(stop_words=custom_stop_words)

term_counts = vector.fit_transform(text)
print(term_counts.toarray())
# Vocabulary terms are unique, so plain tuple sorting orders the pairs by term.
print('vocabulary:', sorted(vector.vocabulary_.items()))

[[1 1 1 1 1]]
vocabulary: [('everything', 0), ('family', 1), ('important', 2), ('it', 3), ('thing', 4)]


In [None]:
2. CountVectorizer에서 제공하는 자체 불용어 사용

In [36]:
# Build a BoW using CountVectorizer's own built-in English stop-word list.
text = ["Family is not an important thing. It's everything."]
vector = CountVectorizer(stop_words='english')

term_counts = vector.fit_transform(text)
print(term_counts.toarray())
# Vocabulary terms are unique, so plain tuple sorting orders the pairs by term.
print('vocabulary:', sorted(vector.vocabulary_.items()))

[[1 1 1]]
vocabulary: [('family', 0), ('important', 1), ('thing', 2)]


In [None]:
3. NLTK에서 지원하는 불용어 사용

In [33]:
from nltk.corpus import stopwords

# Build a BoW using NLTK's English stop-word list instead of scikit-learn's.
text = ["Family is not an important thing. It's everything."]
stop_words = stopwords.words('english')
vector = CountVectorizer(stop_words=stop_words)

term_counts = vector.fit_transform(text)
print(term_counts.toarray())
# Vocabulary terms are unique, so plain tuple sorting orders the pairs by term.
print('vocabulary:', sorted(vector.vocabulary_.items()))

[[1 1 1 1]]
vocabulary: [('everything', 0), ('family', 1), ('important', 2), ('thing', 3)]


### TF-IDF(Term Frequency-Inverse Document Frequency)

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'you know I want your love',  # document 1
    'I like you',                 # document 2
    'what should I do',           # document 3
]
tfidf = TfidfVectorizer()

# Fit on the corpus and print the dense TF-IDF matrix (one row per document).
tfidf_matrix = tfidf.fit_transform(corpus)
print(tfidf_matrix.toarray())

[[0.         0.46735098 0.         0.46735098 0.         0.46735098
  0.         0.35543247 0.46735098]
 [0.         0.         0.79596054 0.         0.         0.
  0.         0.60534851 0.        ]
 [0.57735027 0.         0.         0.         0.57735027 0.
  0.57735027 0.         0.        ]]
