# 1. BOW 기반 카운트 벡터

In [1]:
import nltk

In [2]:
# Fetch the 'movie_reviews' corpus package through NLTK's downloader.
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/mkw/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [3]:
from nltk.corpus import movie_reviews

# List the corpus file ids once, then report how many reviews there are
# and show the first ten ids as a sample.
review_ids = movie_reviews.fileids()
print('#review count: ', len(review_ids))
print('#samples of file ids: ', review_ids[:10])

#review count:  2000
#samples of file ids:  ['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']


In [7]:
# Inspect the first review: its id, the first 200 raw characters, the first
# two tokenized sentences, and the first 20 word tokens. Each section except
# the last is followed by one blank line.
fileid = movie_reviews.fileids()[0]
print('#id of the first review: ', fileid, end='\n\n')
print('#first review content: \n', movie_reviews.raw(fileid)[:200], end='\n\n')
print('#sentence tokenization result: ', movie_reviews.sents(fileid)[:2], end='\n\n')
print('#word tokenization result: ', movie_reviews.words(fileid)[:20])

#id of the first review:  neg/cv000_29416.txt

#first review content: 
 plot : two teen couples go to a church party , drink and then drive . 
they get into an accident . 
one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . 
w

#sentence tokenization result:  [['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.'], ['they', 'get', 'into', 'an', 'accident', '.']]

#word tokenization result:  ['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an']


In [8]:
# Materialize every review as a plain list of its word tokens.
documents = [list(words) for words in map(movie_reviews.words, movie_reviews.fileids())]
# Peek at the first 50 tokens of the first review.
print(documents[0][:50])

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch']


In [9]:
from collections import Counter

# Count how often each token occurs across all reviews. Counter is a dict
# subclass, so word_count[word] and word_count.get(word, 0) keep working.
word_count = Counter()
for text in documents:
    word_count.update(text)

# Tokens ordered from most to least frequent.
sorted_features = sorted(word_count, key=word_count.get, reverse=True)
for word in sorted_features[:10]:
    print(f"count of '{word}': {word_count[word]}", end=', ')

count of ',': 77717, count of 'the': 76529, count of '.': 65876, count of 'a': 38106, count of 'and': 35576, count of 'of': 34123, count of 'to': 31937, count of ''': 30585, count of 'is': 25195, count of 'in': 21822, 

In [15]:
# Remove stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Keep only tokens made of three or more word characters/apostrophes. The
# pattern is a raw string so that `\w` reaches the regex engine as written
# instead of relying on Python's handling of unrecognized string escapes
# (which emits a SyntaxWarning on recent Python versions).
tokenizer = RegexpTokenizer(r"[\w']{3,}")
# A set gives fast membership tests when filtering tokens.
english_stops = set(stopwords.words('english'))

In [17]:
from collections import Counter

# Reload every review as one raw string so it can be re-tokenized.
documents = [movie_reviews.raw(fileid) for fileid in movie_reviews.fileids()]

# Tokenize each review with the regex tokenizer and drop English stopwords.
tokens = [
    [token for token in tokenizer.tokenize(doc) if token not in english_stops]
    for doc in documents
]

# Frequency of each remaining token across all reviews. Counter is a dict
# subclass, so word_count[word] and word_count.get(word, 0) keep working.
word_count = Counter()
for text in tokens:
    word_count.update(text)

# Vocabulary ordered from most to least frequent.
sorted_features = sorted(word_count, key=word_count.get, reverse=True)

print('num of features: ', len(sorted_features))
for word in sorted_features[:10]:
    print(f"count of '{word}' : {word_count[word]}", end=', ')

num of features:  43030
count of 'film' : 8935, count of 'one' : 5791, count of 'movie' : 5538, count of 'like' : 3690, count of 'even' : 2564, count of 'time' : 2409, count of 'good' : 2407, count of 'story' : 2136, count of 'would' : 2084, count of 'much' : 2049, 

In [27]:
# Use the first 1000 entries of sorted_features as the feature vocabulary.
word_features = sorted_features[:1000]

In [33]:
def document_features(document, word_features):
    """Build a bag-of-words count vector for one document.

    Args:
        document: Iterable of tokens making up the document.
        word_features: Ordered collection of vocabulary words; each one
            becomes one position in the returned vector.

    Returns:
        A list of ints, one per entry of ``word_features`` and in the same
        order, holding how many times that word occurs in ``document``.
        Words that never occur in the document get 0.
    """
    from collections import Counter  # local import keeps this cell self-contained

    # Count every token of the document once up front.
    word_count = Counter(document)
    # Counter yields 0 for missing keys, so absent vocabulary words map to 0.
    return [word_count[word] for word in word_features]

In [25]:
# Small sanity check: in the sample document 'two' appears twice and
# 'couples' once; the other vocabulary words do not appear at all.
word_features_ex = ['one', 'two', 'teen', 'couples', 'solo']
doc_ex = ['two', 'two', 'couples']
print(
    document_features(
        document=doc_ex,
        word_features=word_features_ex,
    )
)

[0, 2, 0, 1, 0]


In [28]:
# One count vector per tokenized review, aligned with word_features.
feature_sets = [document_features(d, word_features) for d in tokens]

# Show the first 20 (word, count) pairs of the first review. Pairing with
# zip stops at the shorter sequence, so this does not index past the end
# when the vocabulary holds fewer than 20 words.
for word, count in zip(word_features[:20], feature_sets[0]):
    print(f'({word}, {count})', end=', ')

(film, 5), (one, 3), (movie, 6), (like, 3), (even, 3), (time, 0), (good, 2), (story, 0), (would, 1), (much, 0), (also, 1), (get, 3), (character, 1), (two, 2), (well, 1), (first, 0), (characters, 1), (see, 2), (way, 3), (make, 5), 

In [29]:
# Show the last 20 entries of the first review's count vector.
print(feature_sets[0][-20:])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
