Count 기반 문서 표현 방법, 즉 Bag of Words (단어 빈도 기반 문서 표현)
머신러닝이나 딥러닝 모델은 숫자만 처리할 수 있죠. 그래서 텍스트(문장, 문서)를 숫자로 바꿔야 해요.

In [1]:
import nltk
# Fetch every NLTK resource this notebook reads. The 'stopwords' corpus is
# loaded further down via stopwords.words('english'), so it is downloaded here
# as well; without it that call raises LookupError on a fresh machine.
nltk.download('movie_reviews')
nltk.download('stopwords')
# Kept last so the value displayed for this cell is still the punkt result.
nltk.download('punkt')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/kangminji/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package punkt to /Users/kangminji/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Inspect the dataset layout: number of review files and the label categories.
from nltk.corpus import movie_reviews
print(
    f'review count : {len(movie_reviews.fileids())}',
    f'categories of reviews : {movie_reviews.categories()}',
    sep='\n',
)

review count : 2000
categories of reviews : ['neg', 'pos']


In [3]:
# Build bag-of-words count vectors.
# Manual implementation: start from the token sequence of every review file.
documents = list(map(movie_reviews.words, movie_reviews.fileids()))
documents

[['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...],
 ['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...],
 ['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...],
 ['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...],
 ['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...],
 ['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...],
 ['so', 'ask', 'yourself', 'what', '"', '8mm', '"', '(', ...],
 ['that', "'", 's', 'exactly', 'how', 'long', 'the', ...],
 ['call', 'it', 'a', 'road', 'trip', 'for', 'the', ...],
 ['plot', ':', 'a', 'young', 'french', 'boy', 'sees', ...],
 ['best', 'remembered', 'for', 'his', 'understated', ...],
 ['janeane', 'garofalo', 'in', 'a', 'romantic', ...],
 ['and', 'now', 'the', 'high', '-', 'flying', 'hong', ...],
 ['a', 'movie', 'like', 'mortal', 'kombat', ':', ...],
 ['she', 'was', 'the', 'femme', 'in', '"', 'la', ...],
 ['john', 'carpenter', 'makes', 'b', '-', 'movies', '.', ...],
 ['i', "'", 'm', 'really', 'starting', 'to', 'w

In [None]:
# Count how often each token occurs across all reviews.
from collections import Counter

# Counter is a dict subclass, so .get() and item lookup keep working for any
# code that treats word_count as a plain dict.
word_count = Counter()
for text in documents:
    word_count.update(text)
# Vocabulary ordered from most to least frequent. The sort is stable, so
# equally frequent words stay in first-seen order.
sorted_features = sorted(word_count, key=word_count.get, reverse=True)

In [None]:
# Preprocessing and recount: re-tokenize the raw reviews and drop stop words.
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

english_stops = set(stopwords.words('english'))
# Keep only runs of three or more word characters / apostrophes.
tokenizer = RegexpTokenizer("[\\w']{3,}")
# `documents` is rebound here: it now holds the raw review strings.
documents = list(map(movie_reviews.raw, movie_reviews.fileids()))
tokens = [
    [tok for tok in tokenizer.tokenize(raw_text) if tok not in english_stops]
    for raw_text in documents
]


In [None]:
# Recount frequencies on the filtered tokens and print the ten most common words.
from collections import Counter

# Counter is a dict subclass, so .get() and item lookup behave as before.
word_count = Counter()
for text in tokens:
    word_count.update(text)
# NOTE(review): "sorted_featrues" is a misspelling of "sorted_features"; the
# name is kept because the CountVectorizer cell below reads it.
sorted_featrues = sorted(word_count, key=word_count.get, reverse=True)
for word in sorted_featrues[:10]:
    print(f"{word} : {word_count[word]}")

In [None]:
# Vectorize documents with CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer
# Pass the frequency-sorted word list built above as a fixed vocabulary.
# NOTE(review): "sorted_featrues" is a misspelling of "sorted_features"; it
# matches the name assigned in the recount cell, so it resolves correctly.
# NOTE(review): CountVectorizer tokenizes with its own default pattern, which
# may differ from the RegexpTokenizer used to build this vocabulary - confirm.
cv = CountVectorizer(vocabulary=sorted_featrues)
cv

In [None]:
# Raw text of every review, in fileids() order; show the first one.
reviews = list(map(movie_reviews.raw, movie_reviews.fileids()))
reviews[0]

### ---------------------------------------------------------------------------------