In [24]:
import nltk
from nltk.corpus import movie_reviews, stopwords

from konlpy.tag import Okt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd
import numpy as np

from joblib import Parallel, delayed

In [2]:
# Fetch the movie_reviews corpus only when it is not installed yet, so the
# notebook runs from a fresh environment without re-downloading on every run.
try:
    nltk.data.find('corpora/movie_reviews')
except LookupError:
    nltk.download('movie_reviews')

In [3]:
# Quick look at the corpus: how many files there are, the first few ids,
# and the first file viewed as raw text, as sentences and as words.
corpus_ids = movie_reviews.fileids()
fileid = corpus_ids[0]

print(len(corpus_ids))
print(corpus_ids[:10])
for reader, limit in (
    (movie_reviews.raw, 100),
    (movie_reviews.sents, 2),
    (movie_reviews.words, 20),
):
    print(reader(fileid)[:limit])

2000
['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']
plot : two teen couples go to a church party , drink and then drive . 
they get into an accident . 

[['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.'], ['they', 'get', 'into', 'an', 'accident', '.']]
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an']


In [4]:
# One raw-text string per corpus file, in fileids() order.
reviews = list(map(movie_reviews.raw, movie_reviews.fileids()))
first_review = reviews[0]
print(first_review[:100])

plot : two teen couples go to a church party , drink and then drive . 
they get into an accident . 



In [5]:
# Bag-of-words vectorizer for the English reviews, vocabulary capped at 1000 terms.
vocab_cap = 1000
cv = CountVectorizer(max_features=vocab_cap)
print(cv)

CountVectorizer(max_features=1000)


In [6]:
# Learn the vocabulary and build the document-term matrix in one pass.
reviews_cv = cv.fit_transform(reviews)

vocabulary = cv.get_feature_names_out()
print(vocabulary[:20])
print(type(reviews_cv), reviews_cv.shape)

['10' 'ability' 'able' 'about' 'above' 'absolutely' 'across' 'act'
 'acting' 'action' 'actor' 'actors' 'actress' 'actual' 'actually' 'add'
 'after' 'again' 'against' 'age']
<class 'scipy.sparse._csr.csr_matrix'> (2000, 1000)


In [7]:
# Show the counts of the first 20 vocabulary terms in the first review,
# rendered as "term: count, " pairs on a single line.
leading_terms = cv.get_feature_names_out()[:20]
leading_counts = reviews_cv[0].toarray()[0, :20]
rendered = ''.join(
    f'{term}: {freq}, ' for term, freq in zip(leading_terms, leading_counts)
)
print(rendered, end='')

10: 10, ability: 0, able: 0, about: 2, above: 0, absolutely: 0, across: 0, act: 0, acting: 0, action: 0, actor: 0, actors: 1, actress: 0, actual: 0, actually: 2, add: 0, after: 2, again: 2, against: 0, age: 0, 

In [10]:
# The first CSV column is an unnamed row index; without index_col=0 pandas
# re-reads it as a stray "Unnamed: 0" data column next to the real fields.
df = pd.read_csv('./data/daum_movie_review.csv', index_col=0)
df.head(5)

Unnamed: 0,review,rating,date,title
0,돈 들인건 티가 나지만 보는 내내 하품만,1,2018.10.29,인피니티 워
1,몰입할수밖에 없다. 어렵게 생각할 필요없다. 내가 전투에 참여한듯 손에 땀이남.,10,2018.10.26,인피니티 워
2,이전 작품에 비해 더 화려하고 스케일도 커졌지만.... 전국 맛집의 음식들을 한데 ...,8,2018.10.24,인피니티 워
3,이 정도면 볼만하다고 할 수 있음!,8,2018.10.22,인피니티 워
4,재미있다,10,2018.10.20,인피니티 워


In [12]:
# Compare the three views Okt gives of one sample review:
# all morphemes, nouns only, and (morpheme, part-of-speech) pairs.
okt = Okt()
sample_review = df['review'][1]
for analyze in (okt.morphs, okt.nouns, okt.pos):
    print(analyze(sample_review))

['몰입', '할수밖에', '없다', '.', '어렵게', '생각', '할', '필요없다', '.', '내', '가', '전투', '에', '참여', '한', '듯', '손', '에', '땀', '이남', '.']
['몰입', '생각', '내', '전투', '참여', '듯', '손', '땀', '이남']
[('몰입', 'Noun'), ('할수밖에', 'Verb'), ('없다', 'Adjective'), ('.', 'Punctuation'), ('어렵게', 'Adjective'), ('생각', 'Noun'), ('할', 'Verb'), ('필요없다', 'Adjective'), ('.', 'Punctuation'), ('내', 'Noun'), ('가', 'Josa'), ('전투', 'Noun'), ('에', 'Josa'), ('참여', 'Noun'), ('한', 'Determiner'), ('듯', 'Noun'), ('손', 'Noun'), ('에', 'Josa'), ('땀', 'Noun'), ('이남', 'Noun'), ('.', 'Punctuation')]


In [13]:
def my_tokenizer(doc):
    """Tokenize ``doc`` with the notebook-level ``okt`` tagger, keeping content words.

    Returns the surface forms, in their original order, of every token that
    ``okt.pos`` tags as 'Noun', 'Verb' or 'Adjective'; all other tags are dropped.
    """
    kept = []
    for surface, tag in okt.pos(doc):
        if tag == 'Noun' or tag == 'Verb' or tag == 'Adjective':
            kept.append(surface)
    return kept

In [14]:
# Sanity-check the tokenizer on the same sample review used above.
sample_tokens = my_tokenizer(df['review'][1])
print(sample_tokens)

['몰입', '할수밖에', '없다', '어렵게', '생각', '할', '필요없다', '내', '전투', '참여', '듯', '손', '땀', '이남']


In [15]:
# Document-term matrix for the Korean reviews, tokenized by my_tokenizer.
# token_pattern=None: a custom tokenizer replaces the default regex pattern, and
# leaving the unused default in place makes scikit-learn emit a UserWarning on fit.
daum_cv = CountVectorizer(
    max_features=1000,
    tokenizer=my_tokenizer,
    token_pattern=None,
)
daum_DTM = daum_cv.fit_transform(df['review'])
print(daum_cv.get_feature_names_out()[:100])



['가' '가는' '가는줄' '가면' '가서' '가슴' '가장' '가족' '가족영화' '가지' '가치' '각색' '간' '간다'
 '간만' '갈' '갈수록' '감' '감독' '감동' '감사' '감사합니다' '감상' '감성' '감정' '감탄' '갑자기' '갔는데'
 '갔다' '갔다가' '강' '강철' '강추' '같고' '같네요' '같다' '같습니다' '같아' '같아요' '같은' '같은데'
 '같음' '개' '개그' '개봉' '개연' '개인' '거' '거기' '거리' '거의' '걱정' '건' '건가' '건지' '걸'
 '겁니다' '것' '게' '겨울왕국' '결론' '결말' '경찰' '경험' '계속' '고' '고맙습니다' '고민' '고생' '곤지암'
 '곳' '공감' '공포' '공포영화' '과' '과거' '관' '관객' '관객수' '관람' '광주' '괜찮은' '교훈' '구성'
 '국내' '국민' '군인' '군함도' '굿' '권선' '귀신' '귀인' '그' '그것' '그게' '그날' '그냥' '그닥'
 '그대로' '그때']


In [30]:
# Query document: the trailing half of the first review.
first_review = reviews[0]
start = len(first_review) // 2
source = first_review[-start:]

# Vectorize the query with the already-fitted vocabulary and compare it
# against every review in the corpus.
source_cv = cv.transform([source])
sim_result = cosine_similarity(source_cv, reviews_cv)

# Top-10 similarity scores, the single best match, and the top-10 indices.
scores = sim_result[0]
print(sorted(scores, reverse=True)[:10])
print(np.argmax(scores))
print((-scores).argsort()[:10])

[np.float64(0.9406850065028822), np.float64(0.8449222207991461), np.float64(0.8348806607881823), np.float64(0.8326052445239385), np.float64(0.8315967745603229), np.float64(0.8273858498106984), np.float64(0.8245530382632896), np.float64(0.8219627744618032), np.float64(0.820420990734705), np.float64(0.8203286730382311)]
0
[   0  126 1501  100 1846  882 1560 1110    9 1570]
