In [1]:
import nltk
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
from konlpy.tag import Okt
import matplotlib.pyplot as plt
import matplotlib as mpl

In [3]:
fileids = movie_reviews.fileids()
print(len(fileids), '\n\n', fileids[:10])

2000 

 ['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']


In [5]:
print(movie_reviews.raw(fileids[0])[:200])

plot : two teen couples go to a church party , drink and then drive . 
they get into an accident . 
one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . 
w


In [6]:
print(movie_reviews.sents(fileids[0])[:2])

[['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.'], ['they', 'get', 'into', 'an', 'accident', '.']]


In [7]:
print(movie_reviews.words(fileids[0])[:20])

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an']


In [8]:
reviews = [movie_reviews.raw(fileid) for fileid in fileids]
reviews[:2]

['plot : two teen couples go to a church party , drink and then drive . \nthey get into an accident . \none of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . \nwhat\'s the deal ? \nwatch the movie and " sorta " find out . . . \ncritique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . \nwhich is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn\'t snag this one correctly . \nthey seem to have taken this pretty neat concept , but executed it terribly . \nso what are the problems with the movie ? \nwell , its main problem is that it\'s simply too jumbled . \nit starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience memb

In [9]:
cv = CountVectorizer(max_features=1000, min_df=3, max_df=0.5)
print(cv)

CountVectorizer(max_df=0.5, max_features=1000, min_df=3)


In [11]:
reviews_cv = cv.fit_transform(reviews)
print(reviews_cv.shape, '\n\n', cv.get_feature_names_out()[:20])

(2000, 1000) 

 ['10' 'ability' 'able' 'above' 'absolutely' 'across' 'act' 'acting'
 'action' 'actor' 'actors' 'actress' 'actual' 'actually' 'add' 'again'
 'against' 'age' 'agent' 'ago']


In [18]:
print(type(reviews_cv), '\n\n', reviews_cv[0, :20])

<class 'scipy.sparse._csr.csr_matrix'> 

   (0, 15)	2
  (0, 13)	2
  (0, 10)	1
  (0, 19)	1
  (0, 0)	10


In [19]:
for word, count in zip(cv.get_feature_names_out()[:20], reviews_cv[0].toarray()[0, :20]):
    print(f'{word}:{count}', end=', ')

10:10, ability:0, able:0, above:0, absolutely:0, across:0, act:0, acting:0, action:0, actor:0, actors:1, actress:0, actual:0, actually:2, add:0, again:2, against:0, age:0, agent:0, ago:1, 

In [20]:
df = pd.read_csv('./data/daum_movie_review.csv')
df.head(10)

Unnamed: 0,review,rating,date,title
0,돈 들인건 티가 나지만 보는 내내 하품만,1,2018.10.29,인피니티 워
1,몰입할수밖에 없다. 어렵게 생각할 필요없다. 내가 전투에 참여한듯 손에 땀이남.,10,2018.10.26,인피니티 워
2,이전 작품에 비해 더 화려하고 스케일도 커졌지만.... 전국 맛집의 음식들을 한데 ...,8,2018.10.24,인피니티 워
3,이 정도면 볼만하다고 할 수 있음!,8,2018.10.22,인피니티 워
4,재미있다,10,2018.10.20,인피니티 워
5,나는 재밌게 봄,10,2018.10.14,인피니티 워
6,0.5점은 줄 수 없냐?,0,2018.10.10,인피니티 워
7,헐..다 죽었어....나중에 앤트맨 보다가도 깜놀...,10,2018.10.08,인피니티 워
8,충격 결말,9,2018.10.06,인피니티 워
9,응집력,8,2018.10.05,인피니티 워


In [22]:
okt = Okt()
tokens = [okt.morphs(text) for text in df['review']]
tokens[:20]

[['돈', '들인건', '티', '가', '나', '지만', '보는', '내내', '하품', '만'],
 ['몰입',
  '할수밖에',
  '없다',
  '.',
  '어렵게',
  '생각',
  '할',
  '필요없다',
  '.',
  '내',
  '가',
  '전투',
  '에',
  '참여',
  '한',
  '듯',
  '손',
  '에',
  '땀',
  '이남',
  '.'],
 ['이전',
  '작품',
  '에',
  '비해',
  '더',
  '화려하고',
  '스케일',
  '도',
  '커졌지만',
  '....',
  '전국',
  '맛집',
  '의',
  '음식',
  '들',
  '을',
  '한데',
  '모은',
  '것',
  '까지는',
  '좋았으나',
  '이',
  '걸',
  '모두',
  '한',
  '그릇',
  '에',
  '섞어',
  '버린',
  '듯',
  '한',
  '느낌',
  '...',
  '그래도',
  '다음',
  '작품',
  '을',
  '기대하게',
  '만든다',
  '...'],
 ['이', '정도', '면', '볼', '만', '하다고', '할', '수', '있음', '!'],
 ['재미있다'],
 ['나', '는', '재밌게', '봄'],
 ['0.5', '점', '은', '줄', '수', '없냐', '?'],
 ['헐', '..', '다', '죽었어', '....', '나중', '에', '앤트맨', '보다가도', '깜놀', '...'],
 ['충격', '결말'],
 ['응집', '력'],
 ['개연',
  '성은',
  '무시',
  '해라',
  '액션',
  '을',
  '즐겨라',
  '스타로드',
  '가',
  '이끌어',
  '준다',
  '각각',
  '의',
  '영웅',
  '들',
  '을',
  '즐겨라',
  '그리고',
  '단',
  '적',
  '인',
  '신념',
  '이',
  '얼마나',
  '부질없는지',
  '보셔라'],
 ['내', '가

In [23]:
def my_tokenizer(doc):
    return [token for token, pos in okt.pos(doc) if pos in ['Noun', 'Verb', 'Adjective']]
print(my_tokenizer(df['review'][1]))

['몰입', '할수밖에', '없다', '어렵게', '생각', '할', '필요없다', '내', '전투', '참여', '듯', '손', '땀', '이남']


In [24]:
cv = CountVectorizer(tokenizer=my_tokenizer, min_df=2, max_df=0.5, max_features=1000)
movie_cv = cv.fit_transform(df['review'])
print(cv.get_feature_names_out()[:100])



['가' '가는' '가는줄' '가면' '가서' '가슴' '가장' '가족' '가족영화' '가지' '가치' '각색' '간' '간다'
 '간만' '갈' '갈수록' '감' '감독' '감동' '감사' '감사합니다' '감상' '감성' '감정' '감탄' '갑자기' '갔는데'
 '갔다' '갔다가' '강' '강철' '강추' '같고' '같네요' '같다' '같습니다' '같아' '같아요' '같은' '같은데'
 '같음' '개' '개그' '개봉' '개연' '개인' '거' '거기' '거리' '거의' '걱정' '건' '건가' '건지' '걸'
 '겁니다' '것' '게' '겨울왕국' '결론' '결말' '경찰' '경험' '계속' '고' '고맙습니다' '고민' '고생' '곤지암'
 '곳' '공감' '공포' '공포영화' '과' '과거' '관' '관객' '관객수' '관람' '광주' '괜찮은' '교훈' '구성'
 '국내' '국민' '군인' '군함도' '굿' '권선' '귀신' '귀인' '그' '그것' '그게' '그날' '그냥' '그닥'
 '그대로' '그때']


In [25]:
movie_cv

<14725x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 110791 stored elements in Compressed Sparse Row format>

In [26]:
cv = CountVectorizer(max_features=1000, min_df=3, max_df=0.5)
reviews_cv = cv.fit_transform(reviews)
print(reviews_cv.shape, '\n\n', cv.get_feature_names_out()[:20])

(2000, 1000) 

 ['10' 'ability' 'able' 'above' 'absolutely' 'across' 'act' 'acting'
 'action' 'actor' 'actors' 'actress' 'actual' 'actually' 'add' 'again'
 'against' 'age' 'agent' 'ago']


In [32]:
start = len(reviews[3]) // 2
source = reviews[3][-start:]
source_cv = cv.transform([source])
sim_result = cosine_similarity(source_cv, reviews_cv)
print(sorted(sim_result[0], reverse=True)[:10], '\n\n', np.argmax(sim_result[0]))

[0.7863570994927839, 0.29675603889533164, 0.2913971185543096, 0.287560657076852, 0.28240226157805165, 0.2794782784191074, 0.2790320425660623, 0.27542042454474985, 0.2682405828337715, 0.2681913598619224] 

 3


In [33]:
print(sim_result[0].argsort()[:10:-1])

[   3 1557  141 ... 1390  616 1689]
