In [1]:
import nltk
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
from konlpy.tag import Okt
import matplotlib.pyplot as plt
import matplotlib as mpl



In [2]:
# Collect every file id in the NLTK movie_reviews corpus and show how many there are.
fileids = movie_reviews.fileids()
len(fileids)

2000

In [3]:
# Category labels defined by the movie_reviews corpus.
categories = movie_reviews.categories()
categories

['neg', 'pos']

In [4]:
# Print the raw text of the first review in the corpus.
first_fileid = movie_reviews.fileids()[0]
print(movie_reviews.raw(first_fileid))

plot : two teen couples go to a church party , drink and then drive . 
they get into an accident . 
one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . 
what's the deal ? 
watch the movie and " sorta " find out . . . 
critique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . 
which is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn't snag this one correctly . 
they seem to have taken this pretty neat concept , but executed it terribly . 
so what are the problems with the movie ? 
well , its main problem is that it's simply too jumbled . 
it starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience member , have no id

In [5]:
# Load the raw text of every review, in the same order as `fileids`.
reviews = list(map(movie_reviews.raw, fileids))

In [6]:
# Count-based bag-of-words vectorizer for the English reviews.
# - token_pattern: a token is a run of 2+ word characters or apostrophes.
#   Written as a raw string so `\w` is passed to the regex engine verbatim
#   instead of being parsed as an (invalid) string escape, which triggers a
#   DeprecationWarning / SyntaxWarning on recent Python versions.
# - min_df=5: drop terms that occur in fewer than 5 documents.
# - max_df=0.5: drop terms that occur in more than 50% of the documents.
cv = CountVectorizer(token_pattern=r"[\w']{2,}", min_df=5, max_df=0.5)
cv

In [7]:
# Fit the vectorizer on all reviews and build the document-term count matrix,
# then peek at the first 20 entries of the learned vocabulary.
reviews_cv = cv.fit_transform(reviews)
cv.get_feature_names_out()[:20]

array(["'50s", "'60s", "'70s", "'80s", "'90s", "'97", "'cause", "'em",
       "'n", "'s", "'the", '00', '000', '007', '10', '100', '1000', '101',
       '102', '105'], dtype=object)

In [8]:
# Display the repr of the document-term matrix returned by cv.fit_transform.
reviews_cv

<2000x13549 sparse matrix of type '<class 'numpy.int64'>'
	with 512913 stored elements in Compressed Sparse Row format>

In [9]:
# Pair the first 20 vocabulary terms with their counts in the first review
# and print them on one line as `term:count, `.
leading_terms = cv.get_feature_names_out()[:20]
leading_counts = reviews_cv[0].toarray()[0, :20]
for word, count in zip(leading_terms, leading_counts):
    print(f'{word}:{count}', end=', ')

'50s:0, '60s:0, '70s:0, '80s:0, '90s:0, '97:0, 'cause:0, 'em:0, 'n:0, 's:0, 'the:0, 00:0, 000:0, 007:0, 10:10, 100:0, 1000:0, 101:0, 102:0, 105:0, 

In [10]:
# Load the Daum movie review CSV and preview the first 10 rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index; consider index_col=0 once it is confirmed nothing downstream
# relies on that column.
df = pd.read_csv('./data/daum_movie_review.csv')
df.head(10)

Unnamed: 0,review,rating,date,title
0,돈 들인건 티가 나지만 보는 내내 하품만,1,2018.10.29,인피니티 워
1,몰입할수밖에 없다. 어렵게 생각할 필요없다. 내가 전투에 참여한듯 손에 땀이남.,10,2018.10.26,인피니티 워
2,이전 작품에 비해 더 화려하고 스케일도 커졌지만.... 전국 맛집의 음식들을 한데 ...,8,2018.10.24,인피니티 워
3,이 정도면 볼만하다고 할 수 있음!,8,2018.10.22,인피니티 워
4,재미있다,10,2018.10.20,인피니티 워
5,나는 재밌게 봄,10,2018.10.14,인피니티 워
6,0.5점은 줄 수 없냐?,0,2018.10.10,인피니티 워
7,헐..다 죽었어....나중에 앤트맨 보다가도 깜놀...,10,2018.10.08,인피니티 워
8,충격 결말,9,2018.10.06,인피니티 워
9,응집력,8,2018.10.05,인피니티 워


In [11]:
# Module-level konlpy Okt tagger instance; my_tokenizer calls okt.pos() on it.
okt = Okt()

def my_tokenizer(doc):
    """Tokenize ``doc`` with the module-level ``okt`` tagger, keeping content words.

    Args:
        doc: Text passed to ``okt.pos``.

    Returns:
        A list with the surface form of every token whose POS tag is
        'Noun', 'Verb' or 'Adjective', in the order ``okt.pos`` yields them.
    """
    kept_tags = ['Noun', 'Verb', 'Adjective']
    tokens = []
    for surface, tag in okt.pos(doc):
        if tag in kept_tags:
            tokens.append(surface)
    return tokens

In [12]:
# Bag-of-words vectorizer for the Korean reviews, tokenized by my_tokenizer.
# - token_pattern=None: the regex pattern is not used when a custom tokenizer
#   is supplied; passing None makes that explicit and avoids scikit-learn's
#   "'token_pattern' will not be used since 'tokenizer' is not None" warning.
# - max_df=0.5: drop terms that occur in more than 50% of the documents.
# - min_df=3: drop terms that occur in fewer than 3 documents.
daum_cv = CountVectorizer(tokenizer=my_tokenizer, token_pattern=None, max_df=0.5, min_df=3)
# Fit on the `review` column and build the document-term count matrix.
daum_DTM = daum_cv.fit_transform(df.review)



In [13]:
# Print the first 20 entries of the vocabulary learned by daum_cv.
print(daum_cv.get_feature_names_out()[:20])

['가' '가가' '가게' '가겠구나' '가고' '가기' '가길' '가까운' '가까이' '가끔' '가난' '가난한' '가네'
 '가네요' '가는' '가는게' '가는줄' '가는줄도' '가는지' '가능']


In [14]:
# Display the repr of the document-term matrix returned by daum_cv.fit_transform.
daum_DTM

<14725x7510 sparse matrix of type '<class 'numpy.int64'>'
	with 160996 stored elements in Compressed Sparse Row format>

In [15]:
# Build a query document from the trailing half of the first review
# (its last len // 2 characters) and vectorize it with the fitted `cv`.
first_review = reviews[0]
start = len(first_review) // 2
source = first_review[-start:]
source_cv = cv.transform([source])

In [16]:
# Cosine similarity between the query vector and every review's count vector;
# row 0 of the result holds the scores for the single query document.
sim_result = cosine_similarity(source_cv, reviews_cv)
# Show the ten highest similarity scores, largest first.
top_scores = sorted(sim_result[0], reverse=True)[:10]
print(top_scores)

[0.8133571997439815, 0.4377260011235029, 0.40788385121640586, 0.3943460801762568, 0.3762392165682158, 0.3722086654901401, 0.3704628227294709, 0.3684668102586341, 0.36511111733428236, 0.3641964763358985]


In [18]:
# Index of the review with the highest similarity to the query.
sim_result[0].argmax()

0

In [25]:
# Top-10 review indices by similarity: sort the negated scores ascending.
np.argsort(-sim_result[0])[:10]

array([   0, 1846,  420, 1570, 1913, 1642,  225, 1393, 1917, 1993],
      dtype=int64)

In [30]:
# Top-10 review indices by similarity: sort ascending, reverse, take ten.
np.argsort(sim_result[0])[::-1][:10]

array([   0, 1846,  420, 1570, 1913, 1642,  225, 1393, 1917, 1993],
      dtype=int64)

In [31]:
# Convert the existing count matrix to TF-IDF weights with TfidfTransformer
# and show the resulting matrix shape.
tf_trans = TfidfTransformer()
reviews_tf = tf_trans.fit_transform(reviews_cv)
reviews_tf.shape

(2000, 13549)

In [32]:
# Build TF-IDF features directly from the raw reviews with TfidfVectorizer,
# using the same tokenization and document-frequency settings as `cv`.
# The pattern is a raw string so `\w` is passed to the regex engine verbatim
# instead of being parsed as an (invalid) string escape.
tf = TfidfVectorizer(token_pattern=r"[\w']{2,}", min_df=5, max_df=0.5)
# Note: this rebinds `reviews_tf`, replacing the TfidfTransformer result.
reviews_tf = tf.fit_transform(reviews)
reviews_tf.shape

(2000, 13549)

In [33]:
# Vectorize the query document with the fitted TF-IDF vectorizer and compute
# its cosine similarity against every review's TF-IDF vector.
source_tf = tf.transform([source])
sim_result_tf = cosine_similarity(source_tf, reviews_tf)

In [36]:
# Top-10 review indices by TF-IDF cosine similarity, most similar first.
np.argsort(sim_result_tf[0])[::-1][:10]

array([   0, 1570,  420, 1846, 1642,  225, 1993, 1917,  323, 1913],
      dtype=int64)