In [1]:
import pandas as pd
import numpy as np

In [2]:
with open('2016_filtered_review.txt', encoding='utf-8') as f:
    docs = [doc.strip().split('\t\t') for doc in f]
    docs = [(doc[1], int(doc[2])) for doc in docs if len(doc) == 3]
    # To read the second and third column info from each row
    texts, scores = zip(*docs[:20000])
    # 둘을 분리해서 별도의 list 변수로 저장

In [3]:
filtered_texts = []
filtered_labels = []

for text, score in zip(texts, scores):
    if 5 <= score <= 9:
        continue
        
    # 평점 기준으로 문서에 label을 부여
    # 1 ~ 3 -> 부정, 0
    # 9 ~ 10 -> 긍정, 1
    filtered_texts.append(text)
    filtered_labels.append(1 if score > 9 else 0)

In [4]:
sum(filtered_labels)/len(filtered_texts)

0.8294408520349943

In [6]:
filtered_texts[0]

' 진심 쓰레기 영화 만들 무서 알 쫄아 틀었 이건 뭐 웃 거리 없는 쓰레기 영화 임'

In [7]:
filtered_texts[:2]

[' 진심 쓰레기 영화 만들 무서 알 쫄아 틀었 이건 뭐 웃 거리 없는 쓰레기 영화 임',
 ' 역대 좀비 영화 가장 최고다 원작 만화 읽어 보려 영화 보고 결정 하려 감독 간츠 실사 했 사람 거르려 그냥 봤 정말 흠잡 없는 최고 좀비 영화 잔인 거 싫어하지 참고 볼 만하 로미 인물 왜 그런 모르']

In [5]:
filtered_words = [doc.strip().split() for doc in filtered_texts]

In [6]:
total_words = []
for words in filtered_words:
    total_words.extend(words)

In [7]:
print(len(total_words))
print(len(set(total_words)))

153375
13909


In [9]:
from collections import Counter
c = Counter(total_words)

In [10]:
max_features = 5000 # 빈도수를 기준으로 상위 5000개의 단어만 사용
common_words = [ word for word, count in c.most_common(max_features)]

In [11]:
len(common_words)

5000

In [12]:
print(common_words)

['영화', '너무', '좋', '봤', '보고', '정말', '연기', '감동', '배우', '진짜', '대통령', '였', '사람', '했', '입니', '것', '먹', '그', '더', '노무현', '이', '그립', '눈물', '보는', '잘', '수', '생각', '가슴', '하는', '분', '마음', '꼭', '봐', '현실', '알', '이런', '최고', '볼', '우리', '때', '있는', '다시', '본', '할', '스토리', '역사', '나', '말', '같은', '좀비', '그냥', '아니', '같아', '보면', '없는', '여운', '시간', '평점', '또', '청춘', '내', '역시', '한번', '울었', '지금', '이었', '많은', '살', '재미', '어른', '점', '이야기', '순정', '있었', '마지막', '내용', '들', '아이', '왜', '한', '친구', '처음', '되', '짱구', '내내', '울', '하게', '재밌게', '된', '슬프', '왔', '공유', '그런', '참', '사랑', '모습', '되는', '장면', '함', '기대', '재밌었', '원피스', '같다', '거', '보러', '저', '국민', '보세', '극장판', '하나', '있', '모르', '좀', '합', '인간', '넘', '손예진', '느낌', '와', '정도', '감사합', '모두', '끝', '인생', '일본', '감정', '당신', '감독', '너무나', '않고', '안', '돈', '덕혜옹주', '재밌어', '아닌', '같', '짱', '듯', '요즘', '세상', '일', '나라', '전도연', '속', '시대', '보기', '부산', '남', '순수한', '계속', '못', '펑펑', '완전', '잼', '없다', '다른', '요', '오랜만', '재미있었', '상영', '하지', '때문', '중간', '이상', '여자', '없었', '있어', '도경수', '않았', '보여', '갔', '걸', '

In [13]:
words_dic ={}  # 각 단어에 index 번호 부여
words_index_dic={} # index 번호가 key가 됨, value는 단어
for index, word in enumerate(common_words):
    words_dic[word]=index
    words_index_dic[index]=word

In [14]:
filtered_indexed_words = [] # index 번호가 부여된 단어들만을 사용해서 각 문서를 표현
for review in filtered_words:
    indexed_words=[]
    for word in review:
        try:
            indexed_words.append(words_dic[word])
        except: 
            pass
    filtered_indexed_words.append(indexed_words)

In [15]:
filtered_indexed_words[0] # 첫번째 영화 리뷰에 사용된 단어들의 인덱스

[227, 349, 0, 319, 34, 225, 182, 1050, 811, 54, 349, 0, 185]

In [16]:
# 어떠한 단어들인지 확인
[words_index_dic[index] for index in filtered_indexed_words[0]]

['진심', '쓰레기', '영화', '만들', '알', '이건', '뭐', '웃', '거리', '없는', '쓰레기', '영화', '임']

In [19]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(filtered_indexed_words, filtered_labels, test_size=0.2)

In [17]:
import numpy as np

def vectorize_sequences(sequences, dimension=max_features):
    # Create an all-zero matrix of shape (len(sequences), dimension)
    # len(sequences) => number of documents
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences): # sequence => 단어들의 index로 구성되어 있는 리스트
        results[i, sequence] = 1.  # set specific indices of results[i] to 1s
    return results

In [28]:
X_train_indexed = vectorize_sequences(X_train)
X_test_indexed = vectorize_sequences(X_test)

In [29]:
X_train_indexed[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [30]:
len(X_train_indexed[0])

5000

In [24]:
X_train_indexed.shape

(10516, 5000)

In [25]:
y_train[0]

1

In [31]:
from tensorflow.keras.utils import to_categorical
y_train_one_hot = to_categorical(y_train)
y_test_one_hot = to_categorical(y_test)

In [32]:
y_train_one_hot[0]

array([0., 1.], dtype=float32)

In [32]:
from tensorflow.keras import models
from tensorflow.keras import layers

model = models.Sequential()
model.add(layers.Dense(32, activation='relu', input_shape=(max_features,)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(2, activation='softmax'))

In [33]:
from tensorflow.keras.optimizers import RMSprop
model.compile(optimizer=RMSprop(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [34]:
X_train_indexed.shape

(10516, 5000)

In [35]:
history = model.fit(X_train_indexed, y_train_one_hot, epochs=10, batch_size=128, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [36]:
test_loss, test_acc = model.evaluate(X_test_indexed, y_test_one_hot)
print('test_acc:', test_acc)

test_acc: 0.9140357375144958


In [28]:
len(X_test[0])

16

In [34]:
model.predict(X_test_indexed)[:40]

array([[7.75632322e-01, 2.24367633e-01],
       [9.37626045e-03, 9.90623772e-01],
       [3.70502597e-08, 1.00000000e+00],
       [5.32671763e-03, 9.94673312e-01],
       [3.86480824e-03, 9.96135235e-01],
       [2.69148018e-06, 9.99997258e-01],
       [2.63371654e-02, 9.73662853e-01],
       [2.51541939e-02, 9.74845827e-01],
       [3.09321168e-03, 9.96906817e-01],
       [3.72912854e-01, 6.27087176e-01],
       [8.64512622e-01, 1.35487333e-01],
       [2.50578509e-04, 9.99749362e-01],
       [1.21169156e-04, 9.99878764e-01],
       [1.91863105e-02, 9.80813682e-01],
       [6.58653909e-03, 9.93413508e-01],
       [8.70946273e-02, 9.12905395e-01],
       [1.18157901e-01, 8.81842136e-01],
       [9.17396069e-01, 8.26039836e-02],
       [6.22274383e-05, 9.99937773e-01],
       [1.28223617e-02, 9.87177610e-01],
       [1.69623315e-01, 8.30376685e-01],
       [3.48543283e-04, 9.99651432e-01],
       [4.19936939e-08, 1.00000000e+00],
       [8.44086893e-03, 9.91559148e-01],
       [2.141427

In [30]:
c = 0
for k in model.predict(X_test_indexed):
    if k[0]>0.5:
        print(c, k)
    c=c+1

3 [0.9047961  0.09520394]
6 [0.5247267  0.47527328]
10 [0.6331061  0.36689386]
12 [0.62674105 0.37325892]
13 [0.7420787  0.25792122]
16 [0.6575557  0.34244433]
43 [0.778568   0.22143193]
59 [0.8863832  0.11361685]
71 [0.7523109 0.2476891]
80 [0.5487035 0.4512965]
84 [0.6115333  0.38846675]
93 [0.8807358  0.11926413]
99 [0.5061654  0.49383458]
113 [0.6199672 0.3800328]
116 [0.56640685 0.4335931 ]
123 [0.6748398  0.32516024]
127 [0.61390644 0.38609356]
139 [0.6701604  0.32983953]
140 [0.8101536  0.18984644]
141 [0.50548345 0.49451655]
142 [0.6648758 0.3351242]
146 [0.62297606 0.377024  ]
147 [0.79124606 0.20875391]
154 [0.8020519  0.19794813]
156 [0.63310015 0.36689985]
163 [0.67738444 0.32261556]
167 [0.52910376 0.4708962 ]
169 [0.5600746  0.43992537]
173 [0.86147475 0.13852525]
181 [0.70006937 0.29993063]
196 [0.58235735 0.41764268]
200 [0.53249425 0.4675057 ]
202 [0.54691887 0.45308113]
206 [0.91363126 0.08636871]
207 [0.76142865 0.23857136]
216 [0.76880515 0.23119493]
218 [0.8315142 

In [35]:
y_test[:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]

In [36]:
y_test_one_hot[0]

array([0., 1.], dtype=float32)

In [23]:
words_dic_reverse = {}
for key in words_dic:
    words_dic_reverse[words_dic[key]]=key

In [24]:
words_dic_reverse[0]

'영화'

In [25]:
# Test data의 첫번째 리뷰
for index in X_test[0]:
    print(words_dic_reverse[index])

좀비
특성
인물
묘사
연출
스토리
모두
부산
행
배
됨
진짜
재밌다


In [40]:
for index in X_test[4]:
    print(words_dic_reverse[index])

감사합
너무
잘
봤


In [26]:
for index in X_test[0]:
    print(words_index_dic[index])

좀비
특성
인물
묘사
연출
스토리
모두
부산
행
배
됨
진짜
재밌다
