<a href="https://colab.research.google.com/github/lkjh1121/Colab/blob/master/Ch04.%ED%85%8D%EC%8A%A4%ED%8A%B8%20%EB%A7%88%EC%9D%B4%EB%8B%9D%20%EC%8B%A4%EC%8A%B5/2_%EC%98%81%ED%99%94_%EB%A6%AC%EB%B7%B0_%ED%85%8D%EC%8A%A4%ED%8A%B8_%EB%B6%84%EC%84%9D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install konlpy



In [2]:
"""
날짜 : 2022/05/17
이름 : 김재현
내용 : 영화 리뷰 텍스트 분석 실습
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle, re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Activation, Dropout, Embedding, LSTM
from konlpy.tag import Okt

In [3]:
#텍스트 데이터 불러오기
train_data = pd.read_table('/content/drive/MyDrive/파이썬 데이터 과학실습/file/movie_review_train.txt')
test_data = pd.read_table('/content/drive/MyDrive/파이썬 데이터 과학실습/file/movie_review_train.txt')
train_data

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1
...,...,...,...
149995,6222902,인간이 문제지.. 소는 뭔죄인가..,0
149996,8549745,평점이 너무 낮아서...,1
149997,9311800,이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?,0
149998,2376369,청춘 영화의 최고봉.방황과 우울했던 날들의 자화상,1


In [4]:
######################################################
#텍스트 전처리
######################################################

#분석기 생성
okt = Okt()

#불용어 정의
stop_words = ['은', '는', '이', '가', '하', '것', '들', '의', '있', '되', '등', '한']

#전처리 함수 정의
def preprocessing(txt):
  # 한글 텍스트가 아닌 문장 제거
  text = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", "", str(txt))

  # 문장 형태소 분석
  text = okt.morphs(text, stem=True)
  
  # 불용어 제거
  word_text = [w for w in text if not w in stop_words]

  return word_text

result1 = preprocessing('아 더빙.. 진짜 짜증나네요 목소리');
result2 = preprocessing('흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나');

print('result1 :', result1)
print('result2 :', result2)

result1 : ['아', '더빙', '진짜', '짜증나다', '목소리']
result2 : ['흠', '포스터', '보고', '초딩', '영화', '줄', '오버', '연기', '조차', '가볍다', '않다']


In [5]:
#훈련, 검증 데이터 전처리
train_txt = []
test_txt = []

for txt in train_data['document']:
  train_txt.append(preprocessing(txt))

for txt in test_data['document']:
  test_txt.append(preprocessing(txt))  

train_txt[:4]

[['아', '더빙', '진짜', '짜증나다', '목소리'],
 ['흠', '포스터', '보고', '초딩', '영화', '줄', '오버', '연기', '조차', '가볍다', '않다'],
 ['너', '무재', '밓었', '다그', '래서', '보다', '추천', '다'],
 ['교도소', '이야기', '구먼', '솔직하다', '재미', '없다', '평점', '조정']]

In [6]:
#토큰화
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_txt)

train_sequences = tokenizer.texts_to_sequences(train_txt)
test_sequences = tokenizer.texts_to_sequences(test_txt)

word_index = tokenizer.word_index
word_index

#토큰 저장
with open('/content/drive/MyDrive/파이썬 데이터 과학실습/file/movie_tokenizer.pickle', 'wb') as handle:
  pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
#패딩처리
train_padded = pad_sequences(train_sequences, maxlen=10, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=10, padding='post')

train_label = np.array(train_data['label'])
test_label = np.array(test_data['label'])

train_padded[0]

array([ 56, 466,  20, 267, 668,   0,   0,   0,   0,   0], dtype=int32)

In [8]:
#모델생성
model = Sequential()
model.add(Embedding(30000, 10))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 10)          300000    
                                                                 
 lstm (LSTM)                 (None, 128)               71168     
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 371,297
Trainable params: 371,297
Non-trainable params: 0
_________________________________________________________________


In [9]:
#모델설정
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [10]:
#모델학습
model.fit(train_padded, train_label, epochs=10, batch_size=128, validation_split=0.2)

Epoch 1/10


InvalidArgumentError: ignored

In [12]:
#모델평가
result = model.evaluate(test_padded, test_label)
result



InvalidArgumentError: ignored

In [11]:
#모델저장
model.save('/content/drive/MyDrive/파이썬 데이터 과학실습/file/movie_model.h5')

In [13]:
#모델 테스트 함수
def sentiment_predict(s):
  s = re.sub("[^가-힣ㅏ-ㅣㄱ-ㅎ]", "", s)
  
  s = okt.morphs(s, stem=True)
  s = [word for word in s if not word in stop_words]

  sequences = tokenizer.texts_to_sequences([s])
  padded = pad_sequences(sequences, maxlen=10, padding='post')

  score = float(model.predict(padded))
  return score

In [None]:
while True:
  txt = input('입력 :')
  score = sentiment_predict(txt)

  if score > 0.5:
    print("{:.2f} 확률로 긍정입니다.\n".format(score * 100))
  else:
    print("{:.2f} 확률로 부정입니다.\n".format((1 - score) * 100))

입력 :5
50.24 확률로 긍정입니다.

