# 감성분석 예제  2: 영화 감상평  예제


## 

In [None]:
import numpy as np 
import pandas as pd 
# import os

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')


In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from bs4 import BeautifulSoup
import re


from tqdm import tqdm


In [None]:
from tensorflow.keras.utils import to_categorical
import random
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense,Dropout,Embedding,LSTM
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential

from tensorflow.keras.layers import Bidirectional


# Seed TensorFlow's global random number generator.
tf.random.set_seed(123)

# Seed Python's built-in `random` module with the same value.
random.seed(123)

## 

In [None]:
# Load the training data from a tab-separated file in the working directory.
train= pd.read_csv("train.tsv", sep="\t")  # tsv: tab-separated values

# Preview the first rows of the loaded frame.
train.head()

In [None]:
# Show the (rows, columns) dimensions of the training frame.
train.shape

# 

In [None]:

def clean_sentences(df):
    """Turn every entry of ``df['Phrase']`` into a list of lemmatized tokens.

    Each phrase has every non-ASCII-letter character replaced by a space, is
    lower-cased, tokenized with ``word_tokenize`` and then lemmatized word by
    word with the module-level ``lemmatizer``.

    Args:
        df: Object exposing a ``'Phrase'`` column of strings (presumably a
            pandas DataFrame; the entries are passed straight to ``re.sub``).

    Returns:
        A list with one entry per phrase, each entry being the list of
        lemmatized tokens for that phrase.
    """
    cleaned = []

    for phrase in tqdm(df['Phrase']):
        # HTML stripping is currently disabled:
        # letters_only = BeautifulSoup(phrase).get_text()

        # Keep ASCII letters only; everything else becomes a space.
        letters_only = re.sub("[^a-zA-Z]", " ", phrase)

        # Lower-case, then split into word tokens.
        tokens = word_tokenize(letters_only.lower())

        # Reduce each token to its lemma and store the phrase's token list.
        cleaned.append([lemmatizer.lemmatize(token) for token in tokens])

    return cleaned



In [None]:
# import nltk
# nltk.download('punkt')
# nltk.download('wordnet')

In [None]:

# Clean the training phrases; the result holds one token list per phrase.
train_sentences = clean_sentences(train)
# test_sentences = clean_sentences(test)
# Number of cleaned training phrases.
print(len(train_sentences))
# print(len(test_sentences))

In [None]:
# Sentiment labels, one per phrase.
target = train['Sentiment'].values
print(target, target.shape)       # (156060,)

# Convert the labels to one-hot vectors.
y_target = to_categorical(target)
print(y_target, y_target.shape)   # (156060, 5)

# The width of the one-hot matrix is the number of classes.
num_classes = y_target.shape[1]
print(f'num_classes: {num_classes}')

#

In [None]:
# Hold out 20% of the phrases for validation, keeping the class proportions
# of the one-hot targets in both splits. random_state is fixed (same value as
# the other seeds in this notebook) because train_test_split otherwise draws
# from NumPy's global RNG, which is never seeded here, and would produce a
# different split on every run.
X_train, X_val, y_train, y_val = train_test_split(
    train_sentences,
    y_target,
    test_size=0.2,
    stratify=y_target,
    random_state=123,
)

In [None]:
# Display the training split (token lists at this point in the notebook).
X_train

## 

In [None]:
#

# Scan the training split once to collect its vocabulary and the length of
# its longest sentence (used later as the padding length).
unique_words = set()
len_max = 0

for sent in tqdm(X_train):
    # The set removes duplicate words automatically.
    unique_words.update(sent)
    # Track the longest sentence seen so far.
    len_max = max(len_max, len(sent))

# Vocabulary size, then the maximum sentence length.
print(len(unique_words))
print(len_max)

In [None]:
# Peek at one vocabulary entry. Sets are unordered, so which word ends up
# last in the list is arbitrary.
list(unique_words)[-1] # last word

## 토큰화

In [None]:
# Fit the tokenizer on the training split only, capping its vocabulary at the
# number of distinct training words.
# NOTE(review): Keras documents that only the most common num_words-1 words
# are kept, so the rarest word appears to be dropped here - confirm whether
# num_words (and the Embedding input size) should be one larger.
tokenizer = Tokenizer(num_words=len(unique_words))
tokenizer.fit_on_texts(list(X_train))

# Map each token list to its integer sequence, then pad every sequence to
# len_max so both splits become fixed-width arrays.
X_train = sequence.pad_sequences(
    tokenizer.texts_to_sequences(X_train), maxlen=len_max
)
X_val = sequence.pad_sequences(
    tokenizer.texts_to_sequences(X_val), maxlen=len_max
)
# X_test = sequence.pad_sequences(
#     tokenizer.texts_to_sequences(test_sentences), maxlen=len_max
# )

print(X_train.shape, X_val.shape)

## 학습 모델 구현

In [None]:
# Stop training once validation accuracy has failed to improve by at least
# 0.001 for 2 consecutive epochs.
# The model is compiled with metrics=['accuracy'], so tf.keras logs the
# validation metric as 'val_accuracy'. Monitoring 'val_acc' finds no such
# key, and the callback only warns and never stops training.
early_stopping = EarlyStopping(
    min_delta=0.001,
    mode='max',
    monitor='val_accuracy',
    patience=2,
)
callback = [early_stopping]

In [None]:
# Two stacked LSTM layers over a 300-dimensional embedding, followed by a
# dense classifier head with one softmax output per sentiment class.
model = Sequential([
    # One embedding row per vocabulary index; inputs are padded to len_max.
    Embedding(len(unique_words), 300, input_length=len_max),
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(128, dropout=0.5, recurrent_dropout=0.5, return_sequences=True),
    # The second LSTM returns only its final state for classification.
    LSTM(64, dropout=0.5, recurrent_dropout=0.5, return_sequences=False),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.005),
    metrics=['accuracy'],
)
model.summary()

In [None]:

# Train for at most 3 epochs in batches of 256, evaluating on the validation
# split after each epoch; the callbacks in `callback` may end training sooner.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=3,
    batch_size=256,
    verbose=1,
    callbacks=callback,
)

In [None]:
import matplotlib.pyplot as plt

# Per-epoch loss curves recorded by model.fit.
train_loss = history.history['loss']
val_loss = history.history['val_loss']

# Epochs are numbered from 1 on the x-axis.
epoch_count = range(1, len(train_loss) + 1)

# Training loss as a dashed red line, validation loss as a solid blue line.
plt.plot(epoch_count, train_loss, 'r--')
plt.plot(epoch_count, val_loss, 'b-')
plt.legend(['Training Loss', 'Validation Loss'])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

### 정확도 측정