# Read Data

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import re
import urllib.request
from konlpy.tag import Okt

In [2]:
ratings_train = pd.read_csv('ratings_train.txt', sep = "\t", engine='python')
ratings_test = pd.read_csv('ratings_test.txt', sep = "\t", engine='python')

In [3]:
print('ratings_train :',len(ratings_train))
print('ratings_test :',len(ratings_test))

ratings_train : 150000
ratings_test : 50000


# 전처리

## 중복 제거

In [4]:
ratings_train['document'].nunique(), ratings_train['label'].nunique()

(146182, 2)

In [5]:
ratings_train.drop_duplicates(subset=['document'], inplace=True) 

In [6]:
print('ratings_train :',len(ratings_train))

ratings_train : 146183


In [7]:
print(ratings_train.groupby('label').size().reset_index(name = 'count'))

   label  count
0      0  73342
1      1  72841


## nan값 확인

In [8]:
print(ratings_train.isnull().values.any())

True


In [9]:
print(ratings_train.isnull().sum())

id          0
document    1
label       0
dtype: int64


In [10]:
ratings_train.loc[ratings_train.document.isnull()]

Unnamed: 0,id,document,label
25857,2172111,,1


In [11]:
ratings_train = ratings_train.dropna(how = 'any') # Null 값이 존재하는 행 제거
print(ratings_train.isnull().values.any()) # Null 값이 존재하는지 확인

False


In [12]:
print(len(ratings_train))

146182


## 한글과 공백을 제외한 것 모두 제거
### 1) Train

In [13]:
ratings_train['document'] = ratings_train['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")

In [14]:
ratings_train['document'].replace('', np.nan, inplace=True)
print(ratings_train.isnull().sum())

id            0
document    391
label         0
dtype: int64


In [15]:
ratings_train = ratings_train.dropna(how = 'any')
print(len(ratings_train))

145791


In [16]:
ratings_train[ratings_train['document'].str.isspace()]

Unnamed: 0,id,document,label
404,4221289,,0
412,9509970,,1
470,10147571,,1
1312,5831045,,0
1549,7246718,,1
...,...,...,...
148549,9715918,,1
148566,10110521,,0
149309,6715725,,1
149630,3508604,,0


In [17]:
ratings_train = ratings_train.drop(ratings_train[ratings_train['document'].str.isspace()].index)

### 2) Test

In [18]:
ratings_test.drop_duplicates(subset = ['document'], inplace=True) # document 열에서 중복인 내용이 있다면 중복 제거
ratings_test['document'] = ratings_test['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","") # 정규 표현식 수행
ratings_test['document'].replace('', np.nan, inplace=True) # 공백은 Null 값으로 변경
ratings_test = ratings_test.dropna(how='any') # Null 값 제거
print('ratings_test :',len(ratings_test))

ratings_test : 48995


In [19]:
ratings_test = ratings_test.drop(ratings_test[ratings_test['document'].str.isspace()].index)

In [20]:
print('ratings_test :',len(ratings_test))

ratings_test : 48852


# stopwords 제거 X

In [21]:
okt = Okt()

In [22]:
X_train = []
for sen in ratings_train['document']:
    X = []
    X = okt.morphs(sen, stem=True) # 토큰화
    X_train.append(X)

In [23]:
X_test = []
for sen in ratings_test['document']:
    X = []
    X = okt.morphs(sen, stem=True) # 토큰화
    X_test.append(X)

## 나이브베이즈

In [24]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer

In [25]:
X_train_data = []
for i in X_train:
    t = " ".join(i)
    X_train_data.append(t)
    
X_test_data = []
for i in X_test:
    t = " ".join(i)
    X_test_data.append(t)

In [26]:
cv = CountVectorizer()
# DTM
x_train_cv = cv.fit_transform(X_train_data) # X_train 각 단어의 빈도 수를 기록한다.

# TF-IDF matrix 
# count vectorizer for other tasks, use TFIDFTransformer
tfidf_transformer = TfidfTransformer()

tf_train = tfidf_transformer.fit_transform(x_train_cv)

x_test_cv = cv.transform(X_test_data) # X_test 각 단어의 빈도 수를 기록한다.

tf_test = tfidf_transformer.transform(x_test_cv)

In [27]:
print('Tfidf_train:',tf_train.shape)
print('Tfidf_test:',tf_test.shape)

Tfidf_train: (145393, 42120)
Tfidf_test: (48852, 42120)


In [28]:
tf_train = tf_train.toarray()
tf_test = tf_test.toarray()

In [29]:
Y_train = ratings_train.label
Y_test = ratings_test.label

In [30]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

model = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
nb = model.fit(tf_train, Y_train)

pred = nb.predict(tf_test)
print("정확도:", accuracy_score(Y_test, pred))

정확도: 0.8269057561614673


In [None]:
ac = []

for i in range(0,5):
    ac.append(accuracy_score(Y_test, pred))

In [None]:
print(sum(ac)/len(ac))

## LSTM

In [31]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [33]:
thres = 3
total_cnt = len(tokenizer.word_index) # 단어의 수
rare_cnt = 0 # 등장 빈도수가 threshold보다 작은 단어의 개수를 카운트
total_freq = 0 # 훈련 데이터의 전체 단어 빈도수 총 합
rare_freq = 0 # 등장 빈도수가 threshold보다 작은 단어의 등장 빈도수의 총 합

for key, value in tokenizer.word_counts.items():
    total_freq = total_freq + value

    if(value < thres):
        rare_cnt = rare_cnt + 1
        rare_freq = rare_freq + value

In [34]:
vocab_size = total_cnt - rare_cnt + 2

tokenizer = Tokenizer(vocab_size, oov_token = 'OOV') 
tokenizer.fit_on_texts(X_train)


X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [35]:
drop_train = [index for index, sentence in enumerate(X_train) if len(sentence) < 1]

In [36]:
X_train = np.delete(X_train, drop_train, axis=0)
Y_train = np.delete(Y_train, drop_train, axis=0)

In [37]:
X_train = pad_sequences(X_train, maxlen = 78)
X_test = pad_sequences(X_test, maxlen = 78)

In [38]:
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [39]:
model = Sequential()
model.add(Embedding(vocab_size, 100))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

In [40]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

In [41]:
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train, Y_train, epochs=15, callbacks=[es, mc], batch_size=60, validation_split=0.2)

Epoch 1/15
Epoch 00001: val_acc improved from -inf to 0.84635, saving model to best_model.h5
Epoch 2/15
Epoch 00002: val_acc improved from 0.84635 to 0.85522, saving model to best_model.h5
Epoch 3/15
Epoch 00003: val_acc improved from 0.85522 to 0.85722, saving model to best_model.h5
Epoch 4/15
Epoch 00004: val_acc improved from 0.85722 to 0.86031, saving model to best_model.h5
Epoch 5/15
Epoch 00005: val_acc did not improve from 0.86031
Epoch 6/15
Epoch 00006: val_acc improved from 0.86031 to 0.86035, saving model to best_model.h5
Epoch 7/15
Epoch 00007: val_acc improved from 0.86035 to 0.86268, saving model to best_model.h5
Epoch 8/15
Epoch 00008: val_acc did not improve from 0.86268
Epoch 9/15
Epoch 00009: val_acc did not improve from 0.86268
Epoch 00009: early stopping


In [42]:
loaded_model = load_model('best_model.h5')

In [43]:
ac = []

for i in range(0,5):
    ac.append(loaded_model.evaluate(X_test, Y_test)[1])



In [44]:
print(sum(ac)/len(ac))

0.8590641021728516


# Tokenization - stopwords 제거

## 나이브베이즈

In [47]:
stopwords = pd.read_csv('stopwords.txt', sep = "\t", engine='python')

In [48]:
X_train = []
for sentence in ratings_train['document']:
    temp_X = []
    temp_X = okt.morphs(sentence, stem=True) # 토큰화
    temp_X = [word for word in temp_X if not word in stopwords] # 불용어 제거
    X_train.append(temp_X)

In [49]:
X_test = []
for sentence in ratings_test['document']:
    temp_X = []
    temp_X = okt.morphs(sentence, stem=True) # 토큰화
    temp_X = [word for word in temp_X if not word in stopwords] # 불용어 제거
    X_test.append(temp_X)

In [50]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer

In [51]:
X_train_data = []
for i in X_train:
    temp = " ".join(i)
    X_train_data.append(temp)
    
X_test_data = []
for i in X_test:
    temp = " ".join(i)
    X_test_data.append(temp)

In [52]:
cv = CountVectorizer()
# DTM
x_train_cv = cv.fit_transform(X_train_data) # X_train 각 단어의 빈도 수를 기록한다.

# TF-IDF matrix 
# count vectorizer for other tasks, use TFIDFTransformer
tfidf_transformer = TfidfTransformer()

tf_train = tfidf_transformer.fit_transform(x_train_cv)

x_test_cv = cv.transform(X_test_data) # X_test 각 단어의 빈도 수를 기록한다.

tf_test = tfidf_transformer.transform(x_test_cv)

In [53]:
tf_train = tf_train.toarray()
tf_test = tf_test.toarray()

In [54]:
Y_train = ratings_train.label
Y_test = ratings_test.label

In [55]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

model = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
nb = model.fit(tf_train, Y_train)

pred = nb.predict(tf_test)
print("정확도:", accuracy_score(Y_test, pred))

정확도: 0.8269057561614673


In [56]:
ac = []

for i in range(0,5):
    ac.append(accuracy_score(Y_test, pred))

In [57]:
print(sum(ac)/len(ac))

0.8269057561614673


## LSTM

In [58]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [59]:
threshold = 3
total_cnt = len(tokenizer.word_index) # 단어의 수
rare_cnt = 0 # 등장 빈도수가 threshold보다 작은 단어의 개수를 카운트
total_freq = 0 # 훈련 데이터의 전체 단어 빈도수 총 합
rare_freq = 0 # 등장 빈도수가 threshold보다 작은 단어의 등장 빈도수의 총 합

# 단어와 빈도수의 쌍(pair)을 key와 value로 받는다.
for key, value in tokenizer.word_counts.items():
    total_freq = total_freq + value

    # 단어의 등장 빈도수가 threshold보다 작으면
    if(value < threshold):
        rare_cnt = rare_cnt + 1
        rare_freq = rare_freq + value

In [60]:
vocab_size = total_cnt - rare_cnt + 2

tokenizer = Tokenizer(vocab_size, oov_token = 'OOV') 
tokenizer.fit_on_texts(X_train)


X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [61]:
drop_train = [index for index, sentence in enumerate(X_train) if len(sentence) < 1]

In [62]:
X_train = np.delete(X_train, drop_train, axis=0)
Y_train = np.delete(Y_train, drop_train, axis=0)

In [63]:
print('리뷰의 최대 길이 :',max(len(l) for l in X_train))

리뷰의 최대 길이 : 78


In [65]:
X_train = pad_sequences(X_train, maxlen = 78)
X_test = pad_sequences(X_test, maxlen = 78)

In [66]:
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [67]:
model = Sequential()
model.add(Embedding(vocab_size, 100))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

In [68]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

In [69]:
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(X_train, Y_train, epochs=15, callbacks=[es, mc], batch_size=60, validation_split=0.2)

Epoch 1/15
Epoch 00001: val_acc improved from -inf to 0.84463, saving model to best_model.h5
Epoch 2/15
Epoch 00002: val_acc improved from 0.84463 to 0.85515, saving model to best_model.h5
Epoch 3/15
Epoch 00003: val_acc improved from 0.85515 to 0.85962, saving model to best_model.h5
Epoch 4/15
Epoch 00004: val_acc improved from 0.85962 to 0.86141, saving model to best_model.h5
Epoch 5/15
Epoch 00005: val_acc did not improve from 0.86141
Epoch 6/15
Epoch 00006: val_acc improved from 0.86141 to 0.86152, saving model to best_model.h5
Epoch 7/15
Epoch 00007: val_acc did not improve from 0.86152
Epoch 8/15
Epoch 00008: val_acc did not improve from 0.86152
Epoch 00008: early stopping


In [70]:
for i in range(0,5):
    loaded_model = load_model('best_model.h5')
    print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, Y_test)[1]))


 테스트 정확도: 0.8591

 테스트 정확도: 0.8591

 테스트 정확도: 0.8591

 테스트 정확도: 0.8591

 테스트 정확도: 0.8591


In [71]:
ac = []

for i in range(0,5):
    ac.append(loaded_model.evaluate(X_test, Y_test)[1])



In [72]:
print(sum(ac)/len(ac))

0.8591050505638123
