In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import re
import urllib.request
import mecab
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
df = pd.read_csv('./lable18_22.csv')
df.head()

Unnamed: 0,labels,kor_sentence,tokenized
0,1.0,[ET투자뉴스]CMG제약_기관의 힘? 대량순매수 이후.. 현재 +3.05%,"['[', 'ET', '투자', '뉴스', ']', 'CMG', '제약', '_',..."
1,1.0,[한경로보뉴스] '와이지엔터테인먼트' 52주 신고가 경신,"['[', '한경', '로보', '뉴스', ']', ""'"", '와', '이지', '..."
2,1.0,"CMG제약(058820) 종목알파고 분석, 외국인/기관 실시간 수급과 추가 매수 확률은?","['CMG', '제약', '(', '058820', ')', '종목', '알파', ..."
3,0.0,"CMG제약, 전일 대비 약 -4% 하락한 4,515원","['CMG', '제약', ',', '전일', '대비', '약', '-', '4', ..."
4,1.0,[Hot Stock] CMG제약,"['[', 'Hot', 'Stock', ']', 'CMG', '제약']"


In [3]:
print(f'중립의 비율 = {round(df["labels"].value_counts()[0]/len(df) * 100,3)}%')
print(f'긍정의 비율 = {round(df["labels"].value_counts()[1]/len(df) * 100,3)}%')
print(f'부정의 비율 = {round(df["labels"].value_counts()[2]/len(df) * 100,3)}%')

중립의 비율 = 38.044%
긍정의 비율 = 51.086%
부정의 비율 = 10.87%


# 다시 tokenized

In [None]:
df = df.drop('tokenized',axis=1)
df.head(2)

In [18]:
mecab = mecab.MeCab()
df['tokenized'] = df['kor_sentence'].apply(mecab.morphs)

# 불용어 처리

In [4]:
stop_word = pd.read_csv('./stop_word_1.csv')
stop_word.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   word    1313 non-null   object
dtypes: object(1)
memory usage: 10.4+ KB


In [None]:
tokenized_1 = []
for row in df['tokenized']:
    print(row)
    tmp =[]
    for item in row:
        print(item)
        if item not in stop_word:
            tmp.append(item)
    tokenized_1.append(tmp)

In [26]:
len(tokenized_1)

41307

In [28]:
# tokenized_1[:5]

In [None]:
df['tokenized'][0][10:20]

In [None]:
print(tokenized_1)

In [None]:
len(tokenized_1), len(df)

In [None]:
df['tokenized_1'] = tokenized_1

In [None]:
df['tokenized_1'].head(2)

# 훈련 - 테스트

In [None]:
X = df['tokenized']
y = df['labels']
print('본문의 개수: {}'.format(len(X)))
print('레이블의 개수: {}'.format(len(y)))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

# tokenize

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_encoded = tokenizer.texts_to_sequences(X_train)
X_test_encoded = tokenizer.texts_to_sequences(X_test)
print(X_train_encoded[:5])
print(X_test_encoded[:5])

In [None]:
word_to_index = tokenizer.word_index
vocab_size = len(word_to_index) + 1
print(vocab_size)

## 패딩

In [None]:
X_train[:2]

In [None]:
print(  '본문의 최대 길이 :',   max(   len(sent) for sent in X_train   )   )
# X_train 요소 하나하나 길이 재서 최대길이추출
print(   '본문의 평균 길이 :',  sum(    map(len, X_train)  )   /  len(X_train)     )
# X_train 요소 길이 재서 sum하구 X_train 개수로 나눈다
plt.hist([len(sent) for sent in X_train], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
max_len = 76

In [None]:
X_train_encoded = pad_sequences(X_train_encoded, maxlen=max_len)
X_test_encoded = pad_sequences(X_test_encoded, maxlen=max_len)

In [None]:
X_train_encoded.shape

In [None]:
X_test_encoded.shape

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
X_test[:1]

In [None]:
X_train_encoded[0]

In [None]:
X_train_encoded.shape

In [None]:
y_test[8]

In [None]:
y_train.shape

In [None]:
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [None]:
y_train.shape

 * 테스트 정확도: 0.8574

In [None]:
embedding_dim = 64
hidden_units = 64
num_classes = 3

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(LSTM(hidden_units))
model.add(Dense(num_classes, activation='softmax'))

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('./NLP/3.best_model_18_22.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
history = model.fit(X_train_encoded, y_train, epochs=15, callbacks=[es, mc], batch_size=32, validation_split=0.2)

In [None]:
loaded_model = load_model('./NLP/3.best_model_18_22.h5')
print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test_encoded, y_test)[1]))

In [None]:
loaded_model