### 데이터 출처
https://www.kaggle.com/vivekrathi055/sentiment-analysis-on-financial-tweets

In [4]:
import pandas as pd
import tensorflow as tf

### 드라이브 데이터 다운로드

In [1]:
# valid 
!gdown --id 1EBXe8-U5OnDMNbgMRcIDygJzdTKOtEA0
# train
!gdown --id 1rLFoEejWhc_S2bTEHy7CoDc5jBTpbIBe

Downloading...
From: https://drive.google.com/uc?id=1EBXe8-U5OnDMNbgMRcIDygJzdTKOtEA0
To: /content/valid.csv
100% 31.5k/31.5k [00:00<00:00, 54.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1rLFoEejWhc_S2bTEHy7CoDc5jBTpbIBe
To: /content/train.csv
100% 124k/124k [00:00<00:00, 44.2MB/s]


### 데이터 형태 확인

- 0 : 부정
- 1 : 중립
- 2 : 긍정

In [5]:
train = pd.read_csv('train.csv', header = None)

train[:5]

Unnamed: 0,0,1
0,critic survey ashford hospit prime ahp amp kim...,0
1,analyst adopt bullish outlook robert half inte...,1
2,zack rank strong buy semiconductor stock mlnx ...,2
3,setup like watch wed roku iq sfix shop spot ua...,2
4,invesco ivz price target lower credit suiss group,1


### 데이터 기반 vocabulary 생성

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [23]:
tokenizer = Tokenizer(num_words=1000,oov_token='<OOV>')
tokenizer.fit_on_texts(train.iloc[:,0])

In [24]:
vocab = list(tokenizer.word_docs)[:1000]

### keras 모듈 임포트

In [36]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding, Bidirectional, LSTM
from tensorflow.keras.losses import CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

### 데이터 불러오기 

In [10]:
column_names = ['text', 'label']
column_defaults = ['string', 'int32']
root = './'
train_path = root + 'train.csv'
valid_path = root + 'valid.csv'

# train data set
train_dataset = tf.data.experimental.make_csv_dataset(train_path, column_defaults=column_defaults,
                                                      column_names=column_names, label_name='label',
                                                      batch_size=320, header=False, num_epochs=1)

# valid data set
valid_dataset = tf.data.experimental.make_csv_dataset(valid_path, column_defaults=column_defaults,
                                                      column_names=column_names, label_name='label',
                                                      batch_size=320, header=False, num_epochs=1)

#### map 전

In [None]:
next(train_dataset.as_numpy_iterator())

#### map 후

In [11]:
train_dataset = train_dataset.map(lambda text, label:(text['text'], label))
valid_dataset = valid_dataset.map(lambda text, label:(text['text'], label))

In [None]:
next(train_dataset.as_numpy_iterator())

### Train

In [38]:
# 단어를 벡터로 바꾸는 encoder
encoder = TextVectorization(vocabulary=vocab, output_sequence_length=200)

# RNN
# 단어 -> 인코더 -> 임베딩 => 양방향 RNN -> dence -> dence 구조
model = Sequential([
    encoder,
    Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=300, mask_zero=True),
    Bidirectional(LSTM(300)),
    Dense(300, activation='relu'),
    # class가 3개기 때문에 마지막 layer는 3
    Dense(3)
])

# loss 함수 선택도 매우 중요 !
model.compile(loss = SparseCategoricalCrossentropy(from_logits=True),
              optimizer = 'Adam',
              metrics = ['accuracy'])


history = model.fit(train_dataset, epochs = 20, validation_data = valid_dataset, use_multiprocessing=True, workers=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### test 데이터 확인

In [39]:
!gdown --id 1ugaRfNbetYH2dxrS8cB5KR07s1kCBPG1

Downloading...
From: https://drive.google.com/uc?id=1ugaRfNbetYH2dxrS8cB5KR07s1kCBPG1
To: /content/test.csv
  0% 0.00/168k [00:00<?, ?B/s]100% 168k/168k [00:00<00:00, 51.0MB/s]


In [41]:
test_path = root + 'test.csv'

test_dataset = tf.data.experimental.make_csv_dataset(test_path, column_defaults=column_defaults,
                                                     column_names = column_names, header=False,
                                                     num_epochs = 1, batch_size = 32, label_name = 'label')

In [43]:
test_dataset = test_dataset.map(lambda text, label:(text['text'], label))

In [44]:
loss, acc = model.evaluate(test_dataset)
print('test loss : {}\ntest acc : {}'.format(loss, acc))

test loss : 1.1487869024276733
test acc : 0.7905505895614624
