#### **데이터 다운로드**

In [1]:
# valid
!gdown --id 1EBXe8-U5OnDMNbgMRcIDygJzdTKOtEA0

# train
!gdown --id 1rLFoEejWhc_S2bTEHy7CoDc5jBTpbIBe

# test
!gdown --id 1ugaRfNbetYH2dxrS8cB5KR07s1kCBPG1

Downloading...
From: https://drive.google.com/uc?id=1EBXe8-U5OnDMNbgMRcIDygJzdTKOtEA0
To: /content/valid.csv
100% 31.5k/31.5k [00:00<00:00, 4.68MB/s]
Downloading...
From: https://drive.google.com/uc?id=1rLFoEejWhc_S2bTEHy7CoDc5jBTpbIBe
To: /content/train.csv
100% 124k/124k [00:00<00:00, 4.01MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ugaRfNbetYH2dxrS8cB5KR07s1kCBPG1
To: /content/test.csv
100% 168k/168k [00:00<00:00, 5.33MB/s]


#### **데이터 형태 확인**

In [2]:
import pandas as pd

In [3]:
train = pd.read_csv('train.csv', header = None)

train[:5]

Unnamed: 0,0,1
0,critic survey ashford hospit prime ahp amp kim...,0
1,analyst adopt bullish outlook robert half inte...,1
2,zack rank strong buy semiconductor stock mlnx ...,2
3,setup like watch wed roku iq sfix shop spot ua...,2
4,invesco ivz price target lower credit suiss group,1


In [4]:
!pip install torchtext==0.8.0

Collecting torchtext==0.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/26/8a/e09b9b82d4dd676f17aa681003a7533765346744391966dec0d5dba03ee4/torchtext-0.8.0-cp37-cp37m-manylinux1_x86_64.whl (6.9MB)
[K     |████████████████████████████████| 7.0MB 16.2MB/s 
Installing collected packages: torchtext
  Found existing installation: torchtext 0.10.0
    Uninstalling torchtext-0.10.0:
      Successfully uninstalled torchtext-0.10.0
Successfully installed torchtext-0.8.0


#### **모듈 임포트**

In [6]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim 
from torchtext.data import Field, TabularDataset, BucketIterator

from sklearn.metrics import classification_report
from nltk.tokenize import word_tokenize

import os
import nltk

In [7]:
# nltk tokenizer 사용
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
# 데이터 output root 설정 및 gpu설정
output_file_path = './model/'
os.makedirs(output_file_path, exist_ok= True)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# random seed 고정
torch.manual_seed(777)
if device == 'cuda':
  torch.cuda.manual_seed_all(777)

#### **필드 정의**
필드를 통해 앞으로 어떤 전처리를 할 것인지 정의 <br>
label과 text는 각각 정의

In [9]:
# 텍스트 데이터 필드 정의
text_field = Field(tokenize=word_tokenize, lower=True, include_lengths=True, batch_first=True)
# 라벨 데이터 필드 정의
label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.long)

fields = [('text', text_field), ('label', label_field)]



#### **데이터 불러오기**

In [10]:
# train, valid 데이터 불러오기
train, valid = TabularDataset.splits(path='./', train = 'train.csv', validation='valid.csv',
                                    format='csv',fields = fields, skip_header = True)



In [11]:
train.fields.items()

dict_items([('text', <torchtext.data.field.Field object at 0x7ff8ea465250>), ('label', <torchtext.data.field.Field object at 0x7ff8ea4652d0>)])

#### **데이터 로더로 만들기**

In [12]:
# train, valid 각각 iterater 데이터 로더로 만들기
# train
train_loader = BucketIterator(train, batch_size=32, sort_key=lambda x: len(x.text),
                              device = device, sort = True, sort_within_batch = True)

# valid
valid_loader = BucketIterator(valid, batch_size=32, sort_key=lambda x: len(x.text),
                              device = device, sort = True, sort_within_batch = True)



In [13]:
text_field.build_vocab(train, min_freq = 5)
vocab_size = len(text_field.vocab) # 1016개

In [14]:
next(iter(train_loader))




[torchtext.data.batch.Batch of size 32]
	[.text]:('[torch.cuda.LongTensor of size 32x5 (GPU 0)]', '[torch.cuda.LongTensor of size 32 (GPU 0)]')
	[.label]:[torch.cuda.LongTensor of size 32 (GPU 0)]

In [15]:
print(len(train))
print(train[0].text)

1999
['analyst', 'adopt', 'bullish', 'outlook', 'robert', 'half', 'intern', 'inc', 'rhi']


#### **LSTM Classifier 클래스 작성**

In [16]:
class LSTMClassifier(nn.Module):

  # pytorch로 모델 정의할 때 반드시 nn.Module을 상속받은 후 진행
  def __init__(self, vocab_size, dimension = 128):
    # 클래스의 첫 시작인 함수, 필요한 여러 변수 선언

    super(LSTMClassifier, self).__init__() # 필수 코드 ! 

    # LSTM에 필요한 변수 선언

    self.embedding = nn.Embedding(vocab_size, 300)
    self.dimension = dimension
    self.lstm = nn.LSTM(input_size=300, hidden_size=dimension, num_layers=1, batch_first = True, bidirectional=True)
    self.drop = nn.Dropout(p=0.5)

    self.fc = nn.Linear(2*dimension, 3) # 클래스가 3개라 3으로 끝나야 함 

  def forward(self, text, text_len):
    # 모델의 forward를 수행하는 함수
    # text, text_len을 변수로 받아 신경망 모델을 forward 방향으로 탈 때 출력을 반환
    # 단어 -> encoder -> embedding -> 양방향RNN (LSTM) -> dense -> dense 구조

    text_embedding = self.embedding(text) # embedding

    # text마다 길이가 다르기 때문에 padding으로 채워주기
    packed_input = pack_padded_sequence(text_embedding, text_len.cpu(), batch_first=True, enforce_sorted=False) 
    packed_output, _ = self.lstm(packed_input)
    output, _ = pad_packed_sequence(packed_output, batch_first=True) 

    out_forward = output[range(len(output)), text_len - 1, :self.dimension] # 잘 모르겠는데,,?
    out_reverse = output[:, 0, self.dimension: ] # 무슨 의미일까,,
    out_reduced = torch.cat((out_forward, out_reverse), 1)
    text_fea = self.drop(out_reduced)

    text_out = self.fc(text_fea)
    return text_out


#### **Train** 

In [17]:
# train 함수

def train(model, device, optimizer, train_loader, valid_loader, output_file_path, num_epochs = 5):
  # 학습에 필요한 변수 선언

  running_loss = 0.0
  global_step = 0
  global_step_list = []
  train_loss_list = []
  valid_loss_list = []
  loss_fn = nn.CrossEntropyLoss()
  best_valid_loss = float('Inf')
  eval_every = 10 # 무슨 용도지

  # 모델에게 학습이 진행됨을 알려줌
  model.train()

  for epoch in range(num_epochs):
    for ((text, text_len), labels), _ in train_loader:
      # Gpu로 사용할 수 있게 데이터 변환
      text = text.to(device)
      text_len = text_len.to(device)
      labels = labels.to(device)

      # model안에서 정의한 forward 진행
      output = model(text, text_len)

      # forward값과 실제의 차이를 loss로 받음
      loss = loss_fn(output, labels)

      # optimizer 수행
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      running_loss += loss.item()
      global_step +=1

      if global_step % eval_every == 0:
        # 10번에 한번 validation을 이용하여 성능 검증
        average_train_loss, average_valid_loss = evaluate(model, device, valid_loader, loss_fn,
                                                          running_loss, eval_every)
        
        # 검증이 끝난 뒤 모델에게 학습을 준비 시킴
        running_loss = 0.0
        model.train()

        # 결과 출력
        # print('text :{}\nlabel :{}\n'.format(text, labels))
        print('Epoch {}, Step {}, train_loss : {:.4f}, valid_loss :{:.4f}'.format(epoch + 1, global_step, average_train_loss, average_valid_loss))

        # 결과 저장
        train_loss_list.append(average_train_loss)
        valid_loss_list.append(average_valid_loss)
        global_step_list.append(global_step) # 이건 왜 저장하지


        # 기존보다 상태가 좋으면 저장
        if best_valid_loss > average_valid_loss:
          best_valid_loss = average_valid_loss
          save_checkpoint(output_file_path + '/model.pt', model, optimizer, best_valid_loss)
          save_metrics(output_file_path + '/metrics.pt', train_loss_list, valid_loss_list, global_step_list)
      
  save_metrics(output_file_path + '/metrics.pt', train_loss_list, valid_loss_list, global_step_list)

#### **Evaluate**

In [18]:
# train 안에서 사용할 evaluation 함수 (검증 함수)
def evaluate(model, device, valid_loader, loss_fn, running_loss, eval_every):

  # 모델한테 평가 중이라고 알림
  model.eval()
  valid_running_loss = 0.0

  # 학습이 아니기에 최적화를 하지 않는다는 환경을 설정 
  with torch.no_grad():
    for ((text, text_len), labels), _ in valid_loader:
      # GPU 태우기
      text = text.to(device)
      text_len = text_len.to(device)
      labels = labels.to(device)

      output = model(text, text_len)

      loss = loss_fn(output, labels)
      valid_running_loss += loss.item()

  average_train_loss = running_loss / eval_every
  average_valid_loss = valid_running_loss / eval_every

  return average_train_loss, average_valid_loss

#### **모델 저장을 위한 함수 정의**

In [19]:
def save_checkpoint(save_path, model, optimizer, valid_loss):
    state_dict = {'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'valid_loss': valid_loss}

    torch.save(state_dict, save_path)


def load_checkpoint(load_path, model, optimizer, device):
    state_dict = torch.load(load_path, map_location=device)

    model.load_state_dict(state_dict['model_state_dict'])
    optimizer.load_state_dict(state_dict['optimizer_state_dict'])

    return state_dict['valid_loss']


def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):
    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}

    torch.save(state_dict, save_path)


def load_metrics(load_path, device):
    state_dict = torch.load(load_path, map_location=device)

    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']

#### **학습**

In [20]:
# LSTM 클래스의 인스턴스 생성
model = LSTMClassifier(vocab_size).to(device)

# optimizer 생성
optim = torch.optim.Adam(model.parameters(), lr = 0.001)

In [21]:
train(model, device, optim, train_loader, valid_loader, output_file_path, 4)



Epoch 1, Step 10, train_loss : 0.8570, valid_loss :1.6047
Epoch 1, Step 20, train_loss : 0.7993, valid_loss :1.5476
Epoch 1, Step 30, train_loss : 0.9827, valid_loss :1.3017
Epoch 1, Step 40, train_loss : 0.9044, valid_loss :1.3117
Epoch 1, Step 50, train_loss : 0.9006, valid_loss :1.3090
Epoch 1, Step 60, train_loss : 0.9551, valid_loss :1.2321
Epoch 2, Step 70, train_loss : 0.6485, valid_loss :1.1660
Epoch 2, Step 80, train_loss : 0.5493, valid_loss :1.2221
Epoch 2, Step 90, train_loss : 0.6844, valid_loss :1.1866
Epoch 2, Step 100, train_loss : 0.6831, valid_loss :1.0552
Epoch 2, Step 110, train_loss : 0.6923, valid_loss :1.0763
Epoch 2, Step 120, train_loss : 0.8114, valid_loss :1.1231
Epoch 3, Step 130, train_loss : 0.5964, valid_loss :0.9539
Epoch 3, Step 140, train_loss : 0.3872, valid_loss :0.9599
Epoch 3, Step 150, train_loss : 0.4356, valid_loss :0.9948
Epoch 3, Step 160, train_loss : 0.4016, valid_loss :0.8579
Epoch 3, Step 170, train_loss : 0.4602, valid_loss :0.8335
Epoch 

#### **Test**

In [22]:
def test(model, device, test_loader):
  y_pred = []
  y_true = []

  model.eval()
  with torch.no_grad():
    for ((text, text_len), labels), _ in test_loader:
        text = text.to(device)
        text_len = text_len.to(device)
        labels = labels.to(device)

        output = model(text, text_len)

        # test 모델 출력에서 가장 높은 값을 가지는 index를 구함
        # 그 index가 class 번호 
        _, max_indices = torch.max(output, 1)
        max_indices = max_indices.data.cpu().numpy().tolist()

        y_pred.extend(max_indices)
        y_true.extend(labels.tolist())

  print('Classification Report:')
  print(classification_report(y_true, y_pred, labels=[0,1,2], digits=4))

#### **결과 예측**

In [33]:
teat = TabularDataset(path='./test.csv', format='csv',
                      fields=fields, skip_header=True)

label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.long)
text_field = Field(tokenize=word_tokenize, lower=True, include_lengths=True, batch_first=True)
fields = [('text', text_field), ('labels', label_field)]

train_data, test_data = TabularDataset.splits(path="./", train='train.csv', test='test.csv',
                                  format='CSV', fields=fields, skip_header=True)

train_loader = BucketIterator(train_data, batch_size=32, sort_key=lambda x: len(x.text),
                              device=device, sort=True, sort_within_batch=True)

test_loader = BucketIterator(test_data, batch_size = 32, sort_key=lambda x : len(x.text), 
                             device = device, sort = True, sort_within_batch=True)

text_field.build_vocab(train_data, min_freq = 5)
vocab_size = len(text_field.vocab) # 1016개



In [34]:
model_path = './model/model.pt'

load_checkpoint(model_path, model, optim, device)

test(model, device, test_loader)

Classification Report:
              precision    recall  f1-score   support

           0     0.8835    0.3776    0.5291       241
           1     0.8139    0.9275    0.8670      1641
           2     0.8235    0.7304    0.7742       805

    accuracy                         0.8191      2687
   macro avg     0.8403    0.6785    0.7234      2687
weighted avg     0.8230    0.8191    0.8089      2687



