In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from itertools import chain
from scipy.stats import pearsonr
from sklearn.utils import shuffle
from time import time

In [None]:
# Seed PyTorch's global random number generator so torch-side randomness
# (weight initialisation, DataLoader shuffling, dropout masks) repeats across runs.
# NOTE(review): this does not seed NumPy, which sklearn.utils.shuffle falls back to.
torch.manual_seed(1)

In [None]:
# Load the labelled train/test tables and the stop-word list.
# The training frame is shuffled because a later cell takes the last 20% of it
# as the dev set. A fixed random_state keeps that split identical between runs;
# without it sklearn draws from NumPy's unseeded global RNG and the split
# (and therefore every reported dev accuracy) changes on each kernel restart.
train_df = shuffle(pd.read_csv('./data/data_train.csv'), random_state=1)
test_df = pd.read_csv('./data/data_test.csv')
with open('./data/stopwords.txt', encoding='utf8') as f:
    # One stop word per line; surrounding whitespace/newlines are stripped.
    stop_words = {line.strip() for line in f}

In [None]:
# Whitespace-tokenise every document; labels are taken from the 'MaxLabel' column.
train_corpus = [text.split() for text in train_df['Text'].tolist()]
train_labels = train_df['MaxLabel'].tolist()
test_corpus = [text.split() for text in test_df['Text'].tolist()]
test_labels = test_df['MaxLabel'].tolist()
# Columns 3..10 of the test table (eight values per row), kept as plain lists.
# They are later correlated against the model's eight output probabilities.
test_labels_distribution = test_df.iloc[:, 3:11].to_numpy().tolist()


In [None]:
# Drop every token that appears in the stop-word set, keeping token order.
tmp_train_corpus = [
    [token for token in sentence if token not in stop_words]
    for sentence in train_corpus
]
tmp_test_corpus = [
    [token for token in sentence if token not in stop_words]
    for sentence in test_corpus
]
# From here on the corpora refer to the filtered sentences.
train_corpus = tmp_train_corpus
test_corpus = tmp_test_corpus

In [None]:
# Length (in tokens) of the longest training sentence after stop-word removal.
train_corpus_max_length = np.max([len(sentence) for sentence in train_corpus])
# Right-pad every sentence in place with empty-string tokens so that all
# sentences end up with exactly that length.
for sentence in train_corpus:
    missing = train_corpus_max_length - len(sentence)
    sentence.extend('' for _ in range(missing))
train_sentence_num = len(train_corpus)
train_sentence_length = len(train_corpus[0])
print('train sentence count:', train_sentence_num)
print('train sentence length:', train_sentence_length)

In [None]:
# Load the pre-built token -> integer id mapping (a pickled dict stored in a 0-d array).
# SECURITY: allow_pickle=True executes arbitrary pickle payloads; only load this
# file from a trusted source.
word_to_ix = np.load('./data/word_to_ix.npy', allow_pickle=True).item()
# Number of distinct tokens in the padded training corpus (the '' pad token
# counts as one if any sentence was padded).
vocab_size = len({token for sentence in train_corpus for token in sentence})

In [None]:
# Convert each padded training sentence into a row of vocabulary ids.
# The rows are collected in a Python list and converted to a tensor once;
# growing the tensor with torch.cat inside the loop re-copies every previous
# row on each iteration (quadratic in the number of sentences).
train_rows = [[word_to_ix[w] for w in s] for s in train_corpus]
# reshape keeps the (num_sentences, train_sentence_length) shape even when
# there are no rows at all.
train_index = torch.tensor(train_rows, dtype=torch.long).reshape(len(train_rows), train_sentence_length)
# Keep at most the first 1000 token positions of every sentence.
train_index = train_index[:, 0:1000]
train_index.size()

In [None]:
# Length (in tokens) of the longest test sentence after stop-word removal.
test_corpus_max_length = np.max([len(sentence) for sentence in test_corpus])
# Right-pad every test sentence in place with empty-string tokens up to that length.
for sentence in test_corpus:
    missing = test_corpus_max_length - len(sentence)
    sentence.extend('' for _ in range(missing))
test_sentence_num = len(test_corpus)
test_sentence_length = len(test_corpus[0])
print('test sentence count:', test_sentence_num)
print('test sentence length:', test_sentence_length)

In [None]:
# Convert each padded test sentence into a row of vocabulary ids.
# Tokens missing from word_to_ix are dropped, and one pad id (the id of '')
# is appended at the end of the row for each dropped token, so every row keeps
# the full sentence length.
# Rows are gathered in a list and converted once instead of calling torch.cat
# per sentence, which re-copies the whole tensor every iteration.
test_rows = []
for s in test_corpus:
    ids_list = [word_to_ix[w] for w in s if w in word_to_ix]
    not_found_num = len(s) - len(ids_list)
    if not_found_num:
        # Looked up only when needed, so a vocabulary without '' fails only
        # in the same situations as before.
        ids_list.extend([word_to_ix['']] * not_found_num)
    test_rows.append(ids_list)
# reshape keeps the (num_sentences, test_sentence_length) shape even when
# there are no rows at all.
test_index = torch.tensor(test_rows, dtype=torch.long).reshape(len(test_rows), test_sentence_length)
# Keep at most the first 1000 token positions of every sentence.
test_index = test_index[:, 0:1000]
test_index.size()

In [None]:
import torch.utils.data

# The first 80% of the training rows are used for training and the remaining
# 20% become the dev set; the test set is used as loaded.
train_length = len(train_labels)
mid = int(train_length * 0.8)
train_set = torch.utils.data.TensorDataset(
    train_index[:mid],
    torch.tensor(train_labels[:mid]).long(),
)
dev_set = torch.utils.data.TensorDataset(
    train_index[mid:train_length],
    torch.tensor(train_labels[mid:train_length]).long(),
)
test_set = torch.utils.data.TensorDataset(
    test_index,
    torch.tensor(test_labels).long(),
)


In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# device = torch.device("cpu")
print(device)

In [None]:
import torch.nn as nn
class Net(nn.Module):
    """LSTM classifier: embedding -> LSTM -> linear layer over 8 classes.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: width of each token embedding (LSTM input size).
        hidden_dim: size of the LSTM hidden state.

    ``forward`` takes a batch of token-id sequences (batch first) and returns
    a ``(batch, 8)`` tensor of softmax probabilities, not raw scores.
    """
    # Not read anywhere inside this class.
    sentence_length = -1

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 8)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        token_vectors = self.embeddings(x)
        hidden_states, _ = self.lstm(token_vectors)
        # Only the hidden state at the last time step feeds the classifier.
        last_state = hidden_states[:, -1, :].squeeze(dim=1)
        # NOTE(review): dropout is applied to the class scores themselves,
        # after the linear layer — confirm this is intended.
        scores = self.dropout(self.fc(last_state))
        return nn.functional.softmax(scores, dim=1)



In [None]:
def test_accuracy(loader, net, length):
    '''
    Return the fraction of samples in `loader` that `net` classifies correctly.

    The network is switched to evaluation mode and left in that mode; no
    gradients are recorded. `length` is accepted but not used.
    '''
    n_correct = 0
    n_seen = 0
    net.train(False)
    with torch.no_grad():
        for batch in tqdm(loader):
            inputs = batch[0].to(device)
            labels = batch[1].to(device)
            # The index of the largest output per row is the predicted class.
            predicted = torch.max(net(inputs), 1)[1]
            n_seen += labels.size(0)  # number of samples in this batch
            n_correct += (predicted == labels).sum().item()
    return n_correct / float(n_seen)

In [None]:
# Hyper-parameters used by the model, data loaders and training loop below.
# Embedding width handed to Net. The embedding layer is later replaced by
# pretrained vectors — presumably those are 300-wide as well; verify.
embedding_dim = 300
# Learning rate of the SGD optimizer.
lr = 0.01
# Mini-batch size of the train and dev loaders.
batch_size = 4
# Number of most recent dev accuracies inspected by check_dev_patience.
dev_patience = 25
# Size of the LSTM hidden state.
hidden_dim = 64

In [None]:
# Batching: only the training loader reshuffles; the test loader yields single samples.
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
dev_loader = torch.utils.data.DataLoader(dev_set, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=1)

# Build the model, then replace its randomly initialised embedding table with
# the pretrained vectors stored on disk.
# NOTE(review): Embedding.from_pretrained freezes the weights unless
# freeze=False is passed, per the PyTorch documentation.
net = Net(vocab_size, embedding_dim, hidden_dim)
net.embeddings = nn.Embedding.from_pretrained(torch.load('./data/word_vec.pt'))
net.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=lr, weight_decay=0.01)
# Where the best checkpoint (by dev accuracy) is written during training.
model_path = './model/RNN_EMBED_cuda.pth'

In [None]:
# Print the model's layer structure as a quick sanity check.
print(net)

In [None]:
def check_dev_patience(acc_list, patience = 5):
    """Decide from recent dev accuracies whether training should go on.

    Only the last `patience` entries of `acc_list` are inspected.

    Returns:
        True  -> keep training (also whenever there are at most `patience`
                 entries so far);
        False -> stop, because the window contains two back-to-back drops of
                 at least 0.01 each, or its standard deviation is below 1e-4.
    """
    history_len = len(acc_list)
    if history_len <= patience:
        return True
    window = acc_list[history_len - patience:history_len]
    # Slide over consecutive triples and look for two successive drops.
    falling = any(
        first - second >= 0.01 and second - third >= 0.01
        for first, second, third in zip(window, window[1:], window[2:])
    )
    if falling:
        return False
    # A flat window means the accuracy has stopped moving.
    return not (np.std(window) < 1e-4)

In [None]:
# Baseline: test-set accuracy of the model before any training step has run.
# (The third argument is not used by test_accuracy.)
print('\nbefore training test acc:', test_accuracy(test_loader, net, test_sentence_length))

In [None]:
# Train with mini-batch SGD, evaluate on the dev set after every epoch, keep
# the checkpoint with the best dev accuracy, and stop early when
# check_dev_patience says the dev accuracy is falling or flat.
epochs = 500
loss_batch = 25  # print the running loss every `loss_batch` mini-batches
dev_acc_list = [0] # initialize with a zero, easy to compare before first dev acc comes in
loss_list = []
epoch_list = []
trained_epoch_num = 0
start_time = time()
print('Start Training, lr=%f, bs=%d, embedding dim=%d from pretrained'%(lr, batch_size, embedding_dim))
for epoch in range(epochs):  # loop over the dataset multiple times
    try:
        running_loss = 0.0
        epoch_loss = 0.0
        cnt = 0
        # test_accuracy leaves the net in eval mode, so switch back to
        # training mode at the start of every epoch.
        net.train(True)
        for i, data in enumerate(train_loader, 0):
            inputs, labels = data[0].to(device), data[1].to(device)
            optimizer.zero_grad()
            outputs = net(inputs)
            # The net returns softmax probabilities, while CrossEntropyLoss
            # expects unnormalised scores and applies log-softmax itself.
            # Passing log(p) makes that internal softmax reproduce p, so the
            # loss is the true negative log-likelihood rather than a
            # "softmax of a softmax". The clamp avoids log(0).
            loss = criterion(torch.log(outputs.clamp_min(1e-12)), labels)
            loss.backward()
            optimizer.step()
            batch_loss = loss.item()
            epoch_loss += batch_loss
            running_loss += batch_loss
            cnt += 1
            if i % loss_batch == loss_batch - 1:
                print('[%d, %5d] loss: %.3f' %
                    (epoch + 1, i + 1, running_loss / loss_batch))
                running_loss = 0.0

        # The epoch has been fully trained at this point, so count it before
        # any early-stop break (previously the final epoch was never counted).
        trained_epoch_num += 1
        mean_epoch_loss = epoch_loss / max(cnt, 1)  # guard against an empty loader

        dev_accuracy = test_accuracy(dev_loader, net, train_sentence_length)
        print('epoch %d loss: %.4f, dev acc = %.3f%%' % (epoch + 1, mean_epoch_loss, dev_accuracy * 100))
        if dev_accuracy > max(dev_acc_list):
            print('New Model Saved!')
            torch.save(net.state_dict(), model_path)
        dev_acc_list.append(dev_accuracy)
        loss_list.append(mean_epoch_loss)
        epoch_list.append(epoch + 1)
        # Despite its name, this flag is True while training should continue
        # and False once the early-stop criterion fires.
        no_big_improve_on_dev = check_dev_patience(dev_acc_list, dev_patience)
        if not no_big_improve_on_dev:
            print('No significant improve on dev set, early stopped automatically!')
            break

    except KeyboardInterrupt:
      # Manual interruption ends training cleanly; a partial epoch is not counted.
      break
elapsed = time() - start_time
# max(..., 1) avoids a ZeroDivisionError when no epoch completed.
print('Finished Training, trained for %d epochs, total time = %.2fs, avg time per epoch = %.2fs' % (trained_epoch_num, elapsed, elapsed / max(trained_epoch_num, 1)))

In [None]:
# Final evaluation of the best checkpoint on the test set: accuracy,
# macro/micro F1 and the mean Pearson correlation between the predicted class
# probabilities and each sample's reference label distribution.
correct = 0
total = 0
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
net.load_state_dict(torch.load(model_path, map_location=device))
net.to(device)
net.train(False)  # evaluation mode: disables dropout
predicted_label = list()
groundTruth_label = list()
corr = 0.0
# batch_size=1 and shuffle=False keep sample i aligned with test_labels_distribution[i].
test_loader = torch.utils.data.DataLoader(test_set, batch_size=1, shuffle = False)

with torch.no_grad():
    for i, data in enumerate(tqdm(test_loader)):
        inputs, labels = data[0].to(device), data[1].to(device)
        outputs = net(inputs)
        groundTruth_label.extend(labels.tolist())
        _, predicted = torch.max(outputs, 1)
        predicted_label.extend(predicted.tolist())
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        corr += pearsonr(outputs[0].cpu().numpy(), test_labels_distribution[i])[0]
    # Average over the number of evaluated samples. The previous divisor,
    # test_sentence_length, is the padded token length of a sentence and
    # has nothing to do with how many correlations were summed.
    if total:
        corr /= total
print('\nAccuracy of the network: %.4f %%' % (100 * correct / total))
print('Macro F1 score: %.2f'%f1_score(groundTruth_label, predicted_label, average='macro'))
print('Micro F1 score: %.2f'%f1_score(groundTruth_label, predicted_label, average='micro'))
print('Corr:' , corr)

In [None]:
# Same F1 scores as above, shown with three decimals.
macro_f1 = f1_score(groundTruth_label, predicted_label, average='macro')
print('Macro F1 score: %.3f' % macro_f1)
micro_f1 = f1_score(groundTruth_label, predicted_label, average='micro')
print('Micro F1 score: %.3f' % micro_f1)

In [None]:
# Plot dev accuracy (left axis, red) and mean training loss (right axis) per epoch.
# The first entry of dev_acc_list is the placeholder 0, so it is skipped.
acc_list = [acc * 100 for acc in dev_acc_list[1:]]
fig, ax = plt.subplots()
ax2 = ax.twinx()
lns1 = ax.plot(epoch_list, acc_list, '-r', label='accuracy')
lns2 = ax2.plot(epoch_list, loss_list, label = 'loss')
# Merge the line handles of both axes into a single legend.
lns = lns1 + lns2
labs = [line.get_label() for line in lns]
ax.legend(lns, labs, loc=0)
ax.grid()
ax.set_xlabel('Epochs')
ax.set_ylabel('dev accuracy / %')
ax2.set_ylabel('loss')
# Accuracy axis: from 10 points below the worst value (never below 0) up to 100.
ax.set_ylim(max(min(acc_list) - 10, 0), 100)
# Loss axis: mean +/- two standard deviations.
loss_mean = np.mean(loss_list)
loss_spread = 2 * np.std(loss_list)
ax2.set_ylim(loss_mean - loss_spread, loss_mean + loss_spread)
plt.title('dev accuracy&loss epoch=%d lr=%f bs=%d maxacc=%.2f%%'%(max(epoch_list),lr,batch_size,max(acc_list)))
# plt.savefig('./drive/My Drive/SentimentalClassification/code/pics/LSTMEpoch%dlr%fbatchsize%dmaxacc%.2f.jpg' % (max(epoch_list), lr, batch_size, max(acc_list)))
plt.show()


In [None]:
# print(predicted_label)
# print(groundTruth_label)
# test_accuracy(train_loader, net, train_sentence_length)
