In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from scipy.stats import pearsonr

In [None]:
# Load the two CSV splits from the local ./data directory.
train_df = pd.read_csv(filepath_or_buffer='./data/data_train.csv')
test_df = pd.read_csv(filepath_or_buffer='./data/data_test.csv')

In [None]:
# Raw documents ('Text') and integer targets ('MaxLabel') for both splits,
# pulled out of the frames as plain Python lists.
train_corpus = train_df['Text'].tolist()
train_labels = train_df['MaxLabel'].tolist()
test_corpus = test_df['Text'].tolist()
test_labels = test_df['MaxLabel'].tolist()

# Positional columns 3..10 (8 columns) of the test frame, one list per row.
# Later compared against the 8 network outputs with Pearson correlation.
# NOTE(review): presumably the per-class label distribution - confirm
# against the CSV header.
test_labels_distribution = test_df.iloc[:, 3:11].to_numpy().tolist()

In [None]:
# Stop-word list: one entry per line, surrounding whitespace stripped.
with open('./data/stopwords.txt', encoding='utf8') as f:
    stop_words = [line.strip() for line in f]

In [None]:
# Fit the bag-of-words vocabulary on the training corpus only: at most 3000
# terms are kept and the custom stop words are dropped.
train_vectorizer = CountVectorizer(max_features=3000, stop_words=stop_words)
train_X = train_vectorizer.fit_transform(train_corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on old releases. The
# result is converted to a plain list so it can be passed back as a
# `vocabulary=` argument and measured with len() exactly as before.
if hasattr(train_vectorizer, 'get_feature_names_out'):
    train_feature_words = train_vectorizer.get_feature_names_out().tolist()
else:
    train_feature_words = train_vectorizer.get_feature_names()

# Dense (n_documents, n_terms) count matrix for the training corpus.
train_feature_array = train_X.toarray()
len(train_feature_words)

In [None]:
# Vectorize the test corpus with the vocabulary taken from the training
# vectorizer, so test feature columns line up with the training columns.
test_vectorizer = CountVectorizer(vocabulary=train_feature_words)
# NOTE(review): with a fixed `vocabulary=` the fit step presumably learns
# nothing from the test data and only counts terms - confirm against the
# scikit-learn docs.
test_X = test_vectorizer.fit_transform(test_corpus)
# Dense count matrix for the test corpus.
test_feature_array = test_X.toarray()

In [None]:
def normalize_input(feature_array):
    """Scale every row by its own maximum so the largest entry becomes 1.

    Args:
        feature_array: 2-D array-like (nested list, ndarray, ...) of numeric
            features, one sample per row.

    Returns:
        list[list[float]]: the row-normalised features as nested Python
        lists. Rows whose maximum is 0 (for count features: a document with
        no in-vocabulary term) are returned unchanged instead of becoming
        NaN through a 0/0 division.
    """
    feature_tensor = torch.as_tensor(feature_array, dtype=torch.float32)
    # keepdim=True keeps a (rows, 1) shape so the division broadcasts per row.
    max_tensor = feature_tensor.max(dim=1, keepdim=True)[0]
    # Replace a zero maximum by 1: dividing by 1 leaves the row untouched.
    safe_max = torch.where(max_tensor == 0, torch.ones_like(max_tensor), max_tensor)
    return torch.div(feature_tensor, safe_max).tolist()

In [None]:
# train_feature_array = normalize_input(train_feature_array)
# test_feature_array = normalize_input(test_feature_array)

In [None]:
import torch.utils.data

# Hold out the last 20% of the labelled training rows as a dev set; the
# split is positional (no shuffling happens here).
train_length = len(train_labels)
mid = int(0.8 * train_length)

# Features become float tensors, labels become int64 class ids.
train_set = torch.utils.data.TensorDataset(
    torch.tensor(train_feature_array[0:mid]).float(),
    torch.tensor(train_labels[0:mid]).long(),
)
dev_set = torch.utils.data.TensorDataset(
    torch.tensor(train_feature_array[mid:train_length]).float(),
    torch.tensor(train_labels[mid:train_length]).long(),
)
test_set = torch.utils.data.TensorDataset(
    torch.tensor(test_feature_array).float(),
    torch.tensor(test_labels).long(),
)


In [None]:
# Mini-batch size shared by the three loaders.
batch_size = 4

# Only the training loader is shuffled; dev and test use the default
# (sequential) order.
train_loader = torch.utils.data.DataLoader(
    dataset=train_set, batch_size=batch_size, shuffle=True
)
dev_loader = torch.utils.data.DataLoader(dataset=dev_set, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=batch_size)

In [None]:
class Net(nn.Module):
    """Single linear layer followed by softmax over bag-of-words features.

    Args:
        in_features: width of the input vector. When omitted it defaults to
            ``len(train_feature_words)``, the notebook-level vocabulary, so
            ``Net()`` behaves exactly as before.
        num_classes: number of output classes (default 8).
    """

    def __init__(self, in_features=None, num_classes=8):
        super(Net, self).__init__()
        if in_features is None:
            # Looked up at construction time from the notebook namespace.
            in_features = len(train_feature_words)
        # Attribute name `fc1` is part of the saved state_dict keys.
        self.fc1 = nn.Linear(in_features, num_classes)

    def forward(self, x):
        """Return class probabilities for a 2-D batch ``x``.

        The result is softmax-normalised along dim 1 (each row sums to 1);
        these are probabilities, not raw logits.
        """
        logits = self.fc1(x)
        return F.softmax(logits, dim=1)

In [None]:
def test_accuracy(loader, net):
    '''
    used in dev set accuracy calculating
    '''
    correct = 0
    total = 0
    with torch.no_grad():
        for data in loader:
            inputs, label = data
            outputs = net(inputs)
            _, predicted = torch.max(outputs, 1)
            total += label.size(0) # batch size

            correct += (predicted == label).sum().item()
    return correct / (0.0 + total)

In [None]:
# Freshly initialised classifier.
net = Net()

# Optimisation setup: plain SGD with a fixed learning rate and a
# cross-entropy criterion.
lr = 0.005
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=net.parameters(), lr=lr)

# Checkpoint file written by the training cell and reloaded for evaluation.
model_path = './model/MLP_model_BOW.pth'

In [None]:
def check_dev_patience(acc_list, patience = 5):
    """Decide from recent dev accuracies whether training should go on.

    Only the last ``patience`` entries of ``acc_list`` are inspected.

    Returns:
        True  - keep training. Also returned while the history holds at
                most ``patience`` entries.
        False - stop: either some consecutive pair inside the window
                dropped by at least 0.01, or the window is flat (standard
                deviation below 1e-4).
    """
    history_len = len(acc_list)
    if history_len <= patience:
        return True

    window = acc_list[history_len - patience:history_len]
    # Any step-to-step drop of >= 0.01 (accuracies are fractions here).
    dropped = any(prev - cur >= 0.01 for prev, cur in zip(window, window[1:]))
    if dropped or np.std(window) < 1e-4:
        return False
    return True

In [None]:
# Baseline: test-set accuracy of `net` as it stands before the training cell
# below has run.
print('before test', test_accuracy(test_loader, net))

In [None]:
# Training configuration and per-epoch history.
epochs = 100
loss_batch = 50  # print the running loss every `loss_batch` mini-batches
dev_acc_list = [0] # initialize with a zero, easy to compare before first dev acc comes in
loss_list = []
epoch_list = []
dev_patience = 5
print('Start Training')
for epoch in range(epochs):  # loop over the dataset multiple times
    # Bound before the try block so the epoch bookkeeping below always sees
    # them, even if the epoch is interrupted on its very first batch.
    running_loss = 0.0
    epoch_loss = 0.0
    stop_requested = False
    try:
        for i, data in enumerate(train_loader, 0):
            inputs, labels = data
            optimizer.zero_grad()
            outputs = net(inputs)
            # `net` already returns softmax probabilities, while
            # CrossEntropyLoss applies log-softmax to its input itself.
            # Feeding log-probabilities makes that inner log-softmax a
            # no-op (the probabilities sum to 1), so the loss is the true
            # cross-entropy instead of a softmax-of-softmax. The clamp
            # guards against log(0).
            loss = criterion(torch.log(torch.clamp(outputs, min=1e-12)), labels)
            loss.backward()
            optimizer.step()
            batch_loss = loss.item()
            epoch_loss += batch_loss
            running_loss += batch_loss
            if i % loss_batch == loss_batch - 1:
                print('[%d, %5d] loss: %.3f' %
                    (epoch + 1, i + 1, running_loss / loss_batch))
                running_loss = 0.0
    except KeyboardInterrupt:
        # Ctrl-C / kernel interrupt: let the user choose between aborting
        # the run and carrying on with the next epoch.
        exit_here = input('Early stop manually? (y/n) (default n)')
        if exit_here.lower().startswith('y'):
            print('Early stopped manually!')
            stop_requested = True

    # End-of-epoch bookkeeping. Deliberately NOT inside a `finally` block:
    # a `break` executed in `finally` silently discards any in-flight
    # exception, which would hide real training errors.
    dev_accuracy = test_accuracy(dev_loader, net)
    print('epoch %d dev acc = %.2f' % (epoch + 1, dev_accuracy * 100))
    # Checkpoint only when this epoch beats the best accuracy seen so far,
    # so a later, worse epoch can never overwrite the best model.
    if dev_accuracy > max(dev_acc_list):
        print('New Model Saved!')
        torch.save(net.state_dict(), model_path)

    dev_acc_list.append(dev_accuracy)
    # NOTE(review): epoch_loss sums per-batch mean losses, while train_length
    # counts every labelled sample (train + dev), so this is a scaled value
    # rather than a true mean. Kept as is because the plotting cell's axis
    # limits assume this scale.
    loss_list.append(epoch_loss/train_length)
    epoch_list.append(epoch + 1)

    if stop_requested:
        break
    # Despite its name, this flag is True while training should continue
    # (see check_dev_patience) and False when it should stop.
    no_big_improve_on_dev = check_dev_patience(dev_acc_list, dev_patience)
    if not no_big_improve_on_dev:
        print('No significant improve on dev set, early stopped automatically!')
        break
print('Finished Training')

In [None]:
# Reload the checkpoint written during training and score it on the test set.
net.load_state_dict(torch.load(model_path))

correct = 0
total = 0
predicted_label = []
groundTruth_label = []
with torch.no_grad():
    for inputs, labels in tqdm(test_loader):
        outputs = net(inputs)
        # Predicted class = index of the largest output in each row.
        _, predicted = torch.max(outputs.data, 1)
        groundTruth_label.extend(labels.tolist())
        predicted_label.extend(predicted.tolist())
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('\nAccuracy of the network: %.4f %%' % (100 * correct / total))
macro_f1 = f1_score(groundTruth_label, predicted_label, average='macro')
micro_f1 = f1_score(groundTruth_label, predicted_label, average='micro')
print('Macro F1 score: %.3f'%macro_f1)
print('Micro F1 score: %.3f'%micro_f1)

In [None]:
# Re-wrap the test set one sample per batch, unshuffled, so batch index i
# lines up with row i of test_labels_distribution.
test_loader = torch.utils.data.DataLoader(dataset=test_set, shuffle=False, batch_size=1)

# Inference mode for the module; nothing in the loop switches it back.
net.train(False)
corr = 0.0
with torch.no_grad():
    for i, (inputs, labels) in enumerate(tqdm(test_loader)):
        outputs = net(inputs)
        # Pearson r between the network's output row and the reference row
        # for the same sample; [0] picks the coefficient, not the p-value.
        sample_corr = pearsonr(outputs[0].cpu(), test_labels_distribution[i])[0]
        corr += sample_corr
    # Average over the number of test samples.
    corr /= len(test_feature_array)
print('\nCorr:', corr)

In [None]:
# Dev accuracy in percent; the leading 0 placeholder of dev_acc_list is skipped.
acc_list = [acc * 100 for acc in dev_acc_list[1:]]

# Left y-axis: dev accuracy. Right y-axis (twin): training loss.
fig, ax = plt.subplots()
acc_lines = ax.plot(epoch_list, acc_list, '-r', label='accuracy')
ax2 = ax.twinx()
loss_lines = ax2.plot(epoch_list, loss_list, label = 'loss')

# One combined legend for the lines of both axes.
handles = acc_lines + loss_lines
ax.legend(handles, [handle.get_label() for handle in handles], loc=0)
ax.grid()
ax.set_xlabel('Epochs')
ax.set_ylabel('dev accuracy / %')
ax2.set_ylabel('loss')

ax.set_ylim(max(min(acc_list) - 10, 0), 100)
loss_mean = np.mean(loss_list)
loss_std = np.std(loss_list)
# NOTE(review): these mean +/- 3*std limits are superseded by the next
# set_ylim call on the same axis.
ax2.set_ylim(loss_mean - 3 * loss_std, loss_mean + 3 * loss_std)
# Effective limits: observed range padded by one std, clipped to [0, 1].
ax2.set_ylim(max(min(loss_list) - 1 * loss_std, 0), min(max(loss_list) + 1 * loss_std, 1))
plt.title('dev accuracy&loss epoch=%d lr=%f bs=%d maxacc=%.2f%%'%(max(epoch_list),lr,batch_size,max(acc_list)))
# plt.savefig('DevEpoch%dlr%fbatchsize%dmaxacc%.2f.jpg' % (max(epoch_list), lr, batch_size, max(acc_list)))
plt.show()


In [None]:
# print(predicted_label)

In [None]:
# print(groundTruth_label)