In [None]:

! pip install -q kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets list

In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
! mkdir train
! mkdir working

In [7]:
!tar -xzvf ag_news_csv.tar.gz

ag_news_csv/
ag_news_csv/train.csv
ag_news_csv/test.csv
ag_news_csv/classes.txt
ag_news_csv/readme.txt


In [5]:
!kaggle datasets download -d 'rtatman/glove-global-vectors-for-word-representation'
!unzip glove-global-vectors-for-word-representation.zip

Downloading glove-global-vectors-for-word-representation.zip to /content
 99% 455M/458M [00:15<00:00, 34.6MB/s]
100% 458M/458M [00:15<00:00, 30.1MB/s]
Archive:  glove-global-vectors-for-word-representation.zip
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.50d.txt        


data process

In [6]:
from torchtext import vocab
glove = vocab.Vectors('glove.6B.100d.txt', 'train/')

print(f'Shape of GloVe vectors is {glove.vectors.shape}')

100%|█████████▉| 399999/400000 [00:21<00:00, 18854.98it/s]


Shape of GloVe vectors is torch.Size([400000, 100])


In [9]:
import pandas as pd
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import csv
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np


class MyDataset(Dataset):

    def __init__(self, data_path, max_length_sentences=30, max_length_word=35):
        super(MyDataset, self).__init__()

        texts, labels = [], []
        with open(data_path) as csv_file:
            reader = csv.reader(csv_file, quotechar='"')
            for idx, line in enumerate(reader):
                text = ""
                for tx in line[1:]:
                    text += tx.lower()
                    text += ". "
                label = int(line[0])
                texts.append(text)
                labels.append(label)

        self.texts = texts
        self.labels = labels

        self.max_length_sentences = max_length_sentences
        self.max_length_word = max_length_word
        self.num_classes = len(set(self.labels))

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        label = self.labels[index]
        text = self.texts[index]
        document_encode = [
            [glove[word].numpy() for word in word_tokenize(text=sentences)] for sentences
            in
            sent_tokenize(text=text)]

        for sentences in document_encode:
            if len(sentences) < self.max_length_word:
                extended_words = [glove['pad'].numpy() for _ in range(self.max_length_word - len(sentences))]
                sentences.extend(extended_words)

        if len(document_encode) < self.max_length_sentences:
            extended_sentences = [[glove['pad'].numpy() for _ in range(self.max_length_word)] for _ in
                                  range(self.max_length_sentences - len(document_encode))]
            document_encode.extend(extended_sentences)

        document_encode = [sentences[:self.max_length_word] for sentences in document_encode][
                          :self.max_length_sentences]

        document_encode = np.stack(arrays=document_encode, axis=0)
        document_encode += 1

        return document_encode, label


if __name__ == '__main__':
    test = MyDataset(data_path="ag_news_csv/test.csv")
    #print (test.__getitem__(index=0)[0].shape)
    #print (test.__getitem__(index=0)[0])
    # print (test.__getitem__(index=0)[1]) # lable in(1,..,4)
    # print (test.__len__())


In [10]:
train_set = MyDataset(data_path="ag_news_csv/train.csv")


# class_weights = [0.25, 0.25, 0.25, 0.25]
# sampler = WeightedRandomSampler(weights=class_weights, num_samples=len(train_set), replacement=True)
# dataloader = DataLoader(train_set, batch_size=32, sampler=sampler)


dataloader = DataLoader(train_set, batch_size=32,shuffle=True)


val_set = MyDataset(data_path="ag_news_csv/test.csv")
val_dataloader = DataLoader(val_set, batch_size=32,shuffle=True)

# test_set = MyDataset(data_path="ag_news_csv/test.csv")
# test_dataloader = DataLoader(test_set, batch_size=32,shuffle=True)




build model

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [10]:
def get_max_lengths(data_path):
    word_length_list = []
    sent_length_list = []
    with open(data_path) as csv_file:
        reader = csv.reader(csv_file, quotechar='"')
        for idx, line in enumerate(reader):
            text = ""
            for tx in line[1:]:
                text += tx.lower()
                text += " "
            sent_list = sent_tokenize(text)
            sent_length_list.append(len(sent_list))

            for sent in sent_list:
                word_list = word_tokenize(sent)
                word_length_list.append(len(word_list))

        sorted_word_length = sorted(word_length_list)
        sorted_sent_length = sorted(sent_length_list)

    return sorted_word_length[int(0.8*len(sorted_word_length))], sorted_sent_length[int(0.8*len(sorted_sent_length))]

In [12]:
class HierAttNet(nn.Module):
    def __init__(self, word_hidden_size=50, sent_hidden_size=50, batch_size=32, num_classes=4,
                 max_sent_length=30, max_word_length=35):
        super(HierAttNet, self).__init__()
        self.batch_size = batch_size
        self.word_hidden_size = word_hidden_size
        self.sent_hidden_size = sent_hidden_size
        self.max_sent_length = max_sent_length
        self.max_word_length = max_word_length
        self.word_att_net = WordAttNet(word_hidden_size)
        self.sent_att_net = SentAttNet(sent_hidden_size)
        self._init_hidden_state()

        self.dense = torch.nn.Linear(sent_hidden_size*2,num_classes,bias=True)

    def _init_hidden_state(self, last_batch_size=None):

        if last_batch_size:
            batch_size = last_batch_size
        else:
            batch_size = self.batch_size
        self.word_hidden_state = torch.zeros(2, batch_size, self.word_hidden_size)
        self.sent_hidden_state = torch.zeros(2, batch_size, self.sent_hidden_size)
        if torch.cuda.is_available():
            self.word_hidden_state = self.word_hidden_state.cuda()
            self.sent_hidden_state = self.sent_hidden_state.cuda()

    def forward(self, input):

        output_list = []

        input = input.permute(1, 0, 2, 3)
        for i in input: # với mỗi câu i
            output, self.word_hidden_state = self.word_att_net(i.permute(1, 0, 2), self.word_hidden_state)

            output_list.append(output)

        #output = torch.cat(output_list, 0)

        output = torch.stack(output_list, dim=0)


        output, self.sent_hidden_state = self.sent_att_net(output, self.sent_hidden_state)
        #32 100

        output= self.dense(output)
        output=F.softmax(output,dim=1)
        return output

In [13]:
class SentAttNet(nn.Module):
    def __init__(self,  hidden_size=50):
        super(SentAttNet, self).__init__()

        self.sent_weight = nn.Parameter(torch.Tensor(2 * hidden_size, 10))
        self.sent_bias = nn.Parameter(torch.Tensor(10))
        self.context_weight = nn.Parameter(torch.Tensor(10, 1))


        self.gru = nn.GRU(100, hidden_size, bidirectional=True)
        self._create_weights(mean=0.0, std=0.05)

    def _create_weights(self, mean=0.0, std=0.05):

        self.sent_weight.data.normal_(mean, std)
        self.context_weight.data.normal_(mean, std)

    def forward(self, input, hidden_state):


        f_output, h_output = self.gru(input, hidden_state)  # feature output and hidden state output
        #32 30 100

        output = matrix_mul(f_output, self.sent_weight,  self.context_weight, self.sent_bias)
        output = F.softmax(output) # 30 32

        # 30 32 1 * 30 32 100
        output = element_wise_mul(output.permute(1,0,2),f_output.permute(1,0,2))

        return output, h_output


In [14]:

class WordAttNet(nn.Module):
    def __init__(self,  hidden_size=50):
        super(WordAttNet, self).__init__()

        self.word_weight = nn.Parameter(torch.Tensor(2 * hidden_size, 10))
        self.word_bias = nn.Parameter(torch.Tensor(10))
        self.context_weight = nn.Parameter(torch.Tensor(10, 1))


        self.gru = nn.GRU(100, hidden_size, bidirectional=True)
        self._create_weights(mean=0.0, std=0.05)

    def _create_weights(self, mean=0.0, std=0.05):

        self.word_weight.data.normal_(mean, std)
        self.context_weight.data.normal_(mean, std)

    def forward(self, input, hidden_state):


        f_output, h_output = self.gru(input, hidden_state)  # feature output and hidden state output
        #32 35 100

        output = matrix_mul(f_output, self.word_weight,  self.context_weight, self.word_bias)
        output = F.softmax(output) # 35 32

        # 35 32 1 * 35 32 100
        output = element_wise_mul(output.permute(1,0,2),f_output.permute(1,0,2))

        return output, h_output

def element_wise_mul(input1, input2):

    feature_list = []
    for feature_1, feature_2 in zip(input1, input2):
        #feature_1 = feature_1.squeeze(dim=2)
        feature = (feature_1 * feature_2).sum(dim=0)
        #print(feature.shape)
        feature_list.append(feature)

    output = torch.stack(feature_list, dim=0)
    return output


def matrix_mul(input, weight, context_weight,  bias=False):


    #feature = feature.unsqueeze(1)
    input = torch.matmul(input, weight)
    input = input + bias
    input = torch.tanh(input)

    input = torch.matmul(input, context_weight)

    return input

In [None]:
model = HierAttNet()
model

# Train model

In [15]:
import time
from tqdm import tqdm
def train_network(network,train_iter,optimizer,loss_fn,epoch_num,device):

    epoch_loss = 0 # loss per epoch
    epoch_acc = 0 # accuracy per epoch
    network.train() # set the model in training mode as it requires gradients calculation and updtion

    network.zero_grad() # clear all the calculated grdients from previous step


    # turn off while testing using  model.eval() and torch.no_grad() block

    for batch in tqdm(train_iter,f"Epoch: {epoch_num}"):
        batch =[t.to(device) for t in batch]
        # data will be shown to model in batches per epoch to calculate gradients per batch

        optimizer.zero_grad()
        network._init_hidden_state()
        predictions = network(batch[0])
        one_hot_batch = F.one_hot((batch[1]-1).to(torch.int64), num_classes=4).to(torch.float)
        loss = torch.nn.CrossEntropyLoss()(predictions, one_hot_batch)
        loss.backward()
        optimizer.step()

        # predictions = network(batch[0])
        # one_hot_batch = F.one_hot((batch[1]-1).to(torch.int64), num_classes=4).to(torch.float)


        # loss = loss_fn(predictions,one_hot_batch) # calculate loss on the whole batch


        # Tạo một tensor mới với giá trị 1 ở vị trí tương ứng với giá trị lớn nhất và 0 cho các vị trí khác
        pred_classes = torch.zeros_like(predictions)
        pred_classes[torch.arange(len(predictions)), torch.argmax(predictions, dim=1)] = 1

        correct_preds = (pred_classes * one_hot_batch).float()

        accuracy = correct_preds.sum()/len(correct_preds)

        # # below two are must and should be used only after calculation of Loss by optimizer
        # loss.backward() # Start Back Propagation so that model can calculate gradients based on loss
        # optimizer.step() # update the weights based on gradient corresponding to each neuron
        # optimizer.zero_grad() # clear all the calculated grdients from previous step


        epoch_loss += loss.item()  # add the loss for this batch to calculate the loss for whole epoch
        epoch_acc += accuracy.item() # .item() tend to give the exact number from the tensor of shape [1,]




        time.sleep(0.001) # for tqdm progess bar

    return epoch_loss/len(train_iter), epoch_acc/len(train_iter)

In [16]:
def evaluate_network(network,val_test_iter,optimizer,loss_fn,device):
    '''
    evaluate the network using given parameters
    args:
        network: any Neural Network object
        val_test_iter: iterator of validation/test data
        optimizer: optimizer for gradients calculation and updation
        loss_fn: appropriate loss function
    out:
        a tuple of (average_loss,average_accuracy) of floating values for the incoming dataset
    '''

    total_loss = 0  # total loss for the whole incoming data
    total_acc = 0 # total accuracy for the whole data

    network.eval() # set the model in evaluation mode to not compute gradients and reduce overhead

    with torch.no_grad(): # turn of gradients calculation

        for batch in val_test_iter:
            batch =[t.to(device) for t in batch]

            if len(batch[1]) < 32:
              network._init_hidden_state(last_batch_size=len(batch[1]))
            else:
              network._init_hidden_state()

            predictions = network(batch[0])
            one_hot_batch = F.one_hot((batch[1]-1).to(torch.int64), num_classes=4).to(torch.float)

            loss = loss_fn(predictions,one_hot_batch)

            pred_classes = torch.zeros_like(predictions)
            pred_classes[torch.arange(len(predictions)), torch.argmax(predictions, dim=1)] = 1

            correct_preds = (pred_classes * one_hot_batch).float()

            accuracy = correct_preds.sum()/len(correct_preds)

            total_loss += loss.item()
            total_acc += accuracy.item()

        return total_loss/len(val_test_iter), total_acc/len(val_test_iter)

# Run

In [56]:
from tkinter.constants import E


network = HierAttNet()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

lr = 3e-4
optimizer = torch.optim.Adam(network.parameters(),lr=lr)
loss_fn = torch.nn.CrossEntropyLoss()


network.to(device)

# optimizer and losses remains the same

EPOCH = 3
for epoch in range(EPOCH):
    train_loss, train_acc = train_network(network,dataloader,optimizer,loss_fn,epoch+1,device)
    val_loss,val_acc = evaluate_network(network,val_dataloader,optimizer,loss_fn,device)
    tqdm.write(f'''End of Epoch: {epoch+1}  |  Train Loss: {train_loss:.3f}  |  Val Loss: {val_loss:.3f}  |  Train Acc: {train_acc*100:.2f}%  |  Val Acc: {val_acc*100:.2f}%''')


torch.save(network.state_dict(), 'model.pth')


  output = F.softmax(output) # 35 32
Epoch: 1: 100%|██████████| 3750/3750 [22:44<00:00,  2.75it/s]


16
End of Epoch: 1  |  Train Loss: 0.951  |  Val Loss: 0.869  |  Train Acc: 78.60%  |  Val Acc: 87.45%


Epoch: 2: 100%|██████████| 3750/3750 [22:49<00:00,  2.74it/s]


16
End of Epoch: 2  |  Train Loss: 0.867  |  Val Loss: 0.857  |  Train Acc: 87.44%  |  Val Acc: 88.50%


Epoch: 3: 100%|██████████| 3750/3750 [23:06<00:00,  2.71it/s]


16
End of Epoch: 3  |  Train Loss: 0.856  |  Val Loss: 0.849  |  Train Acc: 88.62%  |  Val Acc: 89.43%


In [None]:
model = HierAttNet()
model.load_state_dict(torch.load('model.pth'))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
network.to(device)

val_loss,val_acc = evaluate_network(model,val_dataloader,optimizer,loss_fn,device)
tqdm.write(f'''End:  |  Test Loss: {val_loss:.3f}  |   Test Acc: {val_acc*100:.2f}%''')
