In [1]:

! pip install -q kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets list

ref                                                         title                                           size  lastUpdated          downloadCount  voteCount  usabilityRating  
----------------------------------------------------------  ---------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
sudarshan24byte/online-food-dataset                         Online Food Dataset                              3KB  2024-03-02 18:50:30          22780        454  0.9411765        
alistairking/electricity-prices                             U.S. Electricity Prices                          1MB  2024-04-07 19:18:37            688         24  1.0              
fatemehmehrparvar/obesity-levels                            Obesity Levels                                  58KB  2024-04-07 16:28:30            790         24  0.88235295       
sukhmandeepsinghbrar/housing-price-dataset                  Housing Price Dataset                        

In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
!tar -xzvf ag_news_csv.tar.gz

ag_news_csv/
ag_news_csv/train.csv
ag_news_csv/test.csv
ag_news_csv/classes.txt
ag_news_csv/readme.txt


In [3]:
!kaggle datasets download -d 'rtatman/glove-global-vectors-for-word-representation'
!unzip glove-global-vectors-for-word-representation.zip

Downloading glove-global-vectors-for-word-representation.zip to /content
100% 458M/458M [00:18<00:00, 31.4MB/s]
100% 458M/458M [00:18<00:00, 26.4MB/s]
Archive:  glove-global-vectors-for-word-representation.zip
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.50d.txt        


data process

In [4]:
from torchtext import vocab
glove = vocab.Vectors('glove.6B.100d.txt', 'train/')

print(f'Shape of GloVe vectors is {glove.vectors.shape}')

100%|█████████▉| 399999/400000 [00:22<00:00, 17718.28it/s]


Shape of GloVe vectors is torch.Size([400000, 100])


In [None]:
print(glove['a'].numpy())

In [6]:
import pandas as pd
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import csv
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
import torch.nn.functional as F



class MyDataset(Dataset):

    def __init__(self, data_path, max_length_sentences=30, max_length_word=35, transform=None, target_transform=None):
        super(MyDataset, self).__init__()

        texts, labels = [], []
        with open(data_path) as csv_file:
            reader = csv.reader(csv_file, quotechar='"')
            for idx, line in enumerate(reader):
                text = ""
                for tx in line[1:]:
                    text += tx.lower()
                    text += ". "
                label = int(line[0])
                texts.append(text)
                labels.append(label)
        self.transform = transform
        self.target_transform = target_transform
        self.texts = texts
        self.labels = labels

        self.max_length_sentences = max_length_sentences
        self.max_length_word = max_length_word
        self.num_classes = len(set(self.labels))

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        label = self.labels[index]
        text = self.texts[index]
        document_encode = [
            [glove[word].numpy() for word in word_tokenize(text=sentences)] for sentences
            in
            sent_tokenize(text=text)]

        for sentences in document_encode:
            if len(sentences) < self.max_length_word:
                extended_words = [glove['<pad>'].numpy() for _ in range(self.max_length_word - len(sentences))]
                sentences.extend(extended_words)

        if len(document_encode) < self.max_length_sentences:
            extended_sentences = [[glove['<pad>'].numpy() for _ in range(self.max_length_word)] for _ in
                                  range(self.max_length_sentences - len(document_encode))]
            document_encode.extend(extended_sentences)

        document_encode = [sentences[:self.max_length_word] for sentences in document_encode][
                          :self.max_length_sentences]

        document_encode = np.stack(arrays=document_encode, axis=0)

        return torch.from_numpy(document_encode), F.one_hot(torch.tensor(label-1), num_classes=4).to(torch.float)


if __name__ == '__main__':
    test = MyDataset(data_path="ag_news_csv/test.csv")
    print (test.__getitem__(index=0)[1])



tensor([0., 0., 1., 0.])


In [None]:
print(f"eminem is represented by the index location at: {glove.stoi['pad']} and has the following vector values: \n {glove['eminem']}")
print(glove['<pad>'])

eminem is represented by the index location at: 10109 and has the following vector values: 
 tensor([ 0.7544, -0.0373,  0.9011, -0.2475,  0.6046,  0.2633,  0.4321, -0.0581,
         0.2454,  0.4725,  0.4708,  0.6857, -0.1124, -0.5863,  0.6069,  0.2199,
         0.3303, -0.4111,  0.4995,  0.5576,  0.5199, -0.7013,  0.1960,  0.0222,
         0.1784,  1.2870,  0.0808, -0.0457,  0.6343, -0.1123,  0.5205,  0.5357,
         0.6573,  1.4612, -0.5139,  0.1027,  0.3214,  0.3201, -0.2493, -0.1228,
         0.7139,  0.5747, -0.0808,  0.0766, -0.8888, -0.6424, -0.3580, -0.1718,
         0.0162, -0.4686, -0.1671, -0.2047,  0.3036,  0.0647, -0.4121, -0.6413,
         0.0488, -0.1874, -1.0129,  0.3502, -0.0286,  0.8700, -0.1154,  0.0711,
         0.8362,  0.0659,  1.2272,  0.4069,  0.0153,  0.0362, -0.3664,  0.8553,
        -0.8979,  0.5335, -0.2803, -0.4116,  0.1657, -0.0971,  0.8507, -1.0010,
         0.6556, -0.2359,  0.1473,  0.4138, -0.9671,  0.0302, -0.4406, -0.0727,
         0.1327,  0.0797, -

In [7]:


train_set = MyDataset(data_path="ag_news_csv/train.csv")


# class_weights = [0.25, 0.25, 0.25, 0.25]
# sampler = WeightedRandomSampler(weights=class_weights, num_samples=len(train_set), replacement=True)
# dataloader = DataLoader(train_set, batch_size=32, sampler=sampler)


dataloader = DataLoader(train_set, batch_size=32,shuffle=True)


val_set = MyDataset(data_path="ag_news_csv/test.csv")
val_dataloader = DataLoader(val_set, batch_size=32,shuffle=True)

# test_set = MyDataset(data_path="ag_news_csv/test.csv")
# test_dataloader = DataLoader(test_set, batch_size=32,shuffle=True)




build model

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [21]:
!pip install seqeval==0.0.12

Collecting seqeval==0.0.12
  Downloading seqeval-0.0.12.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-0.0.12-py3-none-any.whl size=7415 sha256=d1be1a71b79ec6ca22fe573f2fabe5508379f082c6b0d1e107ed1d7f1824a50c
  Stored in directory: /root/.cache/pip/wheels/6c/6c/fc/7076d687ba54f32c7be7eaaded97df359ef3c8fee08a2d4efc
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-0.0.12


In [33]:
from seqeval.metrics import precision_score, recall_score, f1_score

def get_acc(network,val_test_iter,device):
    network.eval() # set the model in evaluation mode to not compute gradients and reduce overhead
    with torch.no_grad(): # turn of gradients calculation
        preds =[]
        labels=[]
        for batch in val_test_iter:
            batch =[t.to(device) for t in batch]

            if len(batch[1]) < 32:
              network._init_hidden_state(last_batch_size=len(batch[1]))
            else:
              network._init_hidden_state()
            pred = network(batch[0])
            preds.append(pred)
            labels.append(batch[1])

        preds = torch.cat(preds, dim=0)
        labels = torch.cat(labels, dim=0)
        assert len(preds) == len(labels)
        return preds, labels
        return {
            "acc_precision": precision_score(labels, preds),
            "acc_recall": recall_score(labels, preds),
            "acc_f1": f1_score(labels, preds)
        }




def get_max_lengths(data_path):
    word_length_list = []
    sent_length_list = []
    with open(data_path) as csv_file:
        reader = csv.reader(csv_file, quotechar='"')
        for idx, line in enumerate(reader):
            text = ""
            for tx in line[1:]:
                text += tx.lower()
                text += " "
            sent_list = sent_tokenize(text)
            sent_length_list.append(len(sent_list))

            for sent in sent_list:
                word_list = word_tokenize(sent)
                word_length_list.append(len(word_list))

        sorted_word_length = sorted(word_length_list)
        sorted_sent_length = sorted(sent_length_list)

    return sorted_word_length[int(0.8*len(sorted_word_length))], sorted_sent_length[int(0.8*len(sorted_sent_length))]

In [9]:
class HierAttNet(nn.Module):
    def __init__(self, word_hidden_size=50, sent_hidden_size=50, batch_size=32, num_classes=4,
                 max_sent_length=30, max_word_length=35):
        super(HierAttNet, self).__init__()
        self.batch_size = batch_size
        self.word_hidden_size = word_hidden_size
        self.sent_hidden_size = sent_hidden_size
        self.max_sent_length = max_sent_length
        self.max_word_length = max_word_length
        self.word_att_net = WordAttNet(word_hidden_size)
        self.sent_att_net = SentAttNet(sent_hidden_size)
        self._init_hidden_state()

        self.dense = torch.nn.Linear(sent_hidden_size*2,num_classes,bias=True)

    def _init_hidden_state(self, last_batch_size=None):

        if last_batch_size:
            batch_size = last_batch_size
        else:
            batch_size = self.batch_size
        self.word_hidden_state = torch.zeros(2, batch_size, self.word_hidden_size)
        self.sent_hidden_state = torch.zeros(2, batch_size, self.sent_hidden_size)
        if torch.cuda.is_available():
            self.word_hidden_state = self.word_hidden_state.cuda()
            self.sent_hidden_state = self.sent_hidden_state.cuda()

    def forward(self, input):

        output_list = []

        input = input.permute(1, 0, 2, 3)
        for i in input: # với mỗi câu i
            output, self.word_hidden_state = self.word_att_net(i.permute(1, 0, 2), self.word_hidden_state)

            output_list.append(output)

        #output = torch.cat(output_list, 0)

        output = torch.stack(output_list, dim=0)


        output, self.sent_hidden_state = self.sent_att_net(output, self.sent_hidden_state)
        #32 100

        output= self.dense(output)
        output=F.softmax(output,dim=1)
        return output

In [10]:
class SentAttNet(nn.Module):
    def __init__(self,  hidden_size=50):
        super(SentAttNet, self).__init__()

        self.sent_weight = nn.Parameter(torch.Tensor(2 * hidden_size, 10))
        self.sent_bias = nn.Parameter(torch.Tensor(10))
        self.context_weight = nn.Parameter(torch.Tensor(10, 1))


        self.gru = nn.GRU(100, hidden_size, bidirectional=True)
        self._create_weights(mean=0.0, std=0.05)

    def _create_weights(self, mean=0.0, std=0.05):

        self.sent_weight.data.normal_(mean, std)
        self.context_weight.data.normal_(mean, std)

    def forward(self, input, hidden_state):


        f_output, h_output = self.gru(input, hidden_state)  # feature output and hidden state output
        #32 30 100

        output = matrix_mul(f_output, self.sent_weight,  self.context_weight, self.sent_bias)
        output = F.softmax(output) # 30 32

        # 30 32 1 * 30 32 100
        output = element_wise_mul(output.permute(1,0,2),f_output.permute(1,0,2))

        return output, h_output


In [11]:

class WordAttNet(nn.Module):
    def __init__(self,  hidden_size=50):
        super(WordAttNet, self).__init__()

        self.word_weight = nn.Parameter(torch.Tensor(2 * hidden_size, 10))
        self.word_bias = nn.Parameter(torch.Tensor(10))
        self.context_weight = nn.Parameter(torch.Tensor(10, 1))


        self.gru = nn.GRU(100, hidden_size, bidirectional=True)
        self._create_weights(mean=0.0, std=0.05)

    def _create_weights(self, mean=0.0, std=0.05):

        self.word_weight.data.normal_(mean, std)
        self.context_weight.data.normal_(mean, std)

    def forward(self, input, hidden_state):


        f_output, h_output = self.gru(input, hidden_state)  # feature output and hidden state output
        #32 35 100

        output = matrix_mul(f_output, self.word_weight,  self.context_weight, self.word_bias)
        output = F.softmax(output) # 35 32

        # 35 32 1 * 35 32 100
        output = element_wise_mul(output.permute(1,0,2),f_output.permute(1,0,2))

        return output, h_output

def element_wise_mul(input1, input2):

    feature_list = []
    for feature_1, feature_2 in zip(input1, input2):
        #feature_1 = feature_1.squeeze(dim=2)
        feature = (feature_1 * feature_2).sum(dim=0)
        #print(feature.shape)
        feature_list.append(feature)

    output = torch.stack(feature_list, dim=0)
    return output


def matrix_mul(input, weight, context_weight,  bias=False):


    #feature = feature.unsqueeze(1)
    input = torch.matmul(input, weight)
    input = input + bias
    input = torch.tanh(input)

    input = torch.matmul(input, context_weight)

    return input

In [12]:
model = HierAttNet()
model

HierAttNet(
  (word_att_net): WordAttNet(
    (gru): GRU(100, 50, bidirectional=True)
  )
  (sent_att_net): SentAttNet(
    (gru): GRU(100, 50, bidirectional=True)
  )
  (dense): Linear(in_features=100, out_features=4, bias=True)
)

# Train model

In [13]:
import time
from tqdm import tqdm
def train_network(network,train_iter,optimizer,loss_fn,epoch_num,device):

    epoch_loss = 0 # loss per epoch
    epoch_acc = 0 # accuracy per epoch
    network.train() # set the model in training mode as it requires gradients calculation and updtion

    network.zero_grad() # clear all the calculated grdients from previous step


    # turn off while testing using  model.eval() and torch.no_grad() block

    for batch in tqdm(train_iter,f"Epoch: {epoch_num}"):
        batch =[t.to(device) for t in batch]
        # data will be shown to model in batches per epoch to calculate gradients per batch
        if len(batch[1]) < 32:
              network._init_hidden_state(last_batch_size=len(batch[1]))
        else:
              network._init_hidden_state()
        optimizer.zero_grad()

        predictions = network(batch[0])

        loss = loss_fn(predictions, batch[1])
        loss.backward()
        optimizer.step()


        pred_classes = torch.zeros_like(predictions)
        pred_classes[torch.arange(len(predictions)), torch.argmax(predictions, dim=1)] = 1

        correct_preds = (pred_classes * batch[1]).float()

        accuracy = correct_preds.sum()/len(correct_preds)


        epoch_loss += loss.item()  # add the loss for this batch to calculate the loss for whole epoch
        epoch_acc += accuracy.item() # .item() tend to give the exact number from the tensor of shape [1,]




        time.sleep(0.001) # for tqdm progess bar

    return epoch_loss/len(train_iter), epoch_acc/len(train_iter)

In [14]:
def evaluate_network(network,val_test_iter,optimizer,loss_fn,device):
    '''
    evaluate the network using given parameters
    args:
        network: any Neural Network object
        val_test_iter: iterator of validation/test data
        optimizer: optimizer for gradients calculation and updation
        loss_fn: appropriate loss function
    out:
        a tuple of (average_loss,average_accuracy) of floating values for the incoming dataset
    '''

    total_loss = 0  # total loss for the whole incoming data
    total_acc = 0 # total accuracy for the whole data

    network.eval() # set the model in evaluation mode to not compute gradients and reduce overhead

    with torch.no_grad(): # turn of gradients calculation

        for batch in val_test_iter:
            batch =[t.to(device) for t in batch]

            if len(batch[1]) < 32:
              network._init_hidden_state(last_batch_size=len(batch[1]))
            else:
              network._init_hidden_state()

            predictions = network(batch[0])


            loss = loss_fn(predictions,batch[1])

            pred_classes = torch.zeros_like(predictions)
            pred_classes[torch.arange(len(predictions)), torch.argmax(predictions, dim=1)] = 1

            correct_preds = (pred_classes * batch[1]).float()

            accuracy = correct_preds.sum()/len(correct_preds)

            total_loss += loss.item()
            total_acc += accuracy.item()

        return total_loss/len(val_test_iter), total_acc/len(val_test_iter)

# Run

In [15]:

network = HierAttNet()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

lr = 3e-4
optimizer = torch.optim.Adam(network.parameters(),lr=lr)
loss_fn = torch.nn.CrossEntropyLoss()


network.to(device)

# optimizer and losses remains the same

EPOCH = 3
for epoch in range(EPOCH):
    train_loss, train_acc = train_network(network,dataloader,optimizer,loss_fn,epoch+1,device)
    val_loss,val_acc = evaluate_network(network,val_dataloader,optimizer,loss_fn,device)
    tqdm.write(f'''End of Epoch: {epoch+1}  |  Train Loss: {train_loss:.3f}  |  Val Loss: {val_loss:.3f}  |  Train Acc: {train_acc*100:.2f}%  |  Val Acc: {val_acc*100:.2f}%''')


torch.save(network.state_dict(), 'model.pth')


  output = F.softmax(output) # 35 32
Epoch: 1: 100%|██████████| 3750/3750 [27:21<00:00,  2.28it/s]


End of Epoch: 1  |  Train Loss: 0.897  |  Val Loss: 0.854  |  Train Acc: 85.02%  |  Val Acc: 88.97%


Epoch: 2: 100%|██████████| 3750/3750 [27:12<00:00,  2.30it/s]


End of Epoch: 2  |  Train Loss: 0.843  |  Val Loss: 0.842  |  Train Acc: 89.93%  |  Val Acc: 90.05%


Epoch: 3: 100%|██████████| 3750/3750 [27:29<00:00,  2.27it/s]


End of Epoch: 3  |  Train Loss: 0.837  |  Val Loss: 0.841  |  Train Acc: 90.51%  |  Val Acc: 90.13%


In [20]:
model = HierAttNet()
model.load_state_dict(torch.load('model.pth'))

lr = 3e-4
optimizer = torch.optim.Adam(network.parameters(),lr=lr)
loss_fn = torch.nn.CrossEntropyLoss()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


val_loss,val_acc = evaluate_network(model,val_dataloader,optimizer,loss_fn,device)
tqdm.write(f'''End:  |  Test Loss: {val_loss:.3f}  |   Test Acc: {val_acc*100:.2f}%''')


  output = F.softmax(output) # 35 32


End:  |  Test Loss: 0.841  |   Test Acc: 90.13%
