In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import json
import numpy as np
import math

In [2]:
train_data = []
train_inp_file = 'processed_data/GCDC/Clinton_train.jsonl'
with open(train_inp_file, 'r') as f:
    for line in f:
        json_obj = json.loads(line)
        # put <EOS> at the end of each sentence and add each sentence to the list
        for i in range(len(json_obj['sentences'])):
            json_obj['sentences'][i].append('<EOS>')
        # merge all sentences into one
        document = " ".join([word for sentence in json_obj['sentences'] for word in sentence])
        document = document.lower()
        label = json_obj['label']
        train_data.append((document, label))
   

test_data = []
test_inp_file = 'processed_data/GCDC/Clinton_test.jsonl'
with open(test_inp_file, 'r') as f:
    for line in f:
        json_obj = json.loads(line)
        # put <EOS> at the end of each sentence and add each sentence to the list
        for i in range(len(json_obj['sentences'])):
            json_obj['sentences'][i].append('<EOS>')
        # merge all sentences into one
        document = " ".join([word for sentence in json_obj['sentences'] for word in sentence])
        document = document.lower()
        label = json_obj['label']
        test_data.append((document, label))
        
print(len(train_data))
print(train_data[0])
print(train_data[1])
print(train_data[2])
print(len(test_data))
print(test_data[0])

# create vocabulary
vocab = set()
for document, label in train_data:
    for word in document.split():
        vocab.add(word.lower())
vocab = list(vocab)
vocab.append('<PAD>')
vocab.append('<UNK>')
print(len(vocab))
print(vocab[:10])


800
('two options the us views the transitional national council as the sole / only legitimate interlocutor of the libyan people during this interim period , as libyans come together to plan their own future and a permanent , inclusive constitutional system that protects the rights of all libyans . <eos> this is in contrast to the qadhafi regime , which has lost all legitimacy to rule . <eos> the us views the transitional national council as the legitimate interlocutor of the libyan people during this interim period , as libyans come together to plan their own future and a permanent , inclusive constitutional system that protects the rights of all libyans . <eos> this is in contrast to the qadhafi regime , which has lost all legitimacy to rule . <eos> the inc is the institution through which we are engaging the libyan people at this time . <eos>', 3)
("ambassador , we just received an email from the adoption service provider about these cases . <eos> i am currently reviewing the files 

In [3]:
# transform the documents into list of indices
def transform_doc(document, vocab):
    indices = []
    for word in document.split():
        if word.lower() in vocab:
            indices.append(vocab.index(word.lower()))
        else:
            indices.append(vocab.index('<UNK>'))
    return indices

# do it for all the sentences
train_data = [(transform_doc(document, vocab), label) for document, label in train_data]
test_data = [(transform_doc(document, vocab), label) for document, label in test_data]
# print(train_data[0])
# print(test_data[0])

# get the max length of the documents and also average length
total_len = 0
max_len = 0
count = 0
for document, label in train_data:
    max_len = max(max_len, len(document))
    total_len += len(document)
    count += 1

print(f'max doc length : {max_len}')
print(f'average doc length : {total_len/count}')

# pad the sentences to make them of same length
def pad_doc(document, max_len):
    if len(document) < max_len:
        document += [vocab.index('<PAD>')] * (max_len - len(document))
    return document

train_data = [(pad_doc(document, max_len), label) for document, label in train_data]
test_data = [(pad_doc(document, max_len), label) for document, label in test_data]
# print(train_data[0])
# print(len(train_data[0][0]))
# print(test_data[0])

# batchify the data after converting to tensors
class Batchify(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return torch.tensor(self.data[idx][0]), torch.tensor(self.data[idx][1])
    
train_data = Batchify(train_data)
test_data = Batchify(test_data)

# create dataloaders
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

max doc length : 415
average doc length : 194.9475


In [4]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
    

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(nhid, dropout)
        encoder_layers = TransformerEncoderLayer(nhid, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(vocab_size, nhid)
        self.nhid = nhid
        self.decoder = nn.Linear(nhid, 3)
        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask
        src = self.encoder(src) * math.sqrt(self.nhid)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return F.log_softmax(output, dim=-1)
        # return output
    

model = TransformerModel(len(vocab), 2, 128, 2)
model = model.to(device)
print(model)

TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.5, inplace=False)
        (linear2): Linear(in_features=128, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.5, inplace=False)
        (dropout2): Dropout(p=0.5, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        

In [5]:
# define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# train the model
def train(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.to(device)
        print(f'data shape : {data.shape}')
        target = target.to(device)
        print(f'target shape : {target.shape}')
        optimizer.zero_grad()
        output = model(data)
        print(f'output shape : {output.shape}')
        # loss = criterion(output.view(32, -1), target.view(-1))
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

# test the model
def test(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data = data.to(device)
            target = target.to(device)
            output = model(data)
            # loss = criterion(output.view(32, -1), target.view(-1))
            loss = criterion(output, target)
            total_loss += loss.item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    return total_loss / len(test_loader), correct / len(test_loader.dataset)

# train and test the model
for epoch in range(1, 11):
    train_loss = train(model, train_loader, criterion, optimizer, device)
    test_loss, test_acc = test(model, test_loader, criterion, device)
    print(f'Epoch : {epoch}, Train Loss : {train_loss}, Test Loss : {test_loss}, Test Accuracy : {test_acc}')

# save the model


data shape : torch.Size([32, 415])
target shape : torch.Size([32])
output shape : torch.Size([32, 415, 3])
data shape : torch.Size([32, 415])
target shape : torch.Size([32])
output shape : torch.Size([32, 415, 3])
data shape : torch.Size([32, 415])
target shape : torch.Size([32])
output shape : torch.Size([32, 415, 3])
data shape : torch.Size([32, 415])
target shape : torch.Size([32])
output shape : torch.Size([32, 415, 3])
data shape : torch.Size([32, 415])
target shape : torch.Size([32])
output shape : torch.Size([32, 415, 3])
data shape : torch.Size([32, 415])
target shape : torch.Size([32])
output shape : torch.Size([32, 415, 3])
data shape : torch.Size([32, 415])
target shape : torch.Size([32])
output shape : torch.Size([32, 415, 3])
data shape : torch.Size([32, 415])
target shape : torch.Size([32])
output shape : torch.Size([32, 415, 3])
data shape : torch.Size([32, 415])
target shape : torch.Size([32])
output shape : torch.Size([32, 415, 3])
data shape : torch.Size([32, 415])
ta

RuntimeError: shape '[32, 1, 3]' is invalid for input of size 32