In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import json
import numpy as np
import math
from nltk import word_tokenize
import torchtext
from transformers import BertTokenizer, BertModel

batch_size = 4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
train_data = []
train_inp_file = 'processed_data/GCDC/Clinton_train.jsonl'
with open(train_inp_file, 'r') as f:
    for line in f:
        json_obj = json.loads(line)
        # put <EOS> at the end of each sentence and add each sentence to the list
        for i in range(len(json_obj['sentences'])):
            json_obj['sentences'][i].append('<EOS>')
        # merge all sentences into one
        document = " ".join([word for sentence in json_obj['sentences'] for word in sentence])
        document = document.lower()
        label = json_obj['label']
        train_data.append((document, label-1))
   

test_data = []
test_inp_file = 'processed_data/GCDC/Clinton_test.jsonl'
with open(test_inp_file, 'r') as f:
    for line in f:
        json_obj = json.loads(line)
        # put <EOS> at the end of each sentence and add each sentence to the list
        for i in range(len(json_obj['sentences'])):
            json_obj['sentences'][i].append('<EOS>')
        # merge all sentences into one
        document = " ".join([word for sentence in json_obj['sentences'] for word in sentence])
        document = document.lower()
        label = json_obj['label']
        test_data.append((document, label-1))
        
print(len(train_data))
print(train_data[0])
print(train_data[1])
print(train_data[2])
print(len(test_data))
print(test_data[0])

800
('two options the us views the transitional national council as the sole / only legitimate interlocutor of the libyan people during this interim period , as libyans come together to plan their own future and a permanent , inclusive constitutional system that protects the rights of all libyans . <eos> this is in contrast to the qadhafi regime , which has lost all legitimacy to rule . <eos> the us views the transitional national council as the legitimate interlocutor of the libyan people during this interim period , as libyans come together to plan their own future and a permanent , inclusive constitutional system that protects the rights of all libyans . <eos> this is in contrast to the qadhafi regime , which has lost all legitimacy to rule . <eos> the inc is the institution through which we are engaging the libyan people at this time . <eos>', 2)
("ambassador , we just received an email from the adoption service provider about these cases . <eos> i am currently reviewing the files 

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')

print(tokenizer.vocab_size)
print(bert.config.hidden_size)

max_pad_len = 512
pad_id = 0

class GCDCDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        document, label = self.data[idx]
        tokenized_document = self.tokenizer.tokenize(document)
        indexed_document = self.tokenizer.convert_tokens_to_ids(tokenized_document)
        indexed_document = indexed_document[:max_pad_len]
        indexed_document = indexed_document + [pad_id] * (max_pad_len - len(indexed_document))
        return torch.tensor(indexed_document), torch.tensor(label)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


30522
768


In [5]:
    
train_dataset = GCDCDataset(train_data, tokenizer)
test_dataset = GCDCDataset(test_data, tokenizer)

print(len(train_dataset))
print(len(test_dataset))
print(train_dataset[0])
print(test_dataset[0])

800
200
(tensor([ 2048,  7047,  1996,  2149,  5328,  1996, 17459,  2120,  2473,  2004,
         1996,  7082,  1013,  2069, 11476,  6970,  4135, 12690,  2953,  1997,
         1996, 19232,  2111,  2076,  2023,  9455,  2558,  1010,  2004, 19232,
         2015,  2272,  2362,  2000,  2933,  2037,  2219,  2925,  1998,  1037,
         4568,  1010, 18678,  6543,  2291,  2008, 18227,  1996,  2916,  1997,
         2035, 19232,  2015,  1012,  1026,  1041,  2891,  1028,  2023,  2003,
         1999,  5688,  2000,  1996,  1053,  4215,  3270,  8873,  6939,  1010,
         2029,  2038,  2439,  2035, 22568,  2000,  3627,  1012,  1026,  1041,
         2891,  1028,  1996,  2149,  5328,  1996, 17459,  2120,  2473,  2004,
         1996, 11476,  6970,  4135, 12690,  2953,  1997,  1996, 19232,  2111,
         2076,  2023,  9455,  2558,  1010,  2004, 19232,  2015,  2272,  2362,
         2000,  2933,  2037,  2219,  2925,  1998,  1037,  4568,  1010, 18678,
         6543,  2291,  2008, 18227,  1996,  2916,  1997

In [6]:
# # create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [7]:
# define a classifier
class Classifier(nn.Module):
    def __init__(self, hidden_size, num_classes):
        super(Classifier, self).__init__()
        self.linear = nn.Linear(hidden_size, num_classes)
        
    def forward(self, x):
        return self.linear(x)
    
# define a transformer model
class Transformer(nn.Module):
    def __init__(self, hidden_size, num_classes):
        super(Transformer, self).__init__()
        self.bert = bert
        self.classifier = Classifier(hidden_size, num_classes)
        
    def forward(self, x):
        # x: (batch_size, seq_len)
        # output: (batch_size, hidden_size)
        output = self.bert(x)[0][:, 0, :]
        output = self.classifier(output)
        return output

In [8]:
# define a function to train the model
def train(model, optimizer, criterion, train_loader, test_loader, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        for document, label in train_loader:
            # document: (batch_size, seq_len)
            # label: (batch_size)
            document = document.to(device)
            label = label.to(device)
            # output: (batch_size, num_classes)
            output = model(document)
            # output: (batch_size, num_classes)
            loss = criterion(output, label)
            # backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print('Epoch: {}, Loss: {}'.format(epoch, loss.item()))
        test(model, test_loader)

# define a function to test the model
def test(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for document, label in test_loader:
            # document: (batch_size, seq_len)
            # label: (batch_size)
            document = document.to(device)
            label = label.to(device)
            # output: (batch_size, num_classes)
            output = model(document)
            # print(output)
            # output: (batch_size)
            output = torch.argmax(output, dim=1)
            correct += torch.sum(torch.eq(output, label)).item()
            total += len(label)
    print('Accuracy: {}'.format(correct / total))

# define a function to predict the label of a document
def predict(model, document):
    model.eval()
    with torch.no_grad():
        # document: (seq_len)
        document = torch.LongTensor(document).unsqueeze(0).to(device)
        # output: (1, num_classes)
        output = model(document)
        # output: (1)
        output = torch.argmax(output, dim=1)
        return output.item()


In [9]:
# create a transformer model
hidden_size = bert.config.hidden_size
num_classes = 3
model = Transformer(hidden_size, num_classes).to(device)

# define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [10]:
# train the model
num_epochs = 10
train(model, optimizer, criterion, train_loader, test_loader, num_epochs)

KeyboardInterrupt: 

In [13]:
# print the labels of the test data
num_1 = 0
num_2 = 0
for document, label in train_dataset:
    print(label)
    if label == 2:
        num_2 += 1
    elif label == 1:
        num_1 += 1

print(800 - num_1 - num_2)
print(num_1)
print(num_2)

tensor(2)
tensor(0)
tensor(1)
tensor(2)
tensor(0)
tensor(2)
tensor(0)
tensor(2)
tensor(2)
tensor(2)
tensor(2)
tensor(0)
tensor(2)
tensor(0)
tensor(1)
tensor(2)
tensor(2)
tensor(1)
tensor(1)
tensor(0)
tensor(2)
tensor(2)
tensor(0)
tensor(2)
tensor(1)
tensor(0)
tensor(2)
tensor(1)
tensor(2)
tensor(0)
tensor(2)
tensor(1)
tensor(0)
tensor(1)
tensor(2)
tensor(2)
tensor(1)
tensor(2)
tensor(1)
tensor(2)
tensor(0)
tensor(1)
tensor(2)
tensor(1)
tensor(2)
tensor(2)
tensor(1)
tensor(2)
tensor(0)
tensor(1)
tensor(2)
tensor(2)
tensor(2)
tensor(2)
tensor(0)
tensor(2)
tensor(2)
tensor(2)
tensor(0)
tensor(2)
tensor(2)
tensor(2)
tensor(2)
tensor(1)
tensor(2)
tensor(0)
tensor(2)
tensor(1)
tensor(1)
tensor(1)
tensor(1)
tensor(2)
tensor(2)
tensor(2)
tensor(2)
tensor(1)
tensor(2)
tensor(2)
tensor(1)
tensor(2)
tensor(1)
tensor(2)
tensor(0)
tensor(1)
tensor(1)
tensor(2)
tensor(0)
tensor(1)
tensor(2)
tensor(0)
tensor(2)
tensor(0)
tensor(2)
tensor(2)
tensor(1)
tensor(2)
tensor(2)
tensor(2)
tensor(2)
tensor(2)
