In [1]:
import sentencepiece as spm
import pandas as pd
import urllib.request
import csv
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence


  from .autonotebook import tqdm as notebook_tqdm


Parameters

In [2]:
batch_size = 16
vocab_size = 2000
embedding_size = 32

Tokenize text

In [3]:
train_df = pd.read_csv('train.csv')
valid_df = pd.read_csv('valid.csv')
with open('train.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(train_df['text']))

spm.SentencePieceTrainer.Train('--input=train.txt --model_prefix=train --vocab_size={0} --model_type=unigram --max_sentence_length=9999'.format(vocab_size))
vocab_list = pd.read_csv('train.vocab', sep='\t', header=None, quoting=csv.QUOTE_NONE)

sp = spm.SentencePieceProcessor()
sp.load('train.model')

True

Define x_train, y_train tensor

In [4]:
data_train = [0 for _ in range(len(train_df))]
for i in range(len(train_df)):
    data_train[i] = torch.FloatTensor(sp.encode_as_ids(train_df['text'][i]))

data_valid = [0 for _ in range(len(valid_df))]
for i in range (len(valid_df)):
    data_valid[i] = torch.FloatTensor(sp.encode_as_ids(valid_df['text'][i]))

x_train = torch.transpose(pad_sequence(data_train), 0, 1)
y_train = torch.FloatTensor(train_df['target'])

data_valid.append(torch.zeros(size=[x_train.shape[1]]))
x_valid = torch.transpose(pad_sequence(data_valid), 0, 1)[0:-1]
y_valid = torch.FloatTensor(valid_df['target'])

x_train_maxlen = x_train.shape[-1]


print()




Define Dataset

In [5]:
class CustomDataset(Dataset):
    def __init__(self, x, y):
        self.x_data = x
        self.y_data = y
    
    def __len__(self):
        return len(self.y_data)
    
    def __getitem__(self, idx):
        x = torch.FloatTensor(self.x_data[idx])
        y = torch.FloatTensor(self.y_data[idx])

        return x, y

train_set = CustomDataset(x_train, y_train)
valid_set = CustomDataset(x_valid, y_valid)

trainloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
validloader = DataLoader(valid_set, batch_size=batch_size, shuffle=True)

Define Positional Embedding

In [6]:
class PositionalEmbedding(nn.Module):
    def __init__(self, max_len, vocab_size, embedding_dim):
        super(PositionalEmbedding, self).__init__()
        self.token_emb = nn.Embedding(vocab_size, embedding_dim)
        self.pos_emb = nn.Embedding(max_len, embedding_dim)
    
    def forward(self, x):
        max_len = x.shape[-1]
        positions = torch.arange(start=0, end=max_len, step=1)
        positions = self.pos_emb(positions.long())
        x = self.token_emb(x.long())
        return x + positions

Define Multi Head Attention

In [7]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embedding_dim, num_heads=8):
        super(MultiHeadAttention, self).__init__()
        self.embedding_dim = embedding_dim
        self.num_heads = num_heads

        assert embedding_dim % self.num_heads == 0

        self.projection_dim = embedding_dim // num_heads
        self.Wq = nn.Linear(embedding_dim, embedding_dim) 
        self.Wk = nn.Linear(embedding_dim, embedding_dim)
        self.Wv = nn.Linear(embedding_dim, embedding_dim)
        self.Wo = nn.Linear(embedding_dim, embedding_dim)

    def self_attention(self, query, key, value):
        # query, key, value = (batch size, num heads, seq len, projection dim)
        matmul_qk = torch.matmul(query, torch.transpose(key, -1, -2))
        # matmul_qk = (batch size, num heads, seq len, seq len)
        projection_dim = torch.FloatTensor([key.shape[-1]])
        scores = matmul_qk / torch.sqrt(projection_dim)
        attention_weights = torch.softmax(scores, axis=-1)
        # attention_weights = (batch size, num heads, seq len, seq len)
        output = torch.matmul(attention_weights, value)
        # output = (batch size, num heads, seq len, projection dim) * 8-head weighted sum of values
        return output, attention_weights
    
    def split_heads(self, x, batch_size):
        # x = (batch size, seq len, embedding dim)
        # output = (batch size, num heads, seq len, embedding dim / num heads)
        x = torch.reshape(x, [batch_size, self.num_heads, -1, self.projection_dim])
        return x
    
    def forward(self, inputs):
        # input shape = (batch size, seq len, embedding dim)
        batch_size = inputs.shape[0]

        # define query, key, value
        query = self.Wq(inputs)
        key = self.Wk(inputs)
        value = self.Wv(inputs)

        # multi-head split q, k, v
        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)
        
        # input, output = (batch size, num heads, seq len, projection dim)
        scaled_attention, _ = self.self_attention(query, key, value)

        # reshape to concat -> (batch size, seq len, num heads, projection dim)
        scaled_attention = torch.transpose(scaled_attention, 1, 2)

        # concat -> (batch size, seq len, num heads * projection dim = embedding dim)
        concat_attention = torch.reshape(scaled_attention, shape=[batch_size, -1, self.embedding_dim])

        # linear embedding dim -> embedding dim
        outputs = self.Wo(concat_attention)
        return outputs


Define Transformer Block

In [8]:
seq_len = x_train_maxlen

class TransformerBlock(nn.Module):
    def __init__(self, embedding_dim, num_heads, dff, dropout=0.1):
        super(TransformerBlock, self).__init__()
        self.multi_head_attention = MultiHeadAttention(embedding_dim, num_heads)
        self.ffn = nn.Sequential(nn.Linear(embedding_dim, dff), nn.ReLU(), nn.Linear(dff, embedding_dim))
        self.layernorm1 = nn.LayerNorm(normalized_shape=[seq_len, embedding_dim], eps=1e-6)
        self.layernorm2 = nn.LayerNorm(normalized_shape=[seq_len, embedding_dim], eps=1e-6)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.1)

    def forward(self, x):
        a1 = self.dropout1(self.multi_head_attention(x))
        a1 = self.layernorm1(x + a1)
        a2 = self.dropout2(self.ffn(a1))
        a2 = self.layernorm2(a1 + a2)
        return a2


Define Total Neural Net

In [9]:
num_heads = 4
dff = 16
dropout = 0.1

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.embedding = PositionalEmbedding(x_train_maxlen, vocab_size, embedding_size)
        self.transformer_block = TransformerBlock(embedding_size, num_heads, dff, dropout)
        self.avg_global_pool = nn.AvgPool1d(kernel_size=seq_len)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Sequential(nn.Linear(embedding_size, 20), nn.ReLU(), nn.Dropout(0.1), nn.Linear(20, 2), nn.Softmax(dim=1))
        

    def forward(self, x):
        positional_embedding = self.embedding(x)
        transformer_output = self.transformer_block(positional_embedding)
        transformer_output = torch.transpose(transformer_output, 1, 2)
        transformer_output_2d = torch.squeeze(self.avg_global_pool(transformer_output))
    
        dense_output = self.dense(self.dropout(transformer_output_2d))
        dense_output = torch.transpose(dense_output, 0, 1)
        final_scores = dense_output[0]

        return final_scores

Define Train Network

In [10]:
def train(net, optimizer, trainloader, validloader, num_epoch, model_path):
    for epoch in range(num_epoch):
        total_loss = 0
        total_acc = 0
        for i, data in enumerate(trainloader, 0):
            optimizer.zero_grad()
            x, y = data
            result = net(x)
            loss = F.binary_cross_entropy(result, y)
            acc = torch.sum(torch.where(abs(result-y)<0.5, 1, 0))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            total_acc += acc.item()
        
        total_loss = total_loss / len(trainloader)
        total_acc = total_acc / (len(trainloader) * batch_size)
        valid_loss, valid_acc = valid(net, validloader)
        print("epoch: %d train loss: %.3f train acc: %.3f valid loss: %.3f valid acc: %.3f" % (epoch+1, total_loss, total_acc, valid_loss, valid_acc))

def valid(net, dataloader):
    net.eval()
    total_loss = 0
    total_acc = 0
    for i, data in enumerate(dataloader, 0):
        x, y = data
        result = net(x)
        loss = F.binary_cross_entropy(result, y)
        acc = torch.sum(torch.where(abs(result-y)<0.5, 1, 0))
        total_loss += loss.item()
        total_acc += acc.item()
    
    total_loss = total_loss / len(dataloader)
    total_acc = total_acc / (len(dataloader) * batch_size)

    net.train()
    return total_loss, total_acc

In [None]:
lr = 0.001
betas = (0.9, 0.999)
net = Net()

optimizer = torch.optim.Adam(params=net.parameters(), weight_decay=0.001, lr=lr, betas=betas, eps=1e-8)
train(net, optimizer, trainloader, validloader, 20, None)


Import Pre-Trained BERT

In [126]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
bert_model.to('cuda')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

Re-Define x_train, x_valid

In [None]:
with torch.no_grad():

    text_train = [0 for _ in range(len(train_df))]
    for i in range(len(train_df)):
        text_train[i] = train_df['text'][i]

    text_valid = [0 for _ in range(len(valid_df))]
    for i in range(len(valid_df)):
        text_valid[i] = valid_df['text'][i]

    x_valid = tokenizer(text_valid, return_tensors='pt', padding='max_length', max_length=128).to('cuda')
    print(x_valid)
    output = bert_model(**x_valid)
    print(output)




In [144]:
torch.cuda.empty_cache()

print(torch.cuda.memory_allocated())
print(torch.cuda.memory_reserved())

7751665152
7811891200
