Load BERT model.

In [20]:
from transformers import BertModel, BertTokenizer
import torch


device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = BertTokenizer.from_pretrained("./bert")
model = BertModel.from_pretrained("./bert").to(device)
def bert_embedding(text):
    encoded_input = tokenizer(text, return_tensors='pt').to(device)
    output = model(**encoded_input)
    return output.last_hidden_state.cpu().detach()

Some weights of the model checkpoint at ./bert were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Load dataset.

In [94]:
import pandas


train_pd = pandas.read_csv("./data/train.csv")
test_pd = pandas.read_csv("./data/test.csv")
train_X_pos, train_X_neg = [], []
test_X = []
for i in range(train_pd.shape[0]):
    line = train_pd.iloc[i]
    bert_output = bert_embedding(line["text"])
    if line["target"] == 1:
        train_X_pos.append(bert_output)
    elif line["target"] == 0:
        train_X_neg.append(bert_output)
for i in range(test_pd.shape[0]):
    line = test_pd.iloc[i]
    bert_output = bert_embedding(line["text"])
    test_X.append(bert_output)

Define model class.

In [75]:
import torch


class Decoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.bilstm = torch.nn.LSTM(768, 16, batch_first=True, bidirectional=True)
        self.attention = torch.nn.MultiheadAttention(32, 1, batch_first=True)
        self.fc = torch.nn.Linear(32, 2)

    def forward(self, x):
        x, _ = self.bilstm(x)
        x, _ = self.attention(x, x, x)
        x = x[:, 0, :]
        x = self.fc(x)
        return x

Split train and validation set. Prapare bert output as dataloader.

In [56]:
import random


def align(X, max_length=32):
    if X.shape[0] < max_length:
        X = torch.concat([X, torch.zeros((max_length - X.shape[0], X.shape[1]))], dim=0)
    return X[:max_length, :]

random.seed(1)
select = [(align(X[0]), 1) for X in train_X_pos] + [(align(X[0]), 0) for X in train_X_neg]
random.shuffle(select)
train_set = select[:]
val_set = select[int(0.9 * len(select)):]
train_loader = torch.utils.data.DataLoader(train_set, batch_size=64, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_set, batch_size=128, shuffle=False)

Train decoder.

In [89]:
decoder = Decoder().to(device)
epochs = 50
optimizer = torch.optim.Adam(decoder.parameters(), lr=1e-4, weight_decay=1e-5)
criterion = torch.nn.CrossEntropyLoss().to(device)
for epoch in range(epochs):
    decoder.train()
    for data, label in train_loader:
        data, label = data.to(device), label.to(device)
        output = decoder(data)
        loss = criterion(output, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    decoder.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for data, label in train_loader:
            data, label = data.to(device), label.to(device)
            output = torch.argmax(decoder(data), dim=1)
            correct += torch.sum(output == label)
            total += data.size(0)
    print(f"epoch: {epoch}, train acc: {correct / total}")
    correct, total = 0, 0
    with torch.no_grad():
        for data, label in val_loader:
            data, label = data.to(device), label.to(device)
            output = torch.argmax(decoder(data), dim=1)
            correct += torch.sum(output == label)
            total += data.size(0)
    print(f"epoch: {epoch}, val acc: {correct / total}")

epoch: 0, train acc: 0.7422829270362854
epoch: 0, val acc: 0.7362204790115356
epoch: 1, train acc: 0.7816892266273499
epoch: 1, val acc: 0.7847769260406494
epoch: 2, train acc: 0.8036253452301025
epoch: 2, val acc: 0.8044619560241699
epoch: 3, train acc: 0.8141337037086487
epoch: 3, val acc: 0.8188976049423218
epoch: 4, train acc: 0.8221462965011597
epoch: 4, val acc: 0.8188976049423218
epoch: 5, train acc: 0.8323919177055359
epoch: 5, val acc: 0.8215222954750061
epoch: 6, train acc: 0.8347563147544861
epoch: 6, val acc: 0.8280839920043945
epoch: 7, train acc: 0.8427689075469971
epoch: 7, val acc: 0.8438320159912109
epoch: 8, train acc: 0.8476290106773376
epoch: 8, val acc: 0.8438320159912109
epoch: 9, train acc: 0.852357804775238
epoch: 9, val acc: 0.8490813374519348
epoch: 10, train acc: 0.8530145883560181
epoch: 10, val acc: 0.8556430339813232
epoch: 11, train acc: 0.8622093796730042
epoch: 11, val acc: 0.8582677245140076
epoch: 12, train acc: 0.8636542558670044
epoch: 12, val acc: 

Print the metrics of the trained models on training set and validation set.

In [93]:
from sklearn.metrics import f1_score, precision_score, recall_score


y_pred = torch.zeros(0)
y_true = torch.zeros(0)
decoder.eval()
with torch.no_grad():
    for data, label in train_loader:
        data, label = data.to(device), label.to(device)
        output = torch.argmax(decoder(data), dim=1)
        y_pred = torch.concat((y_pred, output.cpu()))
        y_true = torch.concat((y_true, label.cpu()))
print(f"train precision: {precision_score(y_true, y_pred)}")
print(f"train recall: {recall_score(y_true, y_pred)}")
print(f"train f1: {f1_score(y_true, y_pred)}")
y_pred = torch.zeros(0)
y_true = torch.zeros(0)
decoder.eval()
with torch.no_grad():
    for data, label in val_loader:
        data, label = data.to(device), label.to(device)
        output = torch.argmax(decoder(data), dim=1)
        y_pred = torch.concat((y_pred, output.cpu()))
        y_true = torch.concat((y_true, label.cpu()))
print(f"val precision: {precision_score(y_true, y_pred)}")
print(f"val recall: {recall_score(y_true, y_pred)}")
print(f"val f1: {f1_score(y_true, y_pred)}")

train precision: 0.9959901295496607
train recall: 0.9871598899419138
train f1: 0.991555350836788
val precision: 1.0
val recall: 0.9820359281437125
val f1: 0.9909365558912386


Predict on the test set.

In [118]:
output_pd = pandas.DataFrame(test_pd["id"])
test_pred = []
for i in range(len(test_X)):
    data = test_X[i].to(device)
    pred = int(torch.argmax(decoder(data), dim=1))
    test_pred.append(pred)
output_pd["target"] = test_pred
output_pd.to_csv("./data/submission.csv", index=False)