In [1]:
import os
import pandas as pd
import torch

from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel, BertConfig
from sklearn.model_selection import train_test_split

RANDOM_STATE = 42

In [18]:
class ArticleDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.__data = data

    def __len__(self):
        return self.__data.shape[0]

    def __getitem__(self, idx):
        row = self.__data.iloc[idx]

        target_label = row['idealogyLabel']
        article_text_location = row['articleText']

        with open(os.path.join('data', article_text_location)) as fp:
            article_text = fp.read()

        print(article_text_location)
        return article_text, target_label

In [17]:
class ArticleProfilerModel(nn.Module):
    def __init__(self, dropout=0.5, amount_of_categories=10):
        super(ArticleProfilerModel, self).__init__()

        config = BertConfig(max_position_embeddings=4096)

        self.__tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.__model = BertModel.from_pretrained("bert-base-multilingual-cased", config=config)

        self.__dropout = nn.Dropout(dropout)
        self.__linear = nn.Linear(768, amount_of_categories)
        self.__relu = nn.ReLU()


    def forward(self, text):
        print("f1")
        encoded_input = self.__tokenizer(text, return_tensors='pt')
        print("f2")
        output = self.__model(**encoded_input).pooler_output
        print("f3")

        print(f'output = {output[0]}')
        dropout_output = self.__dropout(output)
        linear_output = self.__linear(dropout_output)
        final_output = self.__relu(linear_output)

        return final_output

In [19]:
def train_loop(model, train_dataloader, epochs, loss_fn, optimizer):
    for epoch in range(epochs):
        size = len(train_dataloader.dataset)
        print(f'Epoch {epoch}')
        for batch, (X, y) in enumerate(train_dataloader):
            print("1")
            prediction = model(X)
            print("2")
            loss = loss_fn(prediction, y)
            print("3")

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


            if batch % 100 == 0:
                loss, current = loss.item(), batch * len(X)
                print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test_loop(self, dataloader):
        size = len(dataloader.dataset)
        num_batches = len(dataloader)
        test_loss, correct = 0, 0

        with torch.no_grad():
            for X, y in dataloader:
                pred = self.__model(X)
                test_loss += self.__loss_fn(pred, y).item()
                correct += (pred.argmax(1) == y).type(torch.float).sum().item()

        test_loss /= num_batches
        correct /= size
        print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [20]:
full_data = pd.read_csv(os.path.join('data', 'full_data.csv'))
train_data, test_data = train_test_split(full_data, random_state=RANDOM_STATE)

train_dataset = ArticleDataset(train_data)
train_dataloader = DataLoader(train_dataset, batch_size=1)

test_dataset = ArticleDataset(test_data)
test_dataloader = DataLoader(test_dataset, batch_size=1)

model = ArticleProfilerModel()

learning_rate = 1e-6
optimizer = Adam(model.parameters(), lr=learning_rate)

RuntimeError: Error(s) in loading state_dict for BertModel:
	size mismatch for bert.embeddings.word_embeddings.weight: copying a param with shape torch.Size([119547, 768]) from checkpoint, the shape in current model is torch.Size([30522, 768]).
	size mismatch for bert.embeddings.position_embeddings.weight: copying a param with shape torch.Size([512, 768]) from checkpoint, the shape in current model is torch.Size([4096, 768]).
	You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.

In [13]:
train_loop(model, train_dataloader, epochs=10, loss_fn=nn.CrossEntropyLoss(), optimizer=optimizer)

Epoch 0
elpais/10.txt
1
f1
f2


RuntimeError: The size of tensor a (1127) must match the size of tensor b (512) at non-singleton dimension 1