In [74]:
import numpy as np
import os
import pandas as pd
import torch

from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, BertTokenizer, BertModel, BertConfig, DistilBertTokenizer, DistilBertModel, DistilBertConfig
from sklearn.model_selection import train_test_split

# Fixed seed for reproducibility; passed as ``random_state`` to
# ``train_test_split`` so the train/test partition is the same on every run.
RANDOM_STATE = 42

In [83]:
class ArticleDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(article_text, label_index)`` pairs.

    Every row of ``data`` must provide two columns:

    * ``'articleText'``   -- path of the article's text file, joined onto
      ``data_dir``;
    * ``'idealogyLabel'`` -- the label value, which must be a key of
      ``label_mapping``.

    Article files are read lazily, one file per ``__getitem__`` call.

    Args:
        data: pandas DataFrame with the two columns described above.
        label_mapping: Mapping from label value to integer class index.
        data_dir: Directory the ``'articleText'`` paths are joined onto.
            Defaults to ``'data'``.
        encoding: Text encoding used to decode the article files.
            Defaults to ``'utf-8'``.
    """

    def __init__(self, data, label_mapping, data_dir='data', encoding='utf-8'):
        self.__data = data
        self.__label_mapping = label_mapping
        self.__data_dir = data_dir
        self.__encoding = encoding

    def __len__(self):
        """Return the number of rows (articles) in the dataset."""
        return len(self.__data)

    def __getitem__(self, idx):
        """Return ``(article_text, label_index)`` for the row at position ``idx``.

        Raises:
            KeyError: If the row's label is not a key of ``label_mapping``.
            OSError: If the article file cannot be opened.
        """
        row = self.__data.iloc[idx]

        target_label = row['idealogyLabel']
        article_path = os.path.join(self.__data_dir, row['articleText'])

        # Decode explicitly: relying on the platform default encoding makes
        # the text that reaches the tokenizer depend on the machine's locale.
        with open(article_path, encoding=self.__encoding) as fp:
            article_text = fp.read()

        return article_text, self.__label_mapping[target_label]

In [78]:
class ArticleProfilerModel(nn.Module):
    """Multilingual-BERT article classifier that returns one logit per category.

    Raw text is tokenised inside ``forward``; the encoder's pooled output is
    passed through dropout and a single linear layer.

    Args:
        dropout: Dropout probability applied to the pooled BERT output.
        amount_of_categories: Number of output classes, i.e. the size of the
            returned logit vector.
    """

    def __init__(self, dropout=0.5, amount_of_categories=10):
        super().__init__()

        self.__tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.__model = BertModel.from_pretrained("bert-base-multilingual-cased")

        self.__dropout = nn.Dropout(dropout)
        # Size the head from the loaded encoder rather than hard-coding 768,
        # so the layer stays correct if the checkpoint is swapped.
        self.__linear = nn.Linear(self.__model.config.hidden_size, amount_of_categories)

    def forward(self, text):
        """Classify one string or a batch (list/tuple) of strings.

        Args:
            text: A single article text or a sequence of article texts.

        Returns:
            Tensor of raw, unnormalised logits, one row per input text and
            one column per category. No activation is applied: losses such as
            ``nn.CrossEntropyLoss`` expect unnormalised logits, and a ReLU
            here would clamp negative scores and zero their gradients.
        """
        # padding=True lets batches of unequal-length texts be stacked into
        # one tensor; truncation=True caps inputs at the tokenizer's maximum
        # sequence length.
        encoded_input = self.__tokenizer(
            text, return_tensors='pt', padding=True, truncation=True
        )
        # The tokenizer always builds its tensors on the CPU; move them to
        # wherever the model's weights live (a no-op when that is the CPU).
        encoded_input = encoded_input.to(self.__linear.weight.device)

        pooled_output = self.__model(**encoded_input).pooler_output

        return self.__linear(self.__dropout(pooled_output))

In [90]:
def train_loop(model, train_dataloader, epochs, loss_fn, optimizer):
    """Train ``model`` for ``epochs`` full passes over ``train_dataloader``.

    The running loss is printed for every 100th batch of each epoch.

    Args:
        model: The ``nn.Module`` to optimise; called as ``model(X)``.
        train_dataloader: Iterable of ``(X, y)`` batches that also exposes a
            ``dataset`` attribute supporting ``len()`` (used for progress
            output only).
        epochs: Number of passes over the data.
        loss_fn: Callable ``loss_fn(prediction, y)`` returning a scalar
            tensor.
        optimizer: Optimiser already bound to ``model``'s parameters.
    """
    size = len(train_dataloader.dataset)

    for epoch in range(epochs):
        print(f'Epoch {epoch}')

        # Re-enter training mode at the start of every epoch: an evaluation
        # pass may have left the model in eval mode, which would silently
        # disable dropout during training.
        model.train()

        for batch, (X, y) in enumerate(train_dataloader):
            prediction = model(X)
            loss = loss_fn(prediction, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch % 100 == 0:
                # Use separate names so the loss tensor is not shadowed by
                # its Python-float value.
                loss_value, current = loss.item(), batch * len(X)
                print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")

def test_loop(self, dataloader):
        size = len(dataloader.dataset)
        num_batches = len(dataloader)
        test_loss, correct = 0, 0

        with torch.no_grad():
            for X, y in dataloader:
                pred = self.__model(X)
                test_loss += self.__loss_fn(pred, y).item()
                correct += (pred.argmax(1) == y).type(torch.float).sum().item()

        test_loss /= num_batches
        correct /= size
        print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [91]:
# Load the article index and hold out a test split (seeded for reproducibility).
full_data = pd.read_csv(os.path.join('data', 'full_data.csv'))
train_data, test_data = train_test_split(full_data, random_state=RANDOM_STATE)

# Map each ideology label to a class index, most frequent label first.
label_mapping = {
    label: index
    for index, label in enumerate(full_data['idealogyLabel'].value_counts().keys())
}

# Seed torch as well, so batch shuffling, the classifier-head initialisation
# and dropout are repeatable across runs.
torch.manual_seed(RANDOM_STATE)

train_dataset = ArticleDataset(train_data, label_mapping)
# Reshuffle every epoch so the model does not see a fixed sample order.
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)

test_dataset = ArticleDataset(test_data, label_mapping)
test_dataloader = DataLoader(test_dataset, batch_size=1)

# Size the output layer from the data instead of relying on the default of
# 10, which does not match the number of distinct labels.
model = ArticleProfilerModel(amount_of_categories=len(label_mapping))

learning_rate = 1e-6
optimizer = Adam(model.parameters(), lr=learning_rate)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Fine-tune the classifier: ten passes over the training split with
# cross-entropy loss.
train_loop(
    model,
    train_dataloader,
    epochs=10,
    loss_fn=nn.CrossEntropyLoss(),
    optimizer=optimizer,
)

Epoch 0


{'right-wing': 0,
 'left-wing': 1,
 'liberalism': 2,
 'centre-left': 3,
 'conservatism': 4,
 'social democracy': 5,
 'centre-right': 6,
 'communism': 7,
 'centrism': 8}