<a href="https://colab.research.google.com/github/nicolaiberk/nlpdl_project/blob/main/PressReleaseEst_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers



In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments, DistilBertTokenizerFast
import os
import torch
import pandas as pd

In [None]:
# Build the 6-class DistilBERT sequence-classification architecture from the
# German base checkpoint; its classification head is replaced by the
# fine-tuned baseline weights loaded from Drive below.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-german-cased", num_labels = 6)
# map_location="cpu" makes the checkpoint loadable on runtimes without a GPU
# even if the tensors were saved from a CUDA device.
state_dict = torch.load(
    os.path.join("drive", "MyDrive", "nlpdl", "BERTBaseline.bin"),
    map_location="cpu",
)
# Kept as the last expression so the cell still displays the key-matching report.
model.load_state_dict(state_dict)

Some weights of the model checkpoint at distilbert-base-german-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias

<All keys matched successfully>

In [None]:
# Read the news-article subset from Drive, turn the row index into a regular
# column, and then discard every row that contains a missing value.
news_path = os.path.join("drive", "MyDrive", "nlpdl", "subset_news.csv")
news = pd.read_csv(news_path).reset_index().dropna()

In [None]:
# Tokenizer belonging to the German DistilBERT base checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-german-cased")
# Plain Python list of the article texts, in row order.
texts = [article for article in news["text"]]

In [None]:
# Tokenize all articles in one call, with padding and truncation enabled so
# the encoded sequences share a common length.
news_encodings = tokenizer(
    texts,
    padding=True,
    truncation=True,
)

In [None]:
# Generate placeholder labels for the dataset class by encoding each news
# source as an integer id.

labels = news['source']

# Sort the unique sources (by their string form) so that every run assigns the
# same id to the same source; iterating a bare set yields an arbitrary order
# that can change between interpreter sessions.
ulabels = sorted(set(labels), key=str)
label_dict = {str(source): idx for idx, source in enumerate(ulabels)}

labels = [label_dict[str(l)] for l in labels]

In [None]:
class NEWSDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer output, optionally paired with labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence of
            values (the object returned by the tokenizer call).
        labels: Optional sequence holding one label per example. When omitted
            (prediction-only use), items carry no ``labels`` entry and the
            dataset length is taken from the encodings instead.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per feature."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Only attach a label when labels were supplied at construction time.
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples in the dataset."""
        if self.labels is not None:
            return len(self.labels)
        # Without labels, fall back to the length of the first encoded feature
        # (0 when there are no features at all).
        return len(next(iter(self.encodings.values()), ()))

# Wrap the tokenized articles and their integer labels in the dataset class.
news_dataset = NEWSDataset(encodings=news_encodings, labels=labels)

In [None]:
# Trainer used for inference only: just the model, all other settings left at
# their defaults.
trainer = Trainer(model=model)

In [None]:
# Run the model over every article; `eval_res.predictions` holds the
# per-class scores consumed by the following cells.
eval_res = trainer.predict(test_dataset=news_dataset)

In [None]:
# Cross-tabulate each article's source against its highest-scoring class index.
pd.crosstab(news["source"], eval_res.predictions.argmax(axis=-1))

col_0,0,1,2,4,5
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
faz,41,607,10,9,323
spon,66,522,5,2,79
taz,315,162,3,2,105
welt,45,613,14,7,277


In [None]:
# Store each class's prediction score as its own column; a name's position in
# this tuple is the class index it is read from in the model output.
party_columns = ("green", "union", "afd", "spd", "linke", "fdp")
for class_idx, party in enumerate(party_columns):
    news[party] = eval_res.predictions[:, class_idx]

In [None]:
# Write the articles together with their prediction columns back to Drive.
pred_path = os.path.join("drive", "MyDrive", "nlpdl", "subset_news_pred.csv")
news.to_csv(pred_path)