In [46]:
!pip install transformers


# Importing the libraries needed
import pandas as pd
import numpy as np
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer
from sklearn.metrics import f1_score, accuracy_score





In [22]:
# Import the csv into pandas dataframe and add the headers
df = pd.read_csv("/content/drive/MyDrive/nlp_project_nico/data/PPRs_cleaned.csv", engine = "python")
df.tail()


Unnamed: 0,text,label,date
0,Zum Amtsantritt von Fritz Kuhn als Oberbürgerm...,Greens,2013-01-07
1,indischen Frauen Zu den anhaltenden P ten in I...,Greens,2013-01-07
2,Zur Pressekonferenz von Kultus- und Wissenscha...,Greens,2013-01-09
3,"Zu Presseberichten über den -Vorschlag, das Ki...",Greens,2013-01-09
4,Zu den Vorschlägen der für die Einführung ein...,Greens,2013-01-09


In [3]:
df.label.unique()

array(['Greens', 'AfD', 'FDP', 'Linke', 'SPD', 'Union'], dtype=object)

In [4]:
# Encoding the labels
encode_dict = {}

def encode_cat(x):
    if x not in encode_dict.keys():
        encode_dict[x]=len(encode_dict)
    return encode_dict[x]

df["encode_lab"] = df["label"].apply(lambda x: encode_cat(x))

df.tail()

Unnamed: 0,text,label,date,encode_lab
28106,Die Vorschläge des Sachverständigenrats sind...,Union,2019-07-12,5
28107,âWir mÃ¼ssen vorsichtig sein mit Pauschalau...,Union,2019-07-15,5
28108,âUrsula von der Leyens Wahl zur Kommissions...,Union,2019-07-16,5
28109,âDie Entscheidung fÃ¼r die -Bundesvorsitzen...,Union,2019-07-17,5
28110,âBei der Beratung der VorschlÃ¤ge fÃ¼r eine...,Union,2019-07-18,5


In [5]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
TEST_BATCH_SIZE = 2
EPOCHS = 3
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-german-cased')

In [6]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [7]:
class PPRs(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        text = str(self.data.text[index])
        #text = " ".join(text.split())We could omit whitespace
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.encode_lab[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [8]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=123)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

valid_size = 0.2
valid_dataset = df.sample(frac=valid_size, random_state=123)
train_dataset = df.drop(valid_dataset.index).reset_index(drop=True)
valid_dataset = valid_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VALIDATION Dataset: {}".format(valid_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = PPRs(train_dataset, tokenizer, MAX_LEN)
validation_set = PPRs(valid_dataset, tokenizer, MAX_LEN)
testing_set = PPRs(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (28111, 4)
TRAIN Dataset: (22489, 4)
VALIDATION Dataset: (5622, 4)
TEST Dataset: (5622, 4)


In [9]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }
valid_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **valid_params)
testing_loader = DataLoader(testing_set, **test_params)

In [10]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-german-cased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 6)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [11]:
model = DistilBERTClass()
model.to(device)

DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(31102, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_feat

In [12]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [13]:
# Function to calcuate the accuracy of the model

def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [14]:
# Defining the training function on the 60% of the dataset for tuning the distilbert model

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [15]:
for epoch in range(EPOCHS):
    train(epoch)



Training Loss per 5000 steps: 1.8680355548858643
Training Accuracy per 5000 steps: 0.0
Training Loss per 5000 steps: 0.2106647894131975
Training Accuracy per 5000 steps: 93.42131573685263
The Total Accuracy for Epoch 0: 93.95259904842368
Training Loss Epoch: 0.1945740482391241
Training Accuracy Epoch: 93.95259904842368
Training Loss per 5000 steps: 0.01329158153384924
Training Accuracy per 5000 steps: 100.0
Training Loss per 5000 steps: 0.03295347671548801
Training Accuracy per 5000 steps: 99.08018396320736
The Total Accuracy for Epoch 1: 99.07955000222331
Training Loss Epoch: 0.03256145468435288
Training Accuracy Epoch: 99.07955000222331
Training Loss per 5000 steps: 0.0009782195556908846
Training Accuracy per 5000 steps: 100.0
Training Loss per 5000 steps: 0.01740860895651511
Training Accuracy per 5000 steps: 99.51009798040391
The Total Accuracy for Epoch 2: 99.5108719818578
Training Loss Epoch: 0.017637187697917923
Training Accuracy Epoch: 99.5108719818578


In [44]:
def valid(model, validation_loader):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0
    with torch.no_grad():
        for _, data in enumerate(validation_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu

In [45]:


print('This is the validation section to print the accuracy and see how it performs')
print('Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch')

acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)



This is the validation section to print the accuracy and see how it performs
Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch
Validation Loss per 100 steps: 0.00015424482990056276
Validation Accuracy per 100 steps: 100.0




Validation Loss Epoch: 0.006493519316566665
Validation Accuracy Epoch: 99.85770188545001
Accuracy on test data = 99.86%


In [None]:

# Saving the files for re-use

output_model_file = './models/pytorch_distilbert_optimized.bin'
output_vocab_file = './models/vocab_distilbert_optimized.bin'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')