# KB-BERT

In [None]:
import os
import numpy as np
from dotenv import load_dotenv, find_dotenv
import sys
import torch
from torch import nn

# Put the directory that contains the located .env file on the import path
# (presumably the project root, so that `py_scripts` can be imported below —
# TODO confirm).
sys.path.append(os.path.dirname(find_dotenv()))

In [None]:
# Import the CSV-reading helper from the project's file_handler module.
from py_scripts.file_handler import read_csv_file

# Read "clean.csv"; the helper's result is unpacked into two values
# (presumably inputs X and labels Y — verify against read_csv_file).
X, Y = read_csv_file("clean.csv")

In [None]:
import py_scripts.ner_util.ner_system as ner_util

## Defining the model

In [None]:
from transformers import AutoModel, AutoTokenizer

# Hugging Face hub identifier shared by the tokenizer and the encoder, kept in
# one place so the two can never point at different checkpoints.
_KB_BERT_CHECKPOINT = 'KB/bert-base-swedish-cased'

# Tokenizer and pretrained encoder loaded from the same checkpoint.
kb_tokenizer = AutoTokenizer.from_pretrained(_KB_BERT_CHECKPOINT)
kb_model = AutoModel.from_pretrained(_KB_BERT_CHECKPOINT)

In [None]:
class KB_BERT(nn.Module):
    """Per-token classifier: the pretrained encoder followed by a linear layer.

    Args:
        seq_labeler: Object exposing ``n_labels``, the number of output
            scores produced for every token position.
    """

    def __init__(self, seq_labeler):
        super().__init__()

        # Reuse the module-level pretrained encoder. Assigning it as an
        # attribute registers it as a submodule of this model; note that every
        # KB_BERT instance therefore shares the same encoder object.
        self.bert = kb_model

        # Output unit: maps each encoder hidden vector to one score per label.
        self.top_layer = nn.Linear(self.bert.config.hidden_size, seq_labeler.n_labels)

    def forward(self, words):
        """Return label scores for the encoded input.

        Args:
            words: Tensor of token ids passed straight to the encoder
                (assumed to be (batch, seq_len) — TODO confirm against the
                caller in ner_util).

        Returns:
            The output of the linear layer applied to the encoder's first
            output, i.e. the encoder output with its last dimension replaced
            by ``n_labels``.
        """
        # NOTE(review): no attention mask is passed, so any padding positions
        # in `words` are treated like real tokens — confirm this is intended.
        encoder_outputs = self.bert(words)

        # The first element of the encoder output holds the per-token hidden
        # states (the last hidden state for Hugging Face BERT models).
        hidden_states = encoder_outputs[0]
        return self.top_layer(hidden_states)

In [None]:
class NERParameters:
    """Bundle of settings handed to the sequence labeler as class attributes."""

    # --- Reproducibility and hardware -------------------------------------
    # Seed for the random number generators, so runs can be reproduced.
    random_seed = 0
    # Target device: 'cuda' or 'cpu'.
    device = 'cpu'

    # --- Word embeddings --------------------------------------------------
    # Embedding width. NB: only relevant when the embedding model is trained
    # from scratch.
    word_emb_dim = 128
    # Whether the word embedding model is fine-tuned during training.
    finetune_word_emb = False
    # Word dropout rate.
    word_dropout_prob = 0.0
    # Switch to True to enable character tensors.
    use_characters = False

    # --- Optimisation -----------------------------------------------------
    n_epochs = 20
    batch_size = 32
    learning_rate = 8e-05
    weight_decay = 0

    # --- BERT-specific ----------------------------------------------------
    # Presumably the maximum input length for BERT — confirm in ner_util.
    bert_max_len = 256
    # NOTE(review): left False even though a BERT model is used in this
    # notebook; its exact meaning is defined in ner_util — confirm.
    bert_model = False

In [None]:
# Download the 1177 training and validation CSV files (IPython shell commands;
# wget saves them under their remote file names).
!wget https://raw.githubusercontent.com/lucasros98/files/main/train_1177_clean.csv
!wget https://raw.githubusercontent.com/lucasros98/files/main/val_1177_clean.csv

In [None]:
# Load the 1177 training and validation splits; each call's result is unpacked
# into two values (presumably inputs and labels — verify against read_csv_file).
# NOTE(review): the downloaded files are named "*.csv" and the earlier
# read_csv_file("clean.csv") call includes the extension, but these names omit
# it — confirm that read_csv_file resolves them correctly.
X_train_1177_clean, Y_train_1177_clean = read_csv_file("train_1177_clean")
X_val_1177_clean, Y_val_1177_clean = read_csv_file("val_1177_clean")

In [None]:
# Build the sequence labeler from the hyperparameters and the KB_BERT model
# class. The tokenizer already loaded as `kb_tokenizer` is reused instead of
# calling from_pretrained a second time for the same checkpoint.
ner_system = ner_util.SequenceLabeler(
    NERParameters(),
    KB_BERT,
    bert_tokenizer=kb_tokenizer,
)

# Fit on the 1177 training split, passing the validation split alongside it.
ner_system.fit(
    X_train_1177_clean,
    Y_train_1177_clean,
    X_val_1177_clean,
    Y_val_1177_clean,
)