# KB-BERT

In [None]:
import os
from dotenv import load_dotenv, find_dotenv
import sys
from torch import nn

#Locate the .env file once and reuse the result for both steps below
dotenv_location = find_dotenv()

#Put the directory that contains the .env file on the import path
#(presumably the project root, so that project packages can be imported - confirm)
sys.path.append(os.path.dirname(dotenv_location))

#Load the variables defined in the .env file into the environment
load_dotenv(dotenv_location)

## Getting the data

In [None]:
#Import the file_handler
from py_scripts.file_handler import read_csv_file

#Import the NER system
import py_scripts.ner_util.ner_system as ner_util

In [None]:
#Load the data set from "clean.csv" via the project's file handler.
#The helper returns two values, bound here as X and Y
#(presumably the inputs and their labels - confirm against py_scripts.file_handler)
X, Y = read_csv_file("clean.csv")

In [None]:
from sklearn.model_selection import train_test_split

#Target share of the data for each split (train / validation / test)
train_ratio = 0.8
validation_ratio = 0.10
test_ratio = 0.10

#Fixed seed so that both splits below are reproducible
random_state = 104

#Step 1: set aside everything that is not training data
holdout_share = 1 - train_ratio
X_train, X_holdout, Y_train, Y_holdout = train_test_split(
    X, Y, test_size=holdout_share, random_state=random_state
)

#Step 2: divide the held-out part between validation and test
test_share_of_holdout = test_ratio / (test_ratio + validation_ratio)
X_val, X_test, Y_val, Y_test = train_test_split(
    X_holdout, Y_holdout, test_size=test_share_of_holdout, random_state=random_state
)

#Get the fraction of the training data that should be used for training.
#The first command-line argument, when present, is read as a percentage
#(e.g. "50" -> 0.5). A missing argument or the literal string "None" means
#"use all of the training data".
precentage = 1.0
if len(sys.argv) > 1 and sys.argv[1] != "None":
    try:
        requested_fraction = float(sys.argv[1]) / 100
    except ValueError:
        #Only a non-numeric argument can fail here; anything else should surface
        requested_fraction = None

    #Accept only values inside [0, 1]. A negative fraction would silently cut
    #samples from the END of the training set, NaN/inf would crash in int(),
    #and anything above 1 would be reported as used although it cannot be.
    if requested_fraction is not None and 0.0 <= requested_fraction <= 1.0:
        precentage = requested_fraction
    else:
        print("Error occured while parsing the precentage from the sys args. Please check the sys args.")

#Keep the leading part of the training split; X and Y are cut at the same
#index so that samples and labels stay aligned
X_train = X_train[:int(len(X_train)*precentage)]
Y_train = Y_train[:int(len(Y_train)*precentage)]

print("Using " + str(precentage*100) + "% of the data for training.")

In [None]:
#Report how many samples ended up in each split
print("Length of the data:")
for split_name, split_inputs in (("Train", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_name}: {len(split_inputs)}")

## Defining the model

In [None]:
from transformers import AutoTokenizer, AutoModel

#Checkpoint name handed to from_pretrained; written once so that the
#tokenizer and the encoder are always loaded from the same checkpoint
KB_BERT_CHECKPOINT = 'KB/bert-base-swedish-cased'

kb_tokenizer = AutoTokenizer.from_pretrained(KB_BERT_CHECKPOINT)
kb_model = AutoModel.from_pretrained(KB_BERT_CHECKPOINT)

In [None]:
class KB_BERT(nn.Module):
    """Sequence labelling model: an encoder followed by one linear layer.

    The encoder output is passed through ``top_layer``, which maps every
    hidden vector to ``seq_labeler.n_labels`` scores.
    """

    def __init__(self, seq_labeler, bert=None):
        """Build the model.

        Args:
            seq_labeler: Object that exposes ``n_labels``, the size of the
                output layer.
            bert: Optional encoder to use instead of the module-level
                ``kb_model``. It must expose ``config.hidden_size`` and, when
                called, return a sequence whose first item holds the hidden
                states that are fed to the output layer.
        """
        super().__init__()

        # NOTE: the default is the module-level ``kb_model`` object itself,
        # not a copy, so every instance built without ``bert`` shares (and
        # trains) the same encoder weights.
        self.bert = kb_model if bert is None else bert

        # Output unit: one score per label for each hidden vector.
        self.top_layer = nn.Linear(self.bert.config.hidden_size, seq_labeler.n_labels)

    def forward(self, words):
        """Return the label scores for ``words``.

        Args:
            words: Input passed unchanged to the encoder (presumably a batch
                of token ids - confirm against the caller).

        Returns:
            The output of ``top_layer`` applied to the first element of the
            encoder output.
        """
        encoder_outputs = self.bert(words)
        # The first element of the encoder output holds the hidden states
        # (for Hugging Face BERT models this is expected to be the last
        # hidden state - see the library documentation).
        hidden_states = encoder_outputs[0]
        return self.top_layer(hidden_states)

### Defining NER Parameters

In [None]:
#Import the NER parameter container from parameters.py
from parameters import NERParameters

#Create the parameter object with no arguments; individual settings are
#overridden below by plain attribute assignment
params = NERParameters()

#Override: use the "IO" tagging scheme
#(presumably inside/outside tags without a begin marker - confirm in ner_system)
params.tagging_scheme = "IO"


## Finetuning BERT model

In [None]:
#Build the sequence labeler around the KB_BERT model class. The tokenizer that
#was already loaded next to kb_model is reused instead of loading the same
#checkpoint a second time, so tokenizer and encoder cannot drift apart.
ner_system = ner_util.SequenceLabeler(params, KB_BERT, bert_tokenizer=kb_tokenizer)

#Fine-tune on the training split; the validation split is passed along
#(presumably for monitoring during training - confirm in ner_system)
ner_system.fit(X_train, Y_train, X_val, Y_val)

## Evaluation of the system

Evaluate the system on the test data.

In [None]:
#Evaluate the fitted system on the test split, which was held out from both
#the training and the validation data
ner_system.evaluate_model(X_test,Y_test)

In [None]:
#Print some examples