In [28]:
import torch
from transformers import BertTokenizer, BertForTokenClassification

# Load the tokenizer and the pre-trained BERT model for Icelandic
tokenizer = BertTokenizer.from_pretrained("m3hrdadfi/icelandic-ner-bert")
model = BertForTokenClassification.from_pretrained("m3hrdadfi/icelandic-ner-bert", num_labels=9, ignore_mismatched_sizes=True)

# Define the labels used by the model
labels = [
    "O",         # Outside of a named entity
    "B-PER",     # Beginning of a person's name
    "I-PER",     # Inside of a person's name
    "B-LOC",     # Beginning of a location's name
    "I-LOC",     # Inside of a location's name
    "B-ORG",     # Beginning of an organization's name
    "I-ORG",     # Inside of an organization's name
    "B-MISC",    # Beginning of a miscellaneous entity's name
    "I-MISC"     # Inside of a miscellaneous entity's name
]

# Load the book text
with open("bnsis.txt", "r", encoding="utf-8") as f:
    book_text = f.read()

# Tokenize the book text
tokens = tokenizer.tokenize(book_text)
input_ids = tokenizer.convert_tokens_to_ids(tokens)

# Create attention masks
attention_masks = [1] * len(input_ids)

# Convert inputs to PyTorch tensors
input_ids = torch.tensor([input_ids])
attention_masks = torch.tensor([attention_masks])

# Set the model to evaluation mode
model.eval()

# Make predictions
with torch.no_grad():
    outputs = model(input_ids, attention_masks)

# Get the predicted labels
predicted_labels = torch.argmax(outputs[0], dim=2).squeeze()

# Extract named characters
characters = []
current_character = ""
inside_character = False

for token, label in zip(tokens, predicted_labels):
    label = label.item()
    if label == 1 or label == 2:  # Beginning or inside a person's name
        if not inside_character:
            inside_character = True
            current_character += token.replace("##", "")
        else:
            current_character += " " + token.replace("##", "")
    else:
        if inside_character:
            characters.append(current_character)
            current_character = ""
            inside_character = False

# Print the extracted named characters
for character in characters:
    print(character)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at m3hrdadfi/icelandic-ner-bert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([17, 768]) in the checkpoint and torch.Size([9, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([17]) in the checkpoint and torch.Size([9]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: The expanded size of the tensor (203959) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 203959].  Tensor sizes: [1, 512]

In [31]:
import torch
from transformers import BertTokenizer, BertForTokenClassification

# Load the tokenizer and the pre-trained BERT model for Icelandic
tokenizer = BertTokenizer.from_pretrained("m3hrdadfi/icelandic-ner-bert")
model = BertForTokenClassification.from_pretrained("m3hrdadfi/icelandic-ner-bert", num_labels=9)

# Define the labels used by the model
labels = [
    "O",         # Outside of a named entity
    "B-PER",     # Beginning of a person's name
    "I-PER",     # Inside of a person's name
    "B-LOC",     # Beginning of a location's name
    "I-LOC",     # Inside of a location's name
    "B-ORG",     # Beginning of an organization's name
    "I-ORG",     # Inside of an organization's name
    "B-MISC",    # Beginning of a miscellaneous entity's name
    "I-MISC"     # Inside of a miscellaneous entity's name
]

# Load the book text
with open("bnsis.txt", "r", encoding="utf-8") as f:
    book_text = f.read()

# Split the book text into smaller segments
segment_length = 512  # Maximum sequence length supported by BERT
segments = [book_text[i:i+segment_length] for i in range(0, len(book_text), segment_length)]

# Create a list to store the named characters
characters = []

# Process each text segment
for segment in segments:
    # Tokenize the segment
    tokens = tokenizer.tokenize(segment)

    # Convert tokens to IDs
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Create attention mask
    attention_mask = [1] * len(input_ids)

    # Pad sequences to the same length
    padding_length = segment_length - len(input_ids)
    input_ids += [0] * padding_length
    attention_mask += [0] * padding_length

    # Convert inputs to PyTorch tensors
    input_ids = torch.tensor([input_ids])
    attention_mask = torch.tensor([attention_mask])

    # Set the model to evaluation mode
    model.eval()

    # Make predictions
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)

    # Get the predicted labels
    predicted_labels = torch.argmax(outputs[0], dim=2).squeeze()

    # Extract named characters
    current_character = ""
    inside_character = False

    for token, label in zip(tokens, predicted_labels):
        label = label.item()
        if label == 1 or label == 2:  # Beginning or inside a person's name
            if not inside_character:
                inside_character = True
                current_character += token.replace("##", "")
            else:
                current_character += " " + token.replace("##", "")
        else:
            if inside_character:
                characters.append(current_character)
                current_character = ""
                inside_character = False

# Print the extracted named characters
for character in characters:
    print(character)


RuntimeError: Error(s) in loading state_dict for BertForTokenClassification:
	size mismatch for classifier.weight: copying a param with shape torch.Size([17, 768]) from checkpoint, the shape in current model is torch.Size([9, 768]).
	size mismatch for classifier.bias: copying a param with shape torch.Size([17]) from checkpoint, the shape in current model is torch.Size([9]).
	You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.

In [21]:
!pip install transformers


Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
Collecting huggingface-hub<1.0,>=0.14.1
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-win_amd64.whl (3.5 MB)
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [4]:
python -m spacy download is_core_news_sm

SyntaxError: invalid syntax (926975567.py, line 1)

In [6]:
import requests

# URL of the GreynirServer API
url = 'http://localhost:8000/api/v1/process'

# Read the text document
with open('bnsis.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Define the request payload
data = {
    'greynir': text,
    'parse_tree': True,
    'relations': True,
}

# Send the request
response = requests.post(url, json=data)

# Check the response status
if response.status_code == 200:
    # Extract the named characters and relations from the response
    result = response.json()
    named_characters = result['entities']
    relations = result['relations']
    # Process the named characters and relations as needed
    print(named_characters)
    print(relations)
else:
    print('Error:', response.status_code)


ConnectionError: HTTPConnectionPool(host='localhost', port=8000): Max retries exceeded with url: /api/v1/process (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000002471F7F3C40>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

In [9]:
pip install greynir


Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement greynir (from versions: none)
ERROR: No matching distribution found for greynir


In [10]:
pip install reynir

Collecting reynir
  Downloading reynir-3.5.3-cp39-cp39-win_amd64.whl (427 kB)
Collecting cffi>=1.15.1
  Downloading cffi-1.15.1-cp39-cp39-win_amd64.whl (179 kB)
Collecting tokenizer>=3.4.2
  Downloading tokenizer-3.4.2-py2.py3-none-any.whl (79 kB)
Collecting islenska>=0.4.3
  Downloading islenska-0.4.6-cp39-cp39-win_amd64.whl (45.7 MB)
Installing collected packages: cffi, tokenizer, islenska, reynir
  Attempting uninstall: cffi
    Found existing installation: cffi 1.15.0
    Uninstalling cffi-1.15.0:
      Successfully uninstalled cffi-1.15.0
Successfully installed cffi-1.15.1 islenska-0.4.6 reynir-3.5.3 tokenizer-3.4.2
Note: you may need to restart the kernel to use updated packages.


In [12]:
import reynir
from reynir import Greynir


In [15]:
with open('bnsis.txt', 'r', encoding='utf-8') as file:
    text = file.read()


In [16]:
parser = Greynir()
parsed = parser.parse(text)


In [17]:
named_entities = parsed.named_entities
characters = [ne.text for ne in named_entities if ne.label == 'person']


AttributeError: 'dict' object has no attribute 'named_entities'

In [None]:
relations = parsed.relations
