In [7]:
from requests_html import AsyncHTMLSession
from bs4 import BeautifulSoup
import asyncio
import logging
import json
import re

In [2]:
logging.disable(logging.WARNING)

async def fetch_entry(edition, article, session, debug=False):
    url = f'https://nordiskfamiljebok.dh.gu.se/article/{edition}/{article}'
    
    response = await session.get(url)
    await response.html.arender(sleep=5) 
    
    if debug:
        print(f"Successfully fetched {'1st' if article == 1 else f'{article}nd'} article from {'1st' if edition == 1 else f'{edition}nd'} edition")

    return response.html

def load_html(path='articles_html'):
    try:
        with open(path + '.json', 'r', encoding='utf-8') as file:
            return json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        # If file is not found or is empty/corrupt, create an empty file
        with open(path + '.json', 'w', encoding='utf-8') as file:
            json.dump([], file, ensure_ascii=False, indent=4)
        return []

In [None]:
def load_json(path):
    
    try:
        with open(path + '.json', 'r', encoding='utf-8') as file:
            return json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        with open(path + '.json', 'w', encoding='utf-8') as file:
            json.dump([], file, ensure_ascii=False, indent=4)
        return []

def find_missing_articles(edition, articles, path='articles_html'):

    entries = load_json(path)
    entry_keys = {(entry["edition"], entry["article"]) for entry in entries}

    return { article for article in set(articles) if (edition, article) not in entry_keys}

async def fetch_missing_entries(edition, articles, path='articles_html'):
    
    missing_articles = find_missing_articles(edition, articles, path)
    session = AsyncHTMLSession()
    
    async with asyncio.TaskGroup() as task_group:
        tasks = [task_group.create_task(fetch_entry(edition, article, session)) for article in missing_articles]

    entries = []
    
    for (article, task) in zip(missing_articles, tasks):
        
        html = task.result().html
        soup = BeautifulSoup(html, "html.parser")
        
        entry = {
            'edition': edition,
            'article': article,
            'html': html,
            'text': soup.find('article', {'data-v-5ad7308b': True}).decode_contents()
        }
        
        entries.append(entry)
    
    return entries

async def load_articles(edition, articles, path='articles_html'):
    
    missing_entries = await fetch_missing_entries(edition, articles, path)
    entries = []
    
    with open(path + '.json', 'r', encoding='utf-8') as file:
        entries = load_json(path)
        
    entries.extend(missing_entries)
    
    with open(path + '.json', 'w', encoding='utf-8') as file:
        json.dump(entries, file, ensure_ascii=False, indent=4)
        
    return entries

async def load_segments(edition, articles, root='articles'):
    
    entries = load_json(root + '_segments')
    entry_keys = {(entry["edition"], entry["article"]) for entry in entries}

    loaded_articles = await load_articles(edition, articles, root + '_html')

    print(loaded_articles)
        
edition = 2
articles = range(1, 6)

loaded_articles = await load_articles(edition, articles)
await load_segments(edition, articles)

In [41]:
import unicodedata
import locale

locale.setlocale(locale.LC_COLLATE, 'sv_SE.UTF-8')

def segment_text(text):

    segments = re.split(r"\s*(?:<br\s*/?>)+\s*", text)

    last_valid_segment = None
    last_valid_word = None
    valid_segments = {}

    for segment in segments:
        # segment = unicodedata.normalize('NFC', segment)
        clean_segment = re.sub(r"<.*?>", '', segment).strip().lower()

        try:
            first_word = re.match(r"\b(\w[\w']*)", clean_segment).group(0)  # Match the first word
            first_word_uml = re.sub(r'ii', 'ü', first_word)
        except:
            first_word = first_word_uml = ''

        if last_valid_segment:

            if (last_valid_word < first_word or last_valid_word < first_word_uml
                and (re.match(r"[\w'-]+[\s]*[.,\(\[]", clean_segment))):
                valid_segments[last_valid_word] = last_valid_segment
                last_valid_segment = clean_segment
                last_valid_word = first_word
                #print("NEW -", "HEAD:", first_word, "SEGMENT:", clean_segment)

            else:
                last_valid_segment += '\n' + clean_segment
                #print("OLD -", "HEAD:", first_word, "SEGMENT:", clean_segment)
        else:
            #print("NEW -", "HEAD:", first_word, "SEGMENT:", clean_segment)
            last_valid_segment = clean_segment
            last_valid_word = first_word

    valid_segments[last_valid_word] = last_valid_segment
    
    return valid_segments

In [None]:
entries = []

for article in articles:
    
    text = article['text']
    
    segments = segment_text(text)
    

In [36]:
segments = segment_article(articles[0]['text'])
segments_text = list(segments.values())


NEW - HEAD: tryckluftverktyg SEGMENT: tryckluftverktyg, mek., små af komprimerad luft drifna, för hand manövrerade maskiner, som med fördel användas för ytbearbetning af gjutna metaller. de pneumatiska handhamrarna l. tryckluftmejslarna (se hammare, sp. 1242) äro anordnade på i hufvudsak samma sätt som de mindre bergborrmaskinerna (se borrmaskiner, sp. 1226), men skilja sig från dem därigenom, att skäret har annan form; det utgöres helt enkelt af en rätlinig mot mejselstången vinkelrät egg. dessa tryckluftmejslar begagnas bl. a. vid förberedande bearbetning af stålgöt för bortskaffande af otätheter vid ytan, hvarigenom vid götens påföljande utsmidning eller utvalsning ett tätare fabrikat vinnes. dessa verktyg komma ock till användning vid rensning af gjutgods och tjäna därvid att bortskaffa gjutgrader och andra framspringande ojämnheter. -- ett tryckluftverktyg af helt annat slag är sandblästern (se d. o.). o. e. w.
NEW - HEAD: tryckmaskiner SEGMENT: tryckmaskiner. se tryckpress och ty

In [37]:
for segment in segments.items():
    print(segment, '\n')

('tryckluftverktyg', 'tryckluftverktyg, mek., små af komprimerad luft drifna, för hand manövrerade maskiner, som med fördel användas för ytbearbetning af gjutna metaller. de pneumatiska handhamrarna l. tryckluftmejslarna (se hammare, sp. 1242) äro anordnade på i hufvudsak samma sätt som de mindre bergborrmaskinerna (se borrmaskiner, sp. 1226), men skilja sig från dem därigenom, att skäret har annan form; det utgöres helt enkelt af en rätlinig mot mejselstången vinkelrät egg. dessa tryckluftmejslar begagnas bl. a. vid förberedande bearbetning af stålgöt för bortskaffande af otätheter vid ytan, hvarigenom vid götens påföljande utsmidning eller utvalsning ett tätare fabrikat vinnes. dessa verktyg komma ock till användning vid rensning af gjutgods och tjäna därvid att bortskaffa gjutgrader och andra framspringande ojämnheter. -- ett tryckluftverktyg af helt annat slag är sandblästern (se d. o.). o. e. w.') 

('tryckmaskiner', 'tryckmaskiner. se tryckpress och tygtryck.') 

('tryckmetamorfr

In [72]:
from transformers import AutoModel,AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('KB/bert-base-swedish-cased')
model = AutoModel.from_pretrained('KB/bert-base-swedish-cased')

# Using TF models
#model = AutoModel.from_pretrained('KB/bert-base-swedish-cased')

In [111]:
encoded_text = tokenizer(list(segments.values())[0])
print("Encoding", encoded_text)

tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print("Tokens:", tokens)

print("Toekens to string:", tokenizer.convert_tokens_to_string(tokens))

print("Vocabulary size:", tokenizer.vocab_size)
print("Maxicum context size:", tokenizer.model_max_length)

print("Tokenizer model names", tokenizer.model_input_names)

def tokenize(batch):
    return tokenizer(batch, padding=True, truncation=True)

Encoding {'input_ids': [2, 22279, 20, 495, 2965, 7, 3585, 477, 734, 79, 496, 19, 2965, 7, 177, 5591, 776, 31, 134, 19576, 16096, 447, 7, 126, 6641, 19, 67, 38347, 4815, 39, 1685, 790, 1778, 22279, 328, 140, 19, 137, 5591, 10346, 1968, 171, 19, 3026, 19, 67, 836, 6, 19, 6610, 7025, 24678, 4845, 45976, 5, 31, 440, 264, 10712, 955, 1310, 653, 243, 7310, 36, 11033, 16413, 4652, 19, 181, 48, 82, 17358, 737, 195, 7947, 4815, 6519, 177, 377, 15, 7, 17, 7, 19, 196, 7, 38716, 171, 36, 29465, 1395, 5, 177, 377, 18070, 171, 776, 31, 42074, 63, 25021, 1468, 400, 49808, 170, 100, 21490, 2368, 19, 67, 68, 137, 264, 845, 692, 23472, 372, 140, 244, 3860, 3690, 264, 17185, 40938, 31, 10222, 7744, 6343, 177, 377, 38741, 171, 7, 82, 28302, 2693, 43, 59, 22279, 20, 1258, 54, 97, 29047, 15847, 43926, 19, 1739, 31, 59, 38741, 102, 9311, 32, 6598, 29133, 243, 470, 7505, 49795, 18050, 147, 7, 36554, 21420, 899, 25097, 1756, 54, 100, 59, 5028, 76, 26938, 2412, 390, 76, 15372, 19, 134, 19576, 2784, 102, 1676, 1

In [None]:
encoded_segments = tokenize(segments_text)
encoded_segments

3

In [135]:
from transformers import AutoModelForSequenceClassification

num_labels = 3

model2 = (AutoModelForSequenceClassification
         .from_pretrained('KB/bert-base-swedish-cased', num_labels=num_labels))

In [116]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}


In [145]:
from transformers import Trainer, TrainingArguments

batch_size = 64
logging_steps = len(encoded_segments)# // batch_size
model_name = f"bert-finetuned-segments"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=True, 
                                  log_level="error")

from transformers import Trainer

trainer = Trainer(model=model, args=training_args, 
                  compute_metrics=compute_metrics,
                  train_dataset=segments_encoded["train"],
                  eval_dataset=segments_encoded["validation"],
                  tokenizer=tokenizer)
trainer.train()



ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`