In [1]:
#from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer
import torch
import os
import numpy as np
from pathlib import Path
import time
import pandas as pd

In [2]:
#os.listdir(Path("/home/max/Models/sentenceBERT"))

In [3]:
#os.listdir("/home/max/Results/replacements/data/") + []

In [4]:
def naming(full_path):
    """Return the last "/"-separated component of a model path or hub id.

    The result is used as the name of the per-model output directory, e.g.
    "KBLab/bert-base-swedish-cased" -> "bert-base-swedish-cased".

    Parameters
    ----------
    full_path : str or path-like
        A local model directory or a "<organisation>/<model>" identifier.

    Returns
    -------
    str
        The final path component; an empty string only if the input holds
        nothing but slashes (or is empty).
    """
    # str() lets pathlib.Path objects through. Trailing slashes are stripped
    # first: without that, "models/foo/" would yield "" and every such model
    # would end up sharing the same output directory.
    return str(full_path).rstrip("/").split("/")[-1]

In [5]:
def text2bert(data_path, local_models_at, huggface_models = [], device = "cpu"):
    """Encode every replacement text with each model and save first-token vectors.

    Directory layout read below ``data_path`` (one sub-directory per term)::

        <data_path>/<term>/{ingroup,outgroup}/{first_round,second_round}/replacements.txt

    Each ``replacements.txt`` is read as a tab-separated table whose first
    column is the index; the texts are taken from the first remaining column.
    For every model the vectors are written to
    ``<round dir>/vectors/<naming(model)>/vecs.txt``, one line per text:
    the index, a tab, then the space-separated vector components.

    Parameters
    ----------
    data_path : str or Path
        Root directory holding the per-term sub-directories.
    local_models_at : str or None
        Directory whose entries are each loaded as a local model; ``None`` to
        use only ``huggface_models``.
    huggface_models : list of str
        Additional model identifiers handed to ``from_pretrained``. They are
        processed after the local models.
    device : str
        Torch device on which the models run.

    Notes
    -----
    Each model is loaded once and then applied to all directories. With
    several models the progress output is therefore grouped by model; the
    files written are the same as when iterating directory-first.
    """
    t0 = time.time()

    data_path = Path(data_path)

    # Copy so that neither the caller's list nor the shared default is aliased.
    models = list(huggface_models)
    if local_models_at is not None:
        local_models = [f"{local_models_at}/{model}" for model in sorted(os.listdir(local_models_at))]
        models = local_models + models

    # Pass 1: collect all inputs up front. A missing replacements.txt then
    # fails before any (slow) model is loaded.
    jobs = []
    for dwe in sorted(os.listdir(data_path)):
        # Skip stray files (e.g. OS metadata); only directories hold terms.
        if not (data_path / dwe).is_dir():
            continue
        for meaning in ["ingroup", "outgroup"]:
            for rnd in ["first_round", "second_round"]:
                rnd_dir = data_path / dwe / meaning / rnd
                os.makedirs(rnd_dir / "vectors", exist_ok=True)

                replacements = pd.read_csv(rnd_dir / "replacements.txt", sep = "\t", index_col = 0)
                # TODO: remove punctuation? "deportera" vs "deportera."
                jobs.append((dwe, meaning, rnd, rnd_dir, replacements))

    # Pass 2: load each model exactly once instead of once per directory.
    for model in models:
        tokenizer = AutoTokenizer.from_pretrained(model)
        BERT = AutoModel.from_pretrained(model)
        BERT.to(device)
        BERT.eval()  # inference only; make sure dropout is disabled

        for dwe, meaning, rnd, rnd_dir, replacements in jobs:
            t = time.time()
            path = rnd_dir / "vectors" / naming(model)
            os.makedirs(path, exist_ok=True)

            print()
            print(f"{dwe:<15}{meaning:<10}{rnd:<15}{naming(model)}")

            total = len(replacements)
            vectors = []
            rows = zip(replacements.index, replacements.iloc[:, 0])
            # Count rows positionally: label slicing per row is quadratic and
            # can fail when index labels repeat.
            for done, (idx, line) in enumerate(rows, start=1):
                pcent = round((done / total) * 100, 1)
                print(f"{pcent:<10}{int((time.time()-t))} s.", end="\r")

                # Inputs longer than 512 tokens are truncated.
                encoded = tokenizer(line, return_tensors="pt", truncation=True, max_length=512)
                encoded = encoded.to(device)
                with torch.no_grad():
                    output = BERT(**encoded)
                # Batch item 0, token 0 (the [CLS] position for BERT-style
                # tokenizers). Explicit indexing avoids squeeze(), which would
                # also drop a length-1 sequence dimension.
                vector = output.last_hidden_state[0, 0]

                as_str = " ".join(str(value) for value in vector.tolist())
                vectors.append(f"{idx}\t{as_str}\n")

            with open(path / "vecs.txt", mode = "w") as f:
                f.writelines(vectors)

        # Drop the references before the next model is loaded so two models
        # are not kept alive at the same time.
        del BERT, tokenizer

    print()
    print("Done!", int((time.time()-t0)/60), "m.")

In [6]:
# Alternative Hugging Face model ids:
#   'KB/bert-base-swedish-cased'
#   OR: 'KBLab/bert-base-swedish-cased'
# Windows path noted by the author (presumably an alternative data location -- confirm):
#   C:\Users\xbohma\Desktop\work\work\data\replacments\data
#
# Evaluation of Swedish language models:
# https://kb-labb.github.io/posts/2022-03-16-evaluating-swedish-language-models/

# Use the GPU when one is visible and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA instead of failing on device="cuda".
text2bert(
    data_path = Path("../data/replacements/data/"), 
    local_models_at = None, 
    huggface_models = ["KBLab/megatron-bert-large-swedish-cased-110k"], # ['KBLab/bert-base-swedish-cased'], 
    device = "cuda" if torch.cuda.is_available() else "cpu"
)


aterinvandring ingroup   first_round    megatron-bert-large-swedish-cased-110k
100.0     11 s.
aterinvandring ingroup   second_round   megatron-bert-large-swedish-cased-110k
100.0     20 s.
aterinvandring outgroup  first_round    megatron-bert-large-swedish-cased-110k
100.0     18 s.
aterinvandring outgroup  second_round   megatron-bert-large-swedish-cased-110k
100.0     55 s.
berikar        ingroup   first_round    megatron-bert-large-swedish-cased-110k
100.0     5 s.
berikar        ingroup   second_round   megatron-bert-large-swedish-cased-110k
100.0     10 s.
berikar        outgroup  first_round    megatron-bert-large-swedish-cased-110k
100.0     24 s.
berikar        outgroup  second_round   megatron-bert-large-swedish-cased-110k
100.0     66 s.
forortsgang    ingroup   first_round    megatron-bert-large-swedish-cased-110k
100.0     5 s.
forortsgang    ingroup   second_round   megatron-bert-large-swedish-cased-110k
100.0     15 s.
forortsgang    outgroup  first_round    megatron-be

In [7]:
#where_my_models_are = "/home/max/Models/sentenceBERT"

In [8]:
# text2bert(
#     data_path = Path("/home/max/Results/rplc_globalist_2nd_round/data"), 
#     local_models_at = where_my_models_are, 
#     huggface_models = ["KBLab/sentence-bert-swedish-cased"], 
# )