In [1]:
#from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer
import torch
import os
import numpy as np
from pathlib import Path
import time
import pandas as pd

In [4]:
def naming(full_path):
    """Return the last "/"-separated component of a model path or hub id.

    Used to derive a short folder name from either a local model directory
    ("some/dir/my-model") or a hub identifier ("org/my-model").

    Parameters
    ----------
    full_path : str or path-like
        Model location; converted with ``str()`` before splitting.

    Returns
    -------
    str
        The text after the final "/". Trailing slashes are ignored, so
        "models/bert/" yields "bert" rather than an empty string.
    """
    return str(full_path).rstrip("/").split("/")[-1]

In [5]:
def _pool_tokens(token_vectors, pooling):
    """Collapse a (tokens, hidden) tensor into a single (hidden,) vector."""
    if pooling == "cls":
        return token_vectors[0]  # vector at the first token position (CLS)
    return torch.mean(token_vectors, dim=0)


def text2bert(data_path, local_models_at, huggface_models = [], device = "cpu", pooling = "cls", use_suffix = False):
    """Embed every line of the replacement files with one or more models.

    For each sub-directory ``dwe`` of ``data_path`` and each combination of
    ``ingroup``/``outgroup`` and ``first_round``/``second_round``, the file
    ``<data_path>/<dwe>/<meaning>/<round>/replacements.txt`` is read as a
    tab-separated table whose first column is the index. The first remaining
    column is taken as the text to embed. One vector per row is written to
    ``<...>/vectors/<model name><suffix>/vecs.txt`` as
    ``"<index>\\t<v1> <v2> ...\\n"``.

    Parameters
    ----------
    data_path : str or Path
        Root directory containing one sub-directory per ``dwe``.
        Entries that are not directories are skipped.
    local_models_at : str, Path or None
        Directory whose entries are local model folders, or None to use
        only ``huggface_models``.
    huggface_models : list of str
        Model identifiers passed to ``from_pretrained``. Not modified.
    device : str
        Device the model and the encoded inputs are moved to.
    pooling : {"cls", "avg"}
        "cls" keeps the vector at the first token position, "avg" averages
        the vectors of all token positions.
    use_suffix : bool
        If true, ``-<pooling>`` is appended to the output folder name.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``pooling`` is neither "cls" nor "avg". Checked before any model
        is loaded or any file is written.
    FileNotFoundError
        If an expected ``replacements.txt`` is missing (raised by pandas).
    """
    if pooling not in ("cls", "avg"):
        raise ValueError(f"Unknown pooling method {pooling!r}; expected 'cls' or 'avg'.")

    t0 = time.time()
    # Always defined, so the output path can be built when use_suffix is False.
    suffix = f"-{pooling}" if use_suffix else ""

    data_path = Path(data_path)

    models = list(huggface_models)
    if local_models_at is not None:
        local_models = [f"{local_models_at}/{model}" for model in sorted(os.listdir(local_models_at))]
        models = local_models + models

    # Every (dwe, meaning, round) combination to process; stray files that sit
    # next to the dwe directories are ignored.
    round_keys = [
        (dwe, meaning, rnd)
        for dwe in sorted(os.listdir(data_path))
        if (data_path / dwe).is_dir()
        for meaning in ("ingroup", "outgroup")
        for rnd in ("first_round", "second_round")
    ]

    # The model is the outer loop so each one is loaded once instead of once
    # per directory.
    for model in models:
        tokenizer = AutoTokenizer.from_pretrained(model)
        BERT = AutoModel.from_pretrained(model)
        BERT.to(device)
        BERT.eval()

        for dwe, meaning, rnd in round_keys:
            t = time.time()
            round_dir = data_path / dwe / meaning / rnd

            replacements = pd.read_csv(round_dir / "replacements.txt", sep = "\t", index_col = 0)
            # TODO: remove punctuation? "deportera" vs "deportera."

            path = round_dir / "vectors" / f"{naming(model)}{suffix}"
            path.mkdir(parents=True, exist_ok=True)

            print()
            print(f"{dwe:<15}{meaning:<10}{rnd:<15}{naming(model)}")

            total = len(replacements)
            vectors = []
            rows = zip(replacements.index, replacements.iloc[:, 0])
            for done, (idx, line) in enumerate(rows, start=1):
                # Positional counter: linear time and independent of index labels.
                pcent = round(done / total * 100, 1)
                print(f"{pcent:<10}{int((time.time()-t))} s.", end="\r")

                encoded = tokenizer(line, return_tensors="pt", truncation=True, max_length=512)
                encoded = encoded.to(device)
                with torch.no_grad():
                    output = BERT(**encoded)
                # Index the batch axis explicitly; squeeze() would also drop
                # the token axis for a one-token sequence.
                vector = _pool_tokens(output.last_hidden_state[0], pooling)

                as_str = " ".join(str(value) for value in vector.tolist())
                vectors.append(f"{idx}\t{as_str}\n")

            with open(path / "vecs.txt", mode = "w", encoding = "utf-8") as f:
                f.writelines(vectors)

        # Drop the references so the model can be freed before the next load.
        del tokenizer, BERT

    print()
    print("Done!", int((time.time()-t0)/60), "m.")

In [6]:
# Embed all replacement sentences with the KBLab/bert-base-swedish-cased model
# using average pooling. Fall back to the CPU when no CUDA device is available
# so the cell also runs on machines without a GPU.
text2bert(
    data_path = Path("../data/replacements/data/"), 
    local_models_at = None, 
    huggface_models = ['KBLab/bert-base-swedish-cased'], 
    device = "cuda" if torch.cuda.is_available() else "cpu",
    pooling = "avg", 
    use_suffix = True
)


aterinvandring ingroup   first_round    bert-base-swedish-cased
100.0     7 s.
aterinvandring ingroup   second_round   bert-base-swedish-cased
100.0     12 s.
aterinvandring outgroup  first_round    bert-base-swedish-cased
100.0     10 s.
aterinvandring outgroup  second_round   bert-base-swedish-cased
100.0     29 s.
berikar        ingroup   first_round    bert-base-swedish-cased
100.0     3 s.
berikar        ingroup   second_round   bert-base-swedish-cased
100.0     7 s.
berikar        outgroup  first_round    bert-base-swedish-cased
100.0     13 s.
berikar        outgroup  second_round   bert-base-swedish-cased
100.0     33 s.
forortsgang    ingroup   first_round    bert-base-swedish-cased
100.0     4 s.
forortsgang    ingroup   second_round   bert-base-swedish-cased
100.0     7 s.
forortsgang    outgroup  first_round    bert-base-swedish-cased
100.0     11 s.
forortsgang    outgroup  second_round   bert-base-swedish-cased
100.0     32 s.
globalister    ingroup   first_round    bert