In [1]:
from transformers import T5TokenizerFast, T5EncoderModel
import torch
import os
import numpy as np
from pathlib import Path
import time
import pandas as pd

In [2]:
def naming(full_path):
    return full_path.split("/")[-1]

In [3]:
def text2t5(data_path, local_models_at, huggface_models = [], device = "cpu"):
    
    t0 = time.time()
    
    data_path = Path(data_path)

    if local_models_at == None:
        models = huggface_models
    else:
        models_path = Path(local_models_at)
        models = [f"{local_models_at}/{model}" for model in os.listdir(local_models_at)] + huggface_models
    
    for dwe in os.listdir(data_path):
        for meaning in ["ingroup", "outgroup"]:
            for rnd in ["first_round", "second_round"]:
                isExist = os.path.exists(data_path / dwe / meaning / rnd / "vectors")
                if not isExist:
                    os.makedirs(data_path / dwe / meaning / rnd / "vectors")   
                
                replacements = pd.read_csv(data_path / dwe / meaning / rnd / "replacements.txt", sep = "\t", index_col = 0) 
                # remove punctuation? "deportera" vs "deportera."
                
                for model in models:
                    t = time.time()
                    path = data_path / dwe / meaning / rnd / "vectors" / naming(model)
                    
                    isExist = os.path.exists(path)
                    if not isExist:
                        os.makedirs(path)   
                    print()
                    print(f"{dwe:<15}{meaning:<10}{rnd:<15}{naming(model)}")

                    ########################################################
                    tokenizer = T5TokenizerFast.from_pretrained(model, model_max_length=512)
                    T5 = T5EncoderModel.from_pretrained(model)
                    T5.to(device)
                    ########################################################
                    vectors = []
                    for idx, line in zip(replacements.index, replacements.iloc[:,0]):
                        pcent = round((len(replacements.loc[:idx]) / len(replacements)) * 100, 1)
                        print(f"{pcent:<10}{int((time.time()-t))} s.", end="\r")
                        ##########################################################################
                        encoded = tokenizer.encode_plus(line, return_tensors="pt", truncation=True, max_length=512)
                        encoded.to(device)

                        with torch.no_grad():
                            output = T5.encoder(
                                input_ids=encoded["input_ids"], 
                                attention_mask=encoded["attention_mask"], 
                                return_dict=True
                            )
                    
                        last_hidden = output.last_hidden_state.squeeze()
                        vector = torch.mean(last_hidden, dim=0) # TAKE THE MEAN OF ALL INPUTS OF LAST LAYER ... see paper by Ni et al. 2021 "Sentence-T5"
                        ##########################################################################
                        as_str = " ".join([str(value) for value in vector.tolist()])
                        vectors.append(f"{idx}\t{as_str}\n")
                    
                    with open(path / "vecs.txt", mode = "w") as f:
                        for vec in vectors:
                            f.write(vec)
    
    print()
    t = time.time()
    print("Done!", int((t-t0)/60), "m.", int((t-t0)%60), "s.")

In [4]:
text2t5(
    data_path = Path("../data/replacements/data/"), 
    local_models_at = None, 
    huggface_models = ["google-t5/t5-3b"], # ["google-t5/t5-small"] ['KBLab/bert-base-swedish-cased'], 
    device = "cuda"
)


aterinvandring ingroup   first_round    t5-3b
100.0     19 s.
aterinvandring ingroup   second_round   t5-3b
100.0     29 s.
aterinvandring outgroup  first_round    t5-3b
100.0     29 s.
aterinvandring outgroup  second_round   t5-3b
100.0     77 s.
berikar        ingroup   first_round    t5-3b
100.0     11 s.
berikar        ingroup   second_round   t5-3b
100.0     16 s.
berikar        outgroup  first_round    t5-3b
100.0     34 s.
berikar        outgroup  second_round   t5-3b
100.0     90 s.
forortsgang    ingroup   first_round    t5-3b
100.0     12 s.
forortsgang    ingroup   second_round   t5-3b
100.0     23 s.
forortsgang    outgroup  first_round    t5-3b
100.0     33 s.
forortsgang    outgroup  second_round   t5-3b
100.0     87 s.
globalister    ingroup   first_round    t5-3b
100.0     7 s.
globalister    ingroup   second_round   t5-3b
100.0     12 s.
globalister    outgroup  first_round    t5-3b
100.0     30 s.
globalister    outgroup  second_round   t5-3b
100.0     83 s.
Done! 10

In [5]:
#where_my_models_are = "/home/max/Models/sentenceBERT"

In [6]:
# text2bert(
#     data_path = Path("/home/max/Results/rplc_globalist_2nd_round/data"), 
#     local_models_at = where_my_models_are, 
#     huggface_models = ["KBLab/sentence-bert-swedish-cased"], 
# )