In [1]:
import json
import os

import datasets
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
# A `!VAR=value` line only assigns the variable inside a short-lived subshell,
# so the kernel process (and therefore PyTorch) never sees it. Set it in this
# process's environment instead.
# NOTE(review): the CUDA caching allocator reads this setting when it is
# initialised, so this must run before the first CUDA allocation; confirm
# whether your PyTorch version also needs it set before `import torch`.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [3]:
# Checkpoint name used to load the tokenizer.
# NOTE(review): this says "base" while the classifier path below says
# "xlm-roberta-large" -- presumably both share one tokenizer; confirm.
base_model_name = "xlm-roberta-base"

# Directory of the fine-tuned sequence-classification checkpoint.
model_name = "/scratch/project_2009199/pytorch-registerlabeling/models/xlm-roberta-large/labels_upper/en-fi-fr-sv-tr_en-fi-fr-sv-tr/seed_42/fold_1"

# Run on the first GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Language code: selects the data file, the dataset split and the output file name.
lang = "zh"

In [4]:
# Tokenizer comes from `base_model_name`; the classifier weights come from
# the fine-tuned checkpoint at `model_name` and are moved onto `device`.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
# Bare last expression so the cell still displays the module structure.
model

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, out_fe

In [5]:
#! huggingface-cli login

In [6]:
# Load one file of the register_oscar dataset for `lang`, exposed under a split
# named after the language code.
dataset = datasets.load_dataset(
    "TurkuNLP/register_oscar",
    data_files={lang: f'{lang}/{lang}_00000.jsonl.gz'},
    cache_dir="/scratch/project_2009199/cache",
)



In [7]:

# Keep only the rows whose position is a multiple of 36.
# This rebinds `dataset`, so running the cell again thins the already-thinned
# data a second time.
dataset = dataset.filter(
    lambda _example, position: position % 36 == 0,
    with_indices=True,
)
print(dataset)

Filter:   0%|          | 0/146611 [00:00<?, ? examples/s]

DatasetDict({
    zh: Dataset({
        features: ['id', 'labels', 'text'],
        num_rows: 4073
    })
})


In [8]:
# Label name for each column of the classifier's logits.
# NOTE(review): the order is assumed to match the checkpoint's own label
# mapping -- confirm against model.config.id2label.
labels = np.array(["MT", "LY", "SP", "ID", "NA", "HI", "IN", "OP", "IP"])


def predict(d, lang, threshold=0.5):
    """Classify one tokenized row and mean-pool three hidden-state layers.

    Args:
        d: Dataset row. Reads ``d["encoded"]["input_ids"]`` (a tensor that is
            passed to the model as-is, so it must already carry a batch
            dimension), ``d["id"]`` and ``d["labels"]``.
        lang: Language code, copied unchanged into the result.
        threshold: Probability cutoff; every label whose sigmoid probability
            is strictly greater than this value is predicted.

    Returns:
        A dict with the keys ``id``, ``lang``, ``labels`` (the row's own
        labels), ``preds`` (list of predicted label names as plain ``str``)
        and ``embed_first`` / ``embed_half`` / ``embed_last``: nested lists
        holding the mean over dimension 1 of the first, middle and last
        entries of the model's ``hidden_states`` tuple.
    """
    with torch.no_grad():
        output = model(d["encoded"]["input_ids"].to(device), output_hidden_states=True)

    # The threshold is a probability, so squash the raw logits first. Comparing
    # unnormalised logits with 0.5 silently raises the effective cutoff to
    # about 0.62 and drops borderline labels.
    probs = torch.sigmoid(output["logits"]).cpu()

    # Column index == label index; rows are batch entries.
    hit_columns = torch.nonzero(probs > threshold, as_tuple=True)[1].tolist()
    # Plain `str` keeps the serialised output free of `np.str_(...)` wrappers.
    pred = [str(labels[i]) for i in hit_columns]

    hidden_states = output["hidden_states"]
    layer_ids = (0, len(hidden_states) // 2, -1)
    # Dimension 1 is presumably the token axis, which makes each entry a
    # per-sequence average -- TODO confirm.
    embed = [hidden_states[i].mean(dim=1).cpu().tolist() for i in layer_ids]

    # Release cached GPU blocks after every example.
    torch.cuda.empty_cache()
    return {
        "id": d["id"],
        "lang": lang,
        "labels": d["labels"],
        "preds": pred,
        "embed_first": embed[0],
        "embed_half": embed[1],
        "embed_last": embed[2],
    }


def tokenize(d):
    """Run the notebook-level ``tokenizer`` on one row's text.

    Reads ``d["text"]`` and returns the tokenizer's result, requesting
    PyTorch tensors with truncation enabled.
    """
    text = d["text"]
    encoding = tokenizer(text, truncation=True, return_tensors='pt')
    return encoding


#text = "Moday 20th October a bad car crash on highway 7. More at..."
#text = dataset["en"][0]["text"]
#predict(text)

In [9]:
# Add an "encoded" column holding the tokenizer output for each row, then ask
# the dataset to hand rows back in "torch" format.
dataset = dataset.map(lambda row: {"encoded": tokenize(row)}).with_format("torch")

Map:   0%|          | 0/4073 [00:00<?, ? examples/s]

In [10]:
#dataset = dataset.map(lambda line: predict(line)).remove_columns(["text", "encoded"])
#with open("testi_embeds.jsonl", "w") as outfile:
#    for d in dataset["en"]:
#        outfile.write(json.dumps(predict(d)))
#with open("testi_embeds.jsonl", "w") as file:
#    for d in dataset["en"]:
#        json.dump(predict(d), file)
# One prediction record per row of the selected language split. The list is
# filled incrementally, so records gathered before any failure stay in `results`.
results = []
results.extend(predict(row, lang) for row in dataset[lang])

In [11]:
import pandas as pd

In [12]:
# One row per prediction record; the dict keys become the columns.
df = pd.DataFrame(data=results)

In [13]:
# Write the records as a tab-separated file named after the language code.
output_path = lang + "_embeds.tsv"
df.to_csv(output_path, sep="\t", header=True)