In [1]:
from transformers import AutoModel, AutoTokenizer, BertModel, BertTokenizer
from codalab_utils.get_names import Names
from codalab_utils.get_data import GetData
from tqdm import tqdm
import torch

In [2]:
names = Names()
device = torch.device('cuda')

In [3]:
# model_name = names.select_bert_model()
saved_model = names.models_path + names.select_saved_model('hf')

	0 : polyhope_spanish_multiclass_bc
	1 : pret_bert_cased_noemoji
	2 : poly_eng_noemoji_bbc
	3 : polyhope_spanish_multiclass_bc_2
	4 : beto_uncased_multi_noemoji


Select model :  2


In [4]:
model = AutoModel.from_pretrained(saved_model).to(device)
tokenizer = AutoTokenizer.from_pretrained(saved_model)

Some weights of BertModel were not initialized from the model checkpoint at /home/coep/general/bert/codalab/models/poly_eng_noemoji_bbc and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
file_name = names.train_path + names.select_file('train')
data = GetData(file_name, tokenizer)
text_dataloader = data.get_text_dataloader()

	0 : hopeedi_train.csv
	1 : train_polyhope_spanish_cleaned.csv
	2 : train_polyhope_spanish_cleaned_noemoji.csv
	3 : train_polyhope_english_cleaned.csv
	4 : train_polyhope_spanish.csv
	5 : train_polyhope_english.csv
	6 : train_polyhope_english_cleaned_noemoji.csv


Select train file :  6


In [6]:
embeds = []
model.eval()
with torch.no_grad():
    for batch in tqdm(text_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # embeds.extend(torch.mean(outputs.last_hidden_state, dim=1))
        # print(outputs.last_hidden_state[0].shape)
        # print(torch.mean(outputs.last_hidden_state, dim=1).shape)
        embeds.extend(outputs.pooler_output)
        # print(len(outputs.pooler_output))


100%|█████████████████████████████████████████| 387/387 [02:56<00:00,  2.20it/s]


In [13]:
res = torch.stack(embeds, dim=0)
res.shape

torch.Size([6192, 768])

In [14]:
test_data = GetData(file_name.replace('train', 'test'), tokenizer)
test_dataloader = test_data.get_text_dataloader()

test_embeds = []
model.eval()
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # test_embeds.extend(torch.mean(outputs.last_hidden_state, dim=1))
        test_embeds.extend(outputs.pooler_output)

res_test = torch.stack(test_embeds, dim=0)

100%|███████████████████████████████████████████| 65/65 [00:28<00:00,  2.31it/s]


In [15]:
from sklearn.svm import SVC
import pandas as pd
import numpy as np

In [16]:
df_test = pd.read_csv(file_name.replace('train', 'test'))
test_bin = df_test['binary'].tolist()
test_mul = df_test['multiclass'].tolist()

df_train = pd.read_csv(file_name)
train_bin = df_train['binary'].tolist()
train_mul = df_train['multiclass'].tolist()

polyhope_binary_labels = {'Hope':1, 'Not Hope':0}
polyhope_inv_binary_labels = {v: k for k, v in polyhope_binary_labels.items()}

polyhope_multi_labels = {'Not Hope':0, 'Generalized Hope':1, 'Realistic Hope':2, 'Unrealistic Hope':3}
polyhope_inv_multi_labels = {v: k for k, v in polyhope_multi_labels.items()}

y_bin_train = [polyhope_binary_labels[i] for i in train_bin]
y_mul_train = [polyhope_multi_labels[i] for i in train_mul]

y_bin_test = [polyhope_binary_labels[i] for i in test_bin]
y_mul_test = [polyhope_multi_labels[i] for i in test_mul]

In [17]:
X = res.cpu().numpy()

clf = SVC()
clf.fit(X, y_mul_train)

In [18]:
clf.score(res_test.cpu().numpy(), y_mul_test)

0.6162790697674418