# Import libraries

In [None]:
import pickle
import torch
from transformers import BertModel
from transformers import BertJapaneseTokenizer, BertTokenizer
import random
import pandas as pd
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import random
import numpy as np
from tqdm import tqdm_notebook as tqdm
from tqdm import tqdm
from codecarbon import EmissionsTracker

# Define variables

In [None]:
# Pretrained checkpoint name shared by the classifier and the tokenizer.
MODEL_NAME = "cl-tohoku/bert-base-japanese"

# Padding length used when tokenizing, and the evaluation batch size.
MAX_SEQ_LEN, BATCH_SIZE = 512, 10

# Epoch count; it also selects which saved weight file gets loaded.
EPOCHS = 3

# Seed handed to the Python, NumPy and PyTorch random generators.
SEED = 42

# Directories holding the test CSV and the saved model weights.
DATA_DIR, MODEL_DIR = "test", "models"

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Fix random seeds for reproducibility

In [None]:
# Seed every random number generator in use so repeated runs match.
for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    seed_fn(SEED)

# Seed the CUDA generators too (current device, then all devices).
if torch.cuda.is_available():
    for cuda_seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        cuda_seed_fn(SEED)

# Load model and weights

In [None]:
# Two-class classification config that also returns every layer's hidden
# states; later cells read those hidden states as embeddings.
config = BertConfig.from_pretrained(MODEL_NAME, num_labels=2, output_hidden_states=True)

model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

# map_location remaps the stored tensors onto DEVICE, so a checkpoint that
# was saved on a GPU can still be restored when DEVICE falls back to the CPU.
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
model.load_state_dict(
    torch.load(f"{MODEL_DIR}/model_{EPOCHS}.pth", map_location=DEVICE)
)

model.to(DEVICE)

# Inference only: put the model into evaluation mode.
model.eval()

tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)

# Validate single text

In [None]:
# Sample Japanese input; roughly "I'm being bullied, I really want to die."
text = "いじめられている、もう本当に死にたいんだ。"

In [None]:
# Encode the single sentence with the same settings as the batch encoding
# later in this file: pad up to MAX_SEQ_LEN and truncate anything longer, so
# an over-long text cannot produce a sequence longer than MAX_SEQ_LEN.
inputs = tokenizer.encode_plus(
    text,
    add_special_tokens=True,
    return_attention_mask=True,
    padding="max_length",
    truncation=True,
    max_length=MAX_SEQ_LEN,
    return_tensors="pt",
)

# Move the encoded tensors to the same device as the model.
inputs = inputs.to(DEVICE)

# Forward pass only; no gradients are needed for prediction.
with torch.no_grad():
    outputs = model(**inputs)


In [None]:
# Raw classifier scores (before softmax) for the encoded text.
outputs.logits

In [None]:
# Normalise the logits into per-class probabilities along dimension 1.
softmax = nn.Softmax(dim=1)
pred = softmax(outputs.logits)

In [None]:
# Probability assigned to class index 1 for the first row of the batch.
pred.cpu().numpy()[0][1]

In [None]:
# Second-to-last entry of outputs[1], first sample only.
# NOTE(review): outputs[1] is presumably the hidden-states tuple (hidden
# states are enabled in the config and no loss is returned) - confirm.
outputs[1][-2].cpu().numpy()[0]

# Validate multiple texts

In [None]:
# Load the evaluation set; rows containing any missing value are dropped.
df = pd.read_csv(f"{DATA_DIR}/test.csv", lineterminator="\n").dropna()

# Tokenize every text to a fixed MAX_SEQ_LEN (short texts are padded, long
# ones truncated) so the encodings stack into rectangular tensors. The texts
# are handed over as a plain list rather than a NumPy array.
encoded_data_test = tokenizer.batch_encode_plus(
    df["Text"].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding="max_length",
    max_length=MAX_SEQ_LEN,
    return_tensors="pt",
    truncation=True,
)

input_ids = encoded_data_test["input_ids"]
attention_masks = encoded_data_test["attention_mask"]
labels = torch.tensor(df["Label"].values)

# shuffle=False keeps the batches in the same order as the rows of df.
test_dataset = TensorDataset(input_ids, attention_masks, labels)
test_dataloader = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=BATCH_SIZE,
)

# Pred class

In [None]:
# Run the classifier over every test batch without tracking gradients,
# collecting the raw model output and the matching labels per batch.
pred, true = [], []

with torch.no_grad():
    for batch in test_dataloader:
        # Each batch holds (input ids, attention mask, labels), in that order.
        ids, mask, gold = (tensor.to(DEVICE) for tensor in batch)

        output = model(ids, token_type_ids=None, attention_mask=mask)

        pred.append(output)
        true.append(gold)

# Visualize pred result

In [None]:
# Not called in this cell; kept so the name stays defined for later use.
softmax = nn.Softmax(dim=1)

# Build one frame per batch with explicitly ordered columns. Column names are
# given as dict keys, not set literals: sets are unordered, so the two logit
# columns could be swapped, and recent pandas rejects a set outright.
batch_frames = []
for output, batch_labels in zip(pred, true):
    logits = output[0].cpu().numpy()
    batch_frames.append(
        pd.DataFrame(
            {
                "logits0": logits[:, 0],
                "logits1": logits[:, 1],
                "pred_label": np.argmax(logits, axis=1),
                "true_label": batch_labels.cpu().numpy(),
            }
        )
    )

# Concatenate in batch order so the rows follow the order in which the
# batches were predicted. pd.concat raises ValueError when pred is empty.
df_result = pd.concat(batch_frames, axis=0, ignore_index=True)

In [None]:
# Display the combined per-sample table of logits, predictions and labels.
df_result

In [None]:
from sklearn.metrics import classification_report, mean_absolute_error

# Ground-truth and predicted label columns of the result table.
y_true = df_result["true_label"]
y_pred = df_result["pred_label"]

# Per-class report for the predictions.
print(classification_report(y_true, y_pred))

# Mean absolute difference between the true and predicted label values.
mae = mean_absolute_error(y_true, y_pred)
print('MAE : {:.3f}'.format(mae))

# mapping data

In [None]:
vectors = []

for i in range(len(pred)):
    #print(pred[i][1][-2].cpu().numpy()[0])
    if i == 0:
        vectors = pred[i][1][-2].cpu().numpy()[0].reshape(-1,768)
    else:
        vectors = vectors + pred[i][1][-2].cpu().numpy()[0].reshape(-1,768)

In [None]:
vectors

In [None]:
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE

In [None]:
#t-SNE
tsne = TSNE(n_components=2, random_state=0)

X_tsne = tsne.fit_transform(vectors)

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)

ax.scatter(X_tsne[0:10:, 0], X_tsne[0:10, 1], label='P')
ax.scatter(X_tsne[10:20, 0], X_tsne[10:20, 1], label='N')

ax.set_title('Pred result t-SNE', size=16)

ax.legend(loc='best', fontsize=14)

plt.show()