In [1]:
import os
import pandas as pd
import torch
import numpy as np
from torch import nn

from transformers import BertTokenizer
from torch.utils.data import DataLoader, SubsetRandomSampler
from sklearn.model_selection import KFold
from sklearn.metrics import pairwise_distances

import sys
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, 'app')

from utils import set_seed, init_model
from siamese_bert import ContrastiveLoss, SiameseNetWorkSentenceDataset, train_loop_siamese, SiameseBERTNet, pwdist_siamese_bert

2022-02-21 18:26:30.133247: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-21 18:26:30.133264: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# SiameseBERT / BERT classification: predictions on the test set

In [None]:
# Evaluate the trained Siamese BERT checkpoint on the held-out test set.
set_seed(1024)
DEVICE = torch.device("cpu")

# NOTE(review): the directory is named "no-train" although the trained
# best-model checkpoint is loaded below - confirm this is the intended location.
OUT_DIR = os.path.join("data/test/SiameseBERT-no-train")
# exist_ok avoids the check-then-create race of isdir() + makedirs().
os.makedirs(OUT_DIR, exist_ok=True)

TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased')
BATCH_SIZE = 128
MODEL_PARAMS = {'freeze_embedding': True, 'freeze_encoder_layer': 8, 'freeze_cls_pooler': True}

TEST = pd.read_csv("data/test/test_set.csv", index_col=False)
DATATEST = SiameseNetWorkSentenceDataset(data=TEST, tokenizer=TOKENIZER, max_length=64)
# Evaluation gains nothing from shuffling; keeping the CSV row order makes the
# outputs directly comparable with test_set.csv.
TEST_LOADER = DataLoader(DATATEST, BATCH_SIZE, shuffle=False)

print("Compute predictions for best-model on test set")
MODEL = SiameseBERTNet(**MODEL_PARAMS)
# map_location remaps tensors saved from a GPU run onto DEVICE; without it
# torch.load raises on a CPU-only machine when the checkpoint holds CUDA tensors.
MODEL.load_state_dict(
    torch.load("data/SiameseBERT/batch128_m10_training/best-model.pt", map_location=DEVICE)
)
MODEL.to(DEVICE)
# Inference mode (disables dropout); harmless if the helper already does this.
MODEL.eval()
pwdist_siamese_bert(MODEL, TEST_LOADER, DEVICE, OUT_DIR)

In [None]:
from bert_cls_classif import BERTSentencesClassificationDataset, BERTSentencesClassification, bert_sentences_classification_prediction

In [None]:
# Evaluate the trained BERT sentence-pair classifier on the held-out test set.
set_seed(1024)
DEVICE = torch.device("cpu")

OUT_DIR = os.path.join("data/test/BERT-classif")
# exist_ok avoids the check-then-create race of isdir() + makedirs().
os.makedirs(OUT_DIR, exist_ok=True)

TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased')
BATCH_SIZE = 128
MODEL_PARAMS = {'freeze_embedding': True, 'freeze_encoder_layer': 8, 'freeze_cls_pooler': True}

TEST = pd.read_csv("data/test/test_set.csv", index_col=False)
DATATEST = BERTSentencesClassificationDataset(data=TEST, tokenizer=TOKENIZER, max_length=64)
# Evaluation gains nothing from shuffling; keeping the CSV row order makes the
# outputs directly comparable with test_set.csv.
TEST_LOADER = DataLoader(DATATEST, BATCH_SIZE, shuffle=False)

print("Compute predictions for best-model on test set")
MODEL = BERTSentencesClassification(**MODEL_PARAMS)
# map_location remaps tensors saved from a GPU run onto DEVICE; without it
# torch.load raises on a CPU-only machine when the checkpoint holds CUDA tensors.
MODEL.load_state_dict(
    torch.load("data/BERTSentenseClassification/batch128/best-model.pt", map_location=DEVICE)
)
MODEL.to(DEVICE)
# Inference mode (disables dropout); harmless if the helper already does this.
MODEL.eval()
bert_sentences_classification_prediction(MODEL, TEST_LOADER, DEVICE, OUT_DIR)

# Prepare Siamese BERT result for PCA analysis

In [None]:
from siamese_bert import SiameseNetWorkSentenceDataset, SiameseBERTNet


def test_pca(model, test, device, out_dir, tokenizer):
    
    out_dim = model.hidden_size

    # set the model to eval:
    model.eval()

    emb_q1 = np.array([]).reshape(0, out_dim)
    emb_q2 = np.array([]).reshape(0, out_dim)
    y_list = []
    s1 = []
    s2 = []
    # Iterate over testing batchs
    for step, batch in enumerate(test):

        # Get batch data
        input_ids_q1 = batch[0]['input_ids'].to(device)
        attention_mask_q1 = batch[0]['attention_mask'].to(device)
        input_ids_q2 = batch[1]['input_ids'].to(device)
        attention_mask_q2 = batch[1]['attention_mask'].to(device)
        y = batch[2].to(device)

        # Apply model
        out_q1, out_q2 = model(input_ids_q1, attention_mask_q1, input_ids_q2, attention_mask_q2)
        
        s1 += [tokenizer.decode(input_ids, skip_special_tokens=True) for input_ids in input_ids_q1]
        s2 += [tokenizer.decode(input_ids, skip_special_tokens=True) for input_ids in input_ids_q2]

        emb_q1 = np.concatenate([emb_q1, out_q1.detach().numpy()])
        emb_q2 = np.concatenate([emb_q2, out_q2.detach().numpy()])

        y_list += y.tolist()

    pd.DataFrame(emb_q1).to_csv(os.path.join(out_dir, "emb_q1.csv"), index=False, header=False)
    pd.DataFrame(emb_q2).to_csv(os.path.join(out_dir, "emb_q2.csv"), index=False, header=False)
    pd.DataFrame({"y": y_list}).to_csv(os.path.join(out_dir, "y.csv"), index=False, sep =',')
    pd.DataFrame({"s1": s1, "s2": s2}).to_csv(os.path.join(out_dir, "setences.csv"), index=False, sep ='\t')

# Dump embeddings of 10 duplicate and 10 non-duplicate test pairs, once with
# the freshly constructed model and once with the trained checkpoint, for PCA.
SEED = 1024  # same value the prediction cells pass to set_seed
set_seed(SEED)

DEVICE = torch.device("cpu")
TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased')
BATCH_SIZE=128

test_set = pd.read_csv("data/test/test_set.csv", index_col=False)
# random_state pins the drawn examples so a kernel restart reproduces the same files.
pos_eg = test_set[test_set["is_duplicate"] == 1].sample(frac=1, random_state=SEED).head(n=10)
neg_eg = test_set[test_set["is_duplicate"] == 0].sample(frac=1, random_state=SEED).head(n=10)
new_test_set = pd.concat([pos_eg, neg_eg], ignore_index=True)

DATATEST = SiameseNetWorkSentenceDataset(data=new_test_set, tokenizer=TOKENIZER, max_length=64)
# The same loader feeds both models: without shuffling, row i of the "no-train"
# and "train" dumps refers to the same question pair.
LOADER = DataLoader(DATATEST, BATCH_SIZE, shuffle=False)

NO_TRAIN_DIR = "data/SiameseBERT/batch128_m10_training/pca-data/no-train"
TRAIN_DIR = "data/SiameseBERT/batch128_m10_training/pca-data/train"
# to_csv does not create missing directories, so make sure both targets exist.
os.makedirs(NO_TRAIN_DIR, exist_ok=True)
os.makedirs(TRAIN_DIR, exist_ok=True)

# Baseline: model as constructed, no checkpoint loaded.
MODEL = SiameseBERTNet()
MODEL.to(DEVICE)
test_pca(MODEL, LOADER, DEVICE, NO_TRAIN_DIR, TOKENIZER)


# Trained model: same architecture with the best checkpoint loaded.
MODEL = SiameseBERTNet()
# map_location remaps tensors saved from a GPU run onto DEVICE.
MODEL.load_state_dict(
    torch.load("data/SiameseBERT/batch128_m10_training/best-model.pt", map_location=DEVICE)
)
MODEL.to(DEVICE)
test_pca(MODEL, LOADER, DEVICE, TRAIN_DIR, TOKENIZER)

# Test neighbourhood: pairwise distances between question embeddings

In [None]:
def test_output_vectors(model, test, device, out_dir, tokenizer):
    
    out_dim = model.hidden_size

    # set the model to eval:
    model.eval()

    emb_q1 = np.array([]).reshape(0, out_dim)
    emb_q2 = np.array([]).reshape(0, out_dim)
    y_list = []
    s1 = []
    s2 = []
    # Iterate over testing batchs
    for step, batch in enumerate(test):

        print(f"step: {step +1}")

        # Get batch data
        input_ids_q1 = batch[0]['input_ids'].to(device)
        attention_mask_q1 = batch[0]['attention_mask'].to(device)
        input_ids_q2 = batch[1]['input_ids'].to(device)
        attention_mask_q2 = batch[1]['attention_mask'].to(device)
        y = batch[2].to(device)

        # Apply model
        out_q1, out_q2 = model(input_ids_q1, attention_mask_q1, input_ids_q2, attention_mask_q2)
        
        s1 += [tokenizer.decode(input_ids, skip_special_tokens=True) for input_ids in input_ids_q1]
        s2 += [tokenizer.decode(input_ids, skip_special_tokens=True) for input_ids in input_ids_q2]

        emb_q1 = np.concatenate([emb_q1, out_q1.detach().numpy()])
        emb_q2 = np.concatenate([emb_q2, out_q2.detach().numpy()])

        y_list += y.tolist()
    
    all_questions_emb = np.concatenate([emb_q1, emb_q2])
    all_sentences = s1 + s2

    return all_questions_emb, all_sentences, y_list


# Embed 1000 randomly drawn test pairs with the trained model and store the
# pairwise euclidean distances between all 2000 question embeddings.
SEED = 1024  # same value the prediction cells pass to set_seed
set_seed(SEED)

test_set = pd.read_csv("data/test/test_set.csv", index_col=False)
# random_state pins the drawn pairs so a kernel restart reproduces the same files.
new_test_set = test_set.sample(frac=1, random_state=SEED).head(n=1000).reset_index(drop=True)

DEVICE = torch.device("cpu")
TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased')
BATCH_SIZE=128
DATATEST = SiameseNetWorkSentenceDataset(data=new_test_set, tokenizer=TOKENIZER, max_length=64)
# Inference only: no shuffling, so the outputs follow the order of new_test_set.
LOADER = DataLoader(DATATEST, BATCH_SIZE, shuffle=False)

out_dir = "data/SiameseBERT/batch128_m10_training/neighbourhood_data/n1000"
# to_csv does not create missing directories.
os.makedirs(out_dir, exist_ok=True)

MODEL = SiameseBERTNet()
# map_location remaps tensors saved from a GPU run onto DEVICE.
MODEL.load_state_dict(
    torch.load("data/SiameseBERT/batch128_m10_training/best-model.pt", map_location=DEVICE)
)
MODEL.to(DEVICE)
# test_output_vectors ignores its out_dir argument; the real target is passed
# rather than the unrelated PCA directory to avoid confusion.
emb_matrix, all_s, y = test_output_vectors(MODEL, LOADER, DEVICE, out_dir, TOKENIZER)

pwdist = pairwise_distances(emb_matrix, metric='euclidean')

# NOTE: despite its name, emb_questions.csv holds the pairwise distance matrix,
# not the raw embeddings; the name is kept for downstream consumers.
pd.DataFrame(pwdist).to_csv(os.path.join(out_dir, "emb_questions.csv"), index=False, header=False)
pd.DataFrame({"question": all_s}).to_csv(os.path.join(out_dir, "sentences.csv"), index=False, sep ='\t')
# y has one entry per pair (1000 rows) while sentences.csv has one per question (2000 rows).
pd.DataFrame({"y": y}).to_csv(os.path.join(out_dir, "y.csv"), index=False, sep =',')
