In [1]:
from sentence_transformers import SentenceTransformer
from utils import *
import os
from copy import deepcopy
from sklearn.metrics import accuracy_score
from torch.nn import functional as F

VALID_DATASETS = ["20ng", "ohsumed", "R8", "R52", "mr", "SST1", "SST2", "TREC", "WebKB"]

  from tqdm.autonotebook import tqdm, trange


In [2]:
def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [3]:
set_seed(42)

In [4]:
def load_configs(dataset):
    if dataset == "R8":
        output_dim = 8
    elif dataset == "R52":
        output_dim = 52
    elif dataset == "ohsumed":
        output_dim = 23
    elif dataset == "mr":
        output_dim = 2
    elif dataset == "TREC":
        output_dim = 6
    elif dataset == "WebKB":
        output_dim = 8
    elif dataset == "SST1":
        output_dim = 5
    elif dataset == "SST2":
        output_dim = 2
    elif dataset == "20ng":
        output_dim = 20

    return output_dim

In [5]:
data_dir = "data/"
dataset_name = "TREC"
bsz = 64
x_dim, hidden_dim, out_dim = 1024, 256, load_configs(dataset_name)


In [6]:
def get_data(dataset_name):
    cleaned_dir = "data/cleaned/"
    train_test_info_dir = "data/train_test_info/"
    if dataset_name not in VALID_DATASETS:
        raise Exception(
            "dataset not valid.\nsupported datasets {accepted_datasets}"
        )

    # Read dataset and embedding files
    cleaned_dataset = dataset_name + ".clean"
    dataset = read_file(cleaned_dir, cleaned_dataset)
    train_test_info = read_file(train_test_info_dir, dataset_name)

    return dataset, train_test_info

In [7]:
cur_graph_dir = os.path.join(data_dir, dataset_name)
dataset, train_test_info = get_data(dataset_name)

In [8]:
# check if splits are cached
cur_graph_dir = f"{data_dir}graphs/{dataset_name}/{dataset_name}"
train_ids_filename = cur_graph_dir + ".train_ids"
test_ids_filename = cur_graph_dir + ".test_ids"
cached_ids_available = exists(train_ids_filename) and exists(test_ids_filename)

# Get training and test information
doc_name_list = []
doc_train_list = []
doc_test_list = []
for tti in train_test_info:
    doc_name_list.append(tti.strip())

    if not cached_ids_available:
        # if splits are not cached -> gotta build from scratch
        temp = tti.split()
        if temp[1].find("train") != -1:
            doc_train_list.append(tti.strip())
        if temp[1].find("test") != -1:
            doc_test_list.append(tti.strip())

if cached_ids_available:
    # If cached files are available -> load them
    # to avoid different splits for the same dataset
    print("loading cached id files to build graph...")
    with open(train_ids_filename, "rb") as f:
        train_ids = pkl.load(f)
    with open(test_ids_filename, "rb") as f:
        test_ids = pkl.load(f)
else:
    print("missing id file(s). building from scratch...")
    train_ids = []
    for train_name in doc_train_list:
        train_id = doc_name_list.index(train_name)
        train_ids.append(train_id)
    random.shuffle(train_ids)

    test_ids = []
    for test_name in doc_test_list:
        test_id = doc_name_list.index(test_name)
        test_ids.append(test_id)
    random.shuffle(test_ids)

    # caching ids
    with open(train_ids_filename, "wb") as f:
        pkl.dump(train_ids, f)
    with open(test_ids_filename, "wb") as f:
        pkl.dump(test_ids, f)

# shuffle dataset
ids = train_ids + test_ids
shuffled_doc_name_list = []
shuffled_dataset = []
for id in ids:
    shuffled_doc_name_list.append(doc_name_list[int(id)])
    shuffled_dataset.append(dataset[int(id)])

# Get labels
y_unmapped = []
for doc_meta in shuffled_doc_name_list:
    temp = doc_meta.split("\t")
    y_unmapped.append(temp[2])
y_map = {label:i for i, label in enumerate(set(y_unmapped))}

y = []
for label in y_unmapped:
    y.append(y_map[label])

train_size = len(train_ids)
val_size = int(0.1 * train_size)
real_train_size = train_size - val_size

ids = train_ids + test_ids
masks_train = ids[0:real_train_size]
masks_val = ids[real_train_size:real_train_size+val_size]
masks_test = ids[train_size:]

loading cached id files to build graph...


In [9]:
assert len(shuffled_dataset) == len(y)

In [10]:
sbert = SentenceTransformer("intfloat/multilingual-e5-large", device="cuda:5")

In [11]:
embs = []
for doc in shuffled_dataset:
    embs.append(sbert.encode(doc))
assert len(embs) == len(shuffled_dataset)
embs = torch.tensor(np.array(embs))
y = torch.tensor(y, dtype=torch.long)

In [12]:
from torch.utils.data import Dataset
class CustomDataset(Dataset):
    def __init__(self, list1, list2):
        assert len(list1) == len(list2)
        self.list1 = list1
        self.list2 = list2

    def __len__(self):
        return len(self.list1)

    def __getitem__(self, idx):
        return self.list1[idx], self.list2[idx]

In [13]:
train_embs = embs[masks_train]
train_y = y[masks_train]
train_data = CustomDataset(train_embs, train_y)

val_embs = embs[masks_val]
val_y = y[masks_val]
val_data = CustomDataset(val_embs, val_y)

test_embs = embs[masks_test]
test_y = y[masks_test]
test_data = CustomDataset(test_embs, test_y)

In [14]:
from torch import nn
class MLP(nn.Module):
    def __init__(self, x_dim, hidden_dim, out_dim):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(x_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_dim)
        )

    def forward(self, x):
        return self.layers(x)
    
mlp = MLP(x_dim, hidden_dim, out_dim)

In [15]:
from torch.utils.data.dataloader import DataLoader
train_loader = DataLoader(train_data, batch_size=bsz)
val_loader = DataLoader(val_data, batch_size=bsz)
test_loader = DataLoader(test_data, batch_size=bsz)

In [16]:
len(train_loader), len(val_loader), len(test_loader)

(100, 12, 56)

In [17]:
def train(model, train_loader, optimizer, device):
    model.train()
    model.to(device)
    
    total_examples, total_loss = 0, 0
    y_true, y_pred = [], []
    for inputs, labels in train_loader:
        batch_size = len(labels)
        
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        loss = F.cross_entropy(outputs, labels)
        
        optimizer.zero_grad()    
        loss.backward()
        optimizer.step()
        
        preds = F.softmax(outputs, dim=-1)
        preds = preds.argmax(dim=-1)
        y_pred.extend(preds.cpu().numpy())
        y_true.extend(labels.cpu().numpy())

        total_examples += batch_size
        total_loss += float(loss.item()) * batch_size
    train_loss = total_loss / total_examples
    train_acc = accuracy_score(y_true, y_pred)

    return train_loss, train_acc

In [18]:
def evaluate(model, eval_loader, device):
    model.eval()
    model.to(device)
    
    total_examples, total_loss = 0, 0
    y_true, y_pred = [], []
    with torch.no_grad():
        for inputs, labels in eval_loader:
            batch_size = len(labels)

            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = F.cross_entropy(outputs, labels)

            preds = F.softmax(outputs, dim=-1)
            preds = preds.argmax(dim=-1)
            y_pred.extend(preds.cpu().numpy())
            y_true.extend(labels.cpu().numpy())

            total_examples += batch_size
            total_loss += float(loss.item()) * batch_size
        eval_loss = total_loss / total_examples
        eval_acc = accuracy_score(y_true, y_pred)
    
    return eval_loss, eval_acc

In [19]:
def experiment(mlp, train_loader, val_loader, test_loader):
    device = "cuda:5"
    opt = torch.optim.Adam(mlp.parameters(), lr = 0.001)
    patience, max_epochs = 30, 200
    
    patience_count, best_loss = 0, 10**10
    for epoch in range(1, max_epochs + 1):
        if patience_count == patience:
            break

        loss, acc = train(mlp, train_loader, opt, device)
        val_loss, val_acc = evaluate(mlp, val_loader, device)

        if epoch == 1 or epoch % 10 == 0:
            print(
                f"Epoch {epoch}, Loss: {loss:.6f}, Acc: {acc:.4f}, "
                f"Val Loss: {val_loss:.6f}, Val Acc: {val_acc:.4f}"
            )

        if val_loss < best_loss:
            best_loss = val_loss
            best_model = deepcopy(mlp.state_dict())
        else:
            patience_count += 1

    mlp.load_state_dict(best_model)
    test_loss, test_acc = evaluate(mlp, test_loader, device)
    
    return test_loss, test_acc

In [20]:
n_runs = 10
test_accs = []
for run in range(1, n_runs + 1):
    test_loss, test_acc = experiment(mlp, train_loader, val_loader, test_loader)
    test_accs.append(test_acc)

    print(f"Run: {run}, Test Loss: {test_loss}, Test Acc: {test_acc}")

import numpy as np
print(f"{np.mean(test_accs) * 100:.2f}$\pm${np.std(test_accs) * 100:.2f}")

Epoch 1, Loss: 0.419979, Acc: 0.8228, Val Loss: 0.286761, Val Acc: 0.8775
Epoch 10, Loss: 0.246713, Acc: 0.8997, Val Loss: 0.282993, Val Acc: 0.8831
Epoch 20, Loss: 0.208085, Acc: 0.9206, Val Loss: 0.305821, Val Acc: 0.8789
Epoch 30, Loss: 0.181538, Acc: 0.9283, Val Loss: 0.348903, Val Acc: 0.8761
Run: 1, Test Loss: 0.27679231908795, Test Acc: 0.8840742824985931
Epoch 1, Loss: 0.255419, Acc: 0.8959, Val Loss: 0.279190, Val Acc: 0.8817
Epoch 10, Loss: 0.213784, Acc: 0.9173, Val Loss: 0.297080, Val Acc: 0.8746
Epoch 20, Loss: 0.155347, Acc: 0.9408, Val Loss: 0.350317, Val Acc: 0.8690
Epoch 30, Loss: 0.141069, Acc: 0.9433, Val Loss: 0.438325, Val Acc: 0.8676
Run: 2, Test Loss: 0.27555079099899876, Test Acc: 0.8885762521102982
Epoch 1, Loss: 0.251403, Acc: 0.8978, Val Loss: 0.279882, Val Acc: 0.8803
Epoch 10, Loss: 0.202592, Acc: 0.9222, Val Loss: 0.299315, Val Acc: 0.8761
Epoch 20, Loss: 0.158192, Acc: 0.9379, Val Loss: 0.367420, Val Acc: 0.8732
Epoch 30, Loss: 0.118906, Acc: 0.9569, Val 