# Train our Philosophical Document Embedding Model

In [1]:
import torch

import loader
import models
import utility


# Seed torch's global random number generator so that anything drawn from it
# (e.g. weight initialisation, dropout masks) is repeatable after
# "Restart Kernel -> Run All". torch.manual_seed covers CPU and CUDA generators.
SEED = 0
torch.manual_seed(SEED)

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [2]:
# Each mapping goes from a text file name to its integer class label (0-6).
# The same label id is reused across the three mappings for the same
# author/school, e.g. label 0 is shared by "aurelius_meditations.txt",
# "philthis_stoicism.txt" and "gpt_stoicism.txt".

# Primary source texts; the "*_copy.txt" entries carry the same label as the
# file they are named after.
source_label_mapping = dict([
    ("aurelius_meditations.txt", 0),
    ("beauvoir_ethicsambiguity.txt", 1),
    ("effectivealtruism.txt", 2),
    ("effectivealtruism_copy.txt", 2),
    ("emerson_selfreliance.txt", 3),
    ("kierkegaard_presentage.txt", 4),
    ("thoreau_walden.txt", 5),
    ("epicurus_lettermenoeceus.txt", 6),
    ("epicurus_lettermenoeceus_copy.txt", 6),
])

# Files prefixed "philthis_".
philthis_label_mapping = dict([
    ("philthis_beauvoir.txt", 1),
    ("philthis_effectivealtruism.txt", 2),
    ("philthis_emerson.txt", 3),
    ("philthis_epicurus.txt", 6),
    ("philthis_kierkegaard.txt", 4),
    ("philthis_stoicism.txt", 0),
    ("philthis_thoreau.txt", 5),
])

# Files prefixed "gpt_".
gpt_label_mapping = dict([
    ("gpt_beauvoir.txt", 1),
    ("gpt_effectivealtruism.txt", 2),
    ("gpt_emerson.txt", 3),
    ("gpt_epicureanism.txt", 6),
    ("gpt_kierkegaard.txt", 4),
    ("gpt_stoicism.txt", 0),
    ("gpt_thoreau.txt", 5),
])

In [3]:
# Settings handed to loader.get_dataloader for both splits. Their exact
# meaning is defined in the project-local `loader` module (not shown here);
# "num_labels" is also reused later as the classifier's output size.
loader_params = dict(
    num_labels=7,
    batch_size=4,
    chunk_size=1000,
    chunk_overlap=10,
    balance_multiplier=2,
)

# Training split: files from two folders, labelled with the union of the
# source and "philthis" mappings (the file names do not collide, so the merge
# order does not drop any entry).
train_folders = ["data/train_data", "data/test_data_pt"]
train_label_mapping = {**source_label_mapping, **philthis_label_mapping}

# Held-out split: only the "gpt_" files.
test_folders = ["data/test_data_gpt"]
test_label_mapping = gpt_label_mapping

# Only the training loader is built with balance_data=True.
train_dataloader = loader.get_dataloader(
    train_folders,
    loader_params,
    train_label_mapping,
    balance_data=True,
    print_info=False,
)
test_dataloader = loader.get_dataloader(
    test_folders,
    loader_params,
    test_label_mapping,
    balance_data=False,
    print_info=False,
)

Dataloader from ('['data/train_data', 'data/test_data_pt']') created with 246 embeddings in 7.8 seconds.
Dataloader from ('['data/test_data_gpt']') created with 19 embeddings in 1.3 seconds.


In [4]:
# Architecture settings forwarded as keyword arguments to models.Classifier.
# The output size is tied to the number of labels used by the dataloaders.
model_params = {
    "input_dim": 384,
    "hidden_dim": 128,
    "output_dim": loader_params['num_labels'],
    "dropout": 0.35
}
# Optimisation settings for the loop below.
train_params = {
    "epochs": 10,
    "learning_rate": 1e-3
}

# Move the model to its device *before* building the optimizer, as the
# torch.optim documentation recommends, so the optimizer is guaranteed to hold
# references to the on-device parameters.
model = models.Classifier(**model_params)
model.to(device)
optim = torch.optim.Adam(model.parameters(), lr=train_params['learning_rate'])

print("Epochs  || Train Loss | Train Acc || Test Loss | Test Acc")
for epoch in range(train_params['epochs']):
    # Re-enter training mode at the start of every epoch.
    # NOTE(review): utility.evaluate_model is not visible from this notebook; if
    # it calls model.eval() without restoring the mode, dropout would otherwise
    # stay disabled for every epoch after the first.
    model.train()

    batch_losses = []
    num_correct, num_total = 0, 0
    for embeddings, labels in train_dataloader:
        embeddings, labels = embeddings.to(device), labels.to(device)
        optim.zero_grad()
        logits = model(embeddings)

        # compute_loss_preds returns the batch loss and the number of correct
        # predictions in the batch (as used by the accuracy below).
        loss, correct_preds = utility.compute_loss_preds(logits, labels)
        loss.backward()
        optim.step()

        num_correct += correct_preds
        num_total += labels.shape[0]
        # .item() already yields a Python float regardless of the tensor's device.
        batch_losses.append(loss.item())

    # Evaluate on the held-out loader once per epoch.
    eval_loss, eval_acc = utility.evaluate_model(model, test_dataloader, device)
    # Reported train loss is the unweighted mean of the per-batch losses.
    train_loss = torch.tensor(batch_losses)
    print(f" {epoch+1:<2}/{train_params['epochs']:>3} ||    {train_loss.mean():.3f}   |   {num_correct/num_total:.3f}   ||   {eval_loss:.3f}   |  {eval_acc:.3f}  ")

Epochs  || Train Loss | Train Acc || Test Loss | Test Acc
 1 / 10 ||    1.603   |   0.406   ||   1.018   |  0.644  
 2 / 10 ||    0.765   |   0.740   ||   0.419   |  0.932  
 3 / 10 ||    0.533   |   0.817   ||   0.338   |  0.918  
 4 / 10 ||    0.408   |   0.860   ||   0.374   |  0.890  
 5 / 10 ||    0.319   |   0.895   ||   0.217   |  0.918  
 6 / 10 ||    0.251   |   0.919   ||   0.192   |  0.945  
 7 / 10 ||    0.211   |   0.934   ||   0.228   |  0.904  
 8 / 10 ||    0.167   |   0.939   ||   0.261   |  0.904  
 9 / 10 ||    0.136   |   0.962   ||   0.201   |  0.904  
 10/ 10 ||    0.097   |   0.982   ||   0.314   |  0.877  
