# Train our Philosophical Document Embedding Model

In [4]:
import torch

import loader
import models
import utility


# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [5]:
# Load the two lookup tables returned by the loader and derive, for every key of
# the first table, the integer label found by looking its value up in the second.
filename_category_mapping, category_label_mapping = loader.load_labeling_mappings()

# Build a *new* dict instead of aliasing filename_category_mapping and overwriting
# its values in place: the original table stays intact for later inspection, and
# no loop variables leak into the kernel namespace.
filename_label_mapping = {
    filename: int(category_label_mapping[category])
    for filename, category in filename_category_mapping.items()
}

In [None]:
# Settings handed to loader.get_dataloader; num_labels tracks the size of the
# category -> label table so it never drifts from the data.
loader_params = dict(
    num_labels=len(category_label_mapping),
    batch_size=4,
    chunk_size=2500,
    chunk_overlap=100,
    balance_multiplier=1,
)

# Keyword arguments unpacked into models.Classifier.
# NOTE(review): input_dim=384 presumably matches the embedding width produced by
# the loader -- confirm against loader.get_dataloader.
model_params = dict(
    input_dim=384,
    hidden_dim=64,
    output_dim=loader_params["num_labels"],
    dropout=0.35,
)

# Optimisation settings read by the training loop.
train_params = dict(
    epochs=10,
    learning_rate=1e-3,
)

# Folders that feed the training set. "base_texts" is deliberately left out for
# now; prepend it to this list to train on that folder as well.
train_folders = [
    "wikipedia",
    "philosophize_this_transcripts",
    "gpt_summaries",
    "gpt_philosophy_game",
]
# Folder(s) held out for evaluation.
test_folders = ["eval_data"]

# Calls load_labeling_mappings() again, rebinding both mapping names.
filename_category_mapping, category_label_mapping = loader.load_labeling_mappings()

# Both loaders share the same settings and the filename -> label table; only the
# source folders differ.
train_dataloader = loader.get_dataloader(
    train_folders,
    loader_params,
    filename_label_mapping,
    balance_data=True,
    print_info=False,
)
test_dataloader = loader.get_dataloader(
    test_folders,
    loader_params,
    filename_label_mapping,
    balance_data=True,
    print_info=False,
)

# Build the classifier and its Adam optimiser, train for the configured number of
# epochs while printing one row of train/test metrics per epoch, then move the
# model back to the CPU and save it.
model = models.Classifier(**model_params)
optim = torch.optim.Adam(model.parameters(), lr=train_params['learning_rate'])
model.to(device)

print("Epochs  || Train Loss | Train Acc || Test Loss | Test Acc")
for epoch in range(train_params['epochs']):
    # Explicitly (re-)enter training mode every epoch so dropout is active while
    # fitting. utility.evaluate_model runs at the end of each epoch and may leave
    # the model in eval mode (its implementation is not visible here); without
    # this call every epoch after the first could train with dropout disabled.
    model.train()

    batch_losses = []
    num_correct, num_total = 0, 0
    for embeddings, labels in train_dataloader:
        embeddings, labels = embeddings.to(device), labels.to(device)
        optim.zero_grad()
        logits = model(embeddings)

        # compute_loss_preds returns the batch loss and the count of correct predictions.
        loss, correct_preds = utility.compute_loss_preds(logits, labels)
        loss.backward()
        optim.step()

        num_correct += correct_preds
        num_total += labels.shape[0]
        # .item() already yields a host-side Python float.
        batch_losses.append(loss.item())

    eval_loss, eval_acc = utility.evaluate_model(model, test_dataloader, device)
    # Keep the per-batch losses of the last epoch available as a tensor under the
    # same name the rest of the notebook may already rely on.
    train_loss = torch.tensor(batch_losses)
    print(f" {epoch+1:<2}/{train_params['epochs']:>3} ||    {train_loss.mean():.3f}   |   {num_correct/num_total:.3f}   ||   {eval_loss:.3f}   |  {eval_acc:.3f}  ")

# Save from the CPU so the model is not left on the training device.
model.to("cpu")
model.save_model()

Dataloader from ('['data\\wikipedia', 'data\\philosophize_this_transcripts', 'data\\gpt_summaries', 'data\\gpt_philosophy_game']') created with 95 batches in 3.6 seconds.
Dataloader from ('['data\\eval_data']') created with 7 batches in 1.3 seconds.
Epochs  || Train Loss | Train Acc || Test Loss | Test Acc
 1 / 15 ||    1.942   |   0.175   ||   1.931   |  0.143  
 2 / 15 ||    1.861   |   0.349   ||   1.818   |  0.500  
 3 / 15 ||    1.588   |   0.537   ||   1.621   |  0.357  
 4 / 15 ||    1.263   |   0.632   ||   1.521   |  0.429  
 5 / 15 ||    1.032   |   0.698   ||   1.305   |  0.643  
 6 / 15 ||    0.892   |   0.728   ||   1.305   |  0.536  
 7 / 15 ||    0.775   |   0.757   ||   1.382   |  0.429  
 8 / 15 ||    0.684   |   0.775   ||   1.301   |  0.536  
 9 / 15 ||    0.611   |   0.815   ||   1.393   |  0.464  
 10/ 15 ||    0.570   |   0.820   ||   1.406   |  0.429  
 11/ 15 ||    0.509   |   0.849   ||   1.513   |  0.464  
 12/ 15 ||    0.470   |   0.849   ||   1.578   |  0.53