# Train our Philosophical Document Embedding Model

In [1]:
import torch

import loader
import models
import utility


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [2]:
# Load the filename -> category and category -> label lookups from the project loader.
filename_category_mapping, category_label_mapping = loader.load_labeling_mappings()

# Build filename -> integer label as a NEW dict. Aliasing filename_category_mapping
# and overwriting its values in place would destroy the category mapping and make
# this cell fail when re-run (the values would already be ints, not categories).
filename_label_mapping = {
    filename: int(category_label_mapping[category])
    for filename, category in filename_category_mapping.items()
}

In [None]:
# Settings handed to loader.get_dataloader. Note that "chunk_size" is overwritten
# for every run of the chunk-size sweep further down in this cell.
loader_params = dict(
    num_labels=len(category_label_mapping),
    batch_size=4,
    chunk_size=2000,
    chunk_overlap=100,
    balance_multiplier=1.5,
)

# Keyword arguments for models.Classifier; one output per label.
model_params = dict(
    input_dim=384,
    hidden_dim=64,
    output_dim=loader_params['num_labels'],
    dropout=0.35,
)

# Optimisation settings: number of epochs per run and the Adam learning rate.
train_params = dict(
    epochs=10,
    learning_rate=1e-3,
)

# train_splits = {
    # "source_texts": ["base_texts"],
    # "human_summaries": ["wikipedia", "philosophize_this_transcripts"],
    # "gpt_data": ["gpt_summaries", "gpt_philosophy_game"],
    # "all_human_texts": ["base_texts", "wikipedia", "philosophize_this_transcripts"],
    # "all_summaries": ["wikipedia", "philosophize_this_transcripts", "gpt_summaries", "gpt_philosophy_game"],
    # "all_sources": ["base_texts", "wikipedia", "philosophize_this_transcripts", "gpt_summaries", "gpt_philosophy_game"],
# }

# One result list per chunk size; each finished run appends an
# (eval_loss, eval_acc) tuple. Keys are the chunk sizes as strings.
eval_losses = {str(size): [] for size in (2000, 5000, 8000)}

# Data folders used for training and for evaluation.
train_folders, test_folders = ["base_texts"], ["eval_data"]
filename_category_mapping, category_label_mapping = loader.load_labeling_mappings()

# Chunk-size sweep: 10 repeated trials; in each trial, for every chunk size in
# eval_losses, rebuild the dataloaders, train a fresh classifier from scratch and
# record the evaluation metrics measured after the last epoch.
for t in range(10):
    for chunk_size in eval_losses:
        # eval_losses is keyed by strings; the loader receives the integer value.
        loader_params['chunk_size'] = int(chunk_size)
        train_dataloader = loader.get_dataloader(train_folders, loader_params, filename_label_mapping, balance_data=True, print_info=False)
        test_dataloader = loader.get_dataloader(test_folders, loader_params, filename_label_mapping, balance_data=False, print_info=False)

        model = models.Classifier(**model_params)
        optim = torch.optim.Adam(model.parameters(), lr=train_params['learning_rate'])
        model.to(device)

        print(f"Training model on '{chunk_size}' chunk size")
        print("Epochs  || Train Loss | Train Acc || Test Loss | Test Acc")
        for epoch in range(train_params['epochs']):
            # Put the model back into training mode at the start of every epoch.
            # NOTE(review): utility.evaluate_model presumably switches the model to
            # eval mode; if it does not restore training mode, dropout would be
            # disabled from the second epoch on without this call - confirm.
            model.train()

            train_loss = []
            num_correct, num_total = 0, 0
            # No enumerate here: the batch index was unused, and binding it to the
            # name `iter` shadowed the builtin for the rest of the notebook.
            for embeddings, labels in train_dataloader:
                embeddings, labels = embeddings.to(device), labels.to(device)
                optim.zero_grad()
                logits = model(embeddings)

                loss, correct_preds = utility.compute_loss_preds(logits, labels)
                loss.backward()
                optim.step()

                num_correct += correct_preds
                num_total += labels.shape[0]
                train_loss.append(loss.cpu().item())

            eval_loss, eval_acc = utility.evaluate_model(model, test_dataloader, device)
            train_loss = torch.tensor(train_loss)
            # Guard against an empty training dataloader instead of dividing by zero.
            train_acc = num_correct / num_total if num_total else float("nan")
            print(f" {epoch+1:<2}/{train_params['epochs']:>3} ||    {train_loss.mean():.3f}   |   {train_acc:.3f}   ||   {eval_loss:.3f}   |  {eval_acc:.3f}  ")

        # Keep only the metrics from the final epoch of this run.
        eval_losses[chunk_size].append((eval_loss, eval_acc))

Dataloader from ('['data\\base_texts']') created with 121 batches in 11.0 seconds.
Dataloader from ('['data\\eval_data']') created with 14 batches in 1.6 seconds.
Training model on '2000' chunk size
Epochs  || Train Loss | Train Acc || Test Loss | Test Acc
 1 / 10 ||    1.919   |   0.228   ||   1.920   |  0.189  
 2 / 10 ||    1.667   |   0.412   ||   1.856   |  0.132  
 3 / 10 ||    1.194   |   0.594   ||   1.759   |  0.377  
 4 / 10 ||    0.869   |   0.723   ||   1.737   |  0.434  
 5 / 10 ||    0.677   |   0.760   ||   1.748   |  0.396  
 6 / 10 ||    0.511   |   0.841   ||   2.067   |  0.377  
 7 / 10 ||    0.436   |   0.872   ||   1.895   |  0.396  
 8 / 10 ||    0.347   |   0.905   ||   2.023   |  0.358  
 9 / 10 ||    0.297   |   0.919   ||   2.088   |  0.377  
 10/ 10 ||    0.236   |   0.944   ||   2.327   |  0.321  
Dataloader from ('['data\\base_texts']') created with 45 batches in 5.0 seconds.
Dataloader from ('['data\\eval_data']') created with 11 batches in 1.4 seconds.
Tr

KeyboardInterrupt: 

In [None]:
import os
import pickle

# Persist the collected evaluation results so they can be analysed later
# without re-running the sweep.
results_folder = "results"
output_file = os.path.join(results_folder, "eval_losses_sourcetexts_chunksize.pkl")

# Create the results folder if it is missing; otherwise open() below raises
# FileNotFoundError on a fresh checkout.
os.makedirs(results_folder, exist_ok=True)

# Save the dictionary to the file
with open(output_file, 'wb') as file:
    pickle.dump(eval_losses, file)
print(f"Dictionary saved to {output_file}")

Dictionary saved to results\eval_losses_chunksize.pkl
