In [5]:
from loguru import logger
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Subset
from pathlib import Path
import yaml
import pickle
from typing import Dict, Any
from torch.utils.tensorboard import SummaryWriter

from data.load_and_preprocess import load_raw_data, create_exploded_df, create_sequences, create_vocab
from data.datasets import AppSequenceDataset, PreloadedDataset
from models.transformer_models import ShallowTransformer, ShallowTransformerWithAttention
from training.iterations import train_epoch, evaluate

from utils import load_config, safe_load_pickle, safe_save_pickle




In [6]:

logger.info("Loading and validating cfguration")
cfg_path = Path("config/train/default.yaml")
cfg = load_config(cfg_path)

logger.info("Converting path strings to Path objects")
data_paths = {k: Path(v) for k, v in cfg['data'].items()}

# Load raw data if needed
if 'all_mappings' in globals():
    logger.info("Using existing mappings from notebook")
else:
    df_day_point, df_app_to_class = load_raw_data(
        data_paths['raw_data_path'],
        data_paths['app_mappings_path']
    )

# Try to load cached exploded_df
exploded_df = safe_load_pickle(data_paths['exploded_df_path'])
if exploded_df is None:
    if 'df_day_point' in globals():
        logger.info("Creating exploded_df from existing df_day_point")
        exploded_df = create_exploded_df(df_day_point)
    else:
        logger.info("Loading raw data and creating exploded_df")
        df_day_point, _ = load_raw_data(
            data_paths['raw_data_path'],
            data_paths['app_mappings_path']
        )
        exploded_df = create_exploded_df(df_day_point)
    safe_save_pickle(exploded_df, data_paths['exploded_df_path'])

# Try to load cached sequences
sequences = safe_load_pickle(data_paths['sequences_path'])
if sequences is None:
    logger.info("Creating sequences from exploded_df")
    seq_length = cfg.get('model', {}).get('seq_length', 64)
    sequences = create_sequences(exploded_df, seq_length)
    safe_save_pickle(sequences, data_paths['sequences_path'])

df_sequences = pd.DataFrame(sequences)
seq_length = df_sequences['apps'].apply(len).max()
logger.info(f"Sequence length: {seq_length}")

# Try to load cached vocabulary
app_to_idx = safe_load_pickle(data_paths['vocab_path'])
if app_to_idx is None:
    app_to_idx = create_vocab(sequences)
    safe_save_pickle(app_to_idx, data_paths['vocab_path'])
vocab_size = len(app_to_idx)
logger.info(f"Vocabulary size: {vocab_size}")
    
    


[32m2024-11-17 19:13:28.323[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mLoading and validating cfguration[0m
[32m2024-11-17 19:13:28.330[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mConverting path strings to Path objects[0m
[32m2024-11-17 19:13:38.820[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m42[0m - [1mSequence length: 64[0m
[32m2024-11-17 19:13:38.822[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m50[0m - [1mVocabulary size: 634[0m


In [7]:

device = torch.device(cfg.get('training', {}).get('device') if torch.cuda.is_available() else 'cpu')
logger.info(f"Using device: {device}")

logger.info("Creating datasets and splitting into train/val sets")
dataset = AppSequenceDataset(sequences, app_to_idx)

dataset = AppSequenceDataset(sequences, app_to_idx, sequence_length=seq_length, mask_prob=cfg.get('training', {}).get('mask_prob', 0.15))
logger.info(f"Dataset for self-supervised training created with mask probability: {cfg.get('training', {}).get('mask_prob', 0.15)}")

train_emp_ids, val_emp_ids = train_test_split(
    df_sequences['employeeId'].unique(), 
    test_size=cfg.get('training', {}).get('test_size', 0.2), 
    random_state=cfg.get('training', {}).get('random_seed', 42)
)

train_idx = df_sequences.merge(pd.Series(train_emp_ids, name='employeeId'), on='employeeId', how='inner').index.tolist()
val_idx = df_sequences.merge(pd.Series(val_emp_ids, name='employeeId'), on='employeeId', how='inner').index.tolist()

train_dataset = Subset(dataset, train_idx)
val_dataset = Subset(dataset, val_idx)

if cfg.get('training', {}).get('preload_dataset', False):
    logger.info("Preloading dataset to device")
    train_dataset = PreloadedDataset(train_dataset, device)
    val_dataset = PreloadedDataset(val_dataset, device)





logger.info("Creating data loaders")
train_loader = DataLoader(
    train_dataset, 
    batch_size=cfg.get('model', {}).get('batch_size'), 
    shuffle=True
)
val_loader = DataLoader(
    val_dataset, 
    batch_size=cfg.get('model', {}).get('batch_size')
)






[32m2024-11-17 19:13:38.837[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mUsing device: cuda[0m
[32m2024-11-17 19:13:38.838[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mCreating datasets and splitting into train/val sets[0m
[32m2024-11-17 19:13:38.840[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mDataset for self-supervised training created with mask probability: 0.15[0m
[32m2024-11-17 19:13:39.134[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mCreating data loaders[0m


In [8]:
from torch import nn

logger.info("Initializing model")
# model = ShallowTransformerW(
#     vocab_size=vocab_size + 1,
#     d_model=cfg.get('model', {}).get('d_model', 64),
#     nhead=cfg.get('model', {}).get('nhead', 4),
#     seq_length=seq_length,
# ).to(device)
model = ShallowTransformerWithAttention(
    vocab_size=vocab_size + 1, # + 1 for MASK
    d_model=cfg.get('model', {}).get('d_model', 64),
    nhead=cfg.get('model', {}).get('nhead', 4),
    seq_length=seq_length,
    n_layers=cfg.get('model', {}).get('num_encoder_layers', 3)
).to(device)
logger.info(f"{model} initialized on device: {device}")
# Training setup
logger.info("Initializing training components")
criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = torch.optim.AdamW(
    model.parameters(), 
    lr=float(cfg.get('training', {}).get('lr', 1e-4))
)



[32m2024-11-17 19:13:39.991[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mInitializing model[0m
[32m2024-11-17 19:13:40.193[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m17[0m - [1mShallowTransformerWithAttention(
  (app_embeddings): Embedding(635, 64)
  (layers): ModuleList(
    (0-2): 3 x TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
      )
      (linear1): Linear(in_features=64, out_features=128, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=128, out_features=64, bias=True)
      (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
  )
  (app_predictor): Linear(in_features=64, out_features=6

In [9]:
# Add TensorBoard writer setup after model initialization
log_dir = Path("runs") / f"transformer_{seq_length}_{cfg['training']['lr']}"
writer = SummaryWriter(log_dir)
logger.info(f"TensorBoard logs will be saved to {log_dir}")

logger.info("Starting training loop")
n_epochs = cfg.get('model', {}).get('num_epochs', 10)

for epoch in range(n_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, accuracy = evaluate(model, val_loader, criterion, device)
    
    # Log metrics to TensorBoard
    writer.add_scalar('Loss/train', train_loss, epoch)
    writer.add_scalar('Loss/validation', val_loss, epoch)
    writer.add_scalar('Accuracy/validation', accuracy, epoch)
    
    # Add learning rate tracking
    writer.add_scalar('Learning_rate', optimizer.param_groups[0]['lr'], epoch)
    
    # Optional: Log model parameters histograms
    for name, param in model.named_parameters():
        writer.add_histogram(f'Parameters/{name}', param.data, epoch)

    logger.info(f"Epoch {epoch+1}/{n_epochs}")
    logger.info(f"Train Loss: {train_loss:.4f}")
    logger.info(f"Val Loss: {val_loss:.4f}")
    logger.info(f"Accuracy: {accuracy:.4f}")

# Close the writer at the end
writer.close()

if cfg.get('training', {}).get('save_model_path', None):
    logger.info("Saving model checkpoint")
    model_path = Path(cfg.get('training', {}).get('save_model_path'))
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'app_to_idx': app_to_idx,
    }, model_path)


[32m2024-11-17 19:13:44.211[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mTensorBoard logs will be saved to runs\transformer_64_1e-4[0m
[32m2024-11-17 19:13:44.212[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mStarting training loop[0m
Training: 100%|██████████| 1534/1534 [00:38<00:00, 40.10it/s]
Evaluating: 100%|██████████| 404/404 [00:09<00:00, 42.69it/s]
[32m2024-11-17 19:14:32.046[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m25[0m - [1mEpoch 1/10[0m
[32m2024-11-17 19:14:32.051[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m26[0m - [1mTrain Loss: 5.3691[0m
[32m2024-11-17 19:14:32.052[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mVal Loss: 4.8894[0m
[32m2024-11-17 19:14:32.054[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m28[0m - [1mAccuracy: 0.0528[0m
Training: 100%|██████████| 1534/1534 [00:37<00:00, 40.84it/s

# Load model and analyze