In [4]:
%load_ext autoreload
%autoreload 2
import torch
import numpy as np
import pandas as pd
from omegaconf import OmegaConf
from pathlib import Path
from pprint import pprint
# plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
# mycode
from repo import REPO
import sys
sys.path.append('..')
from ml_utils.output_loader.result_loader import SweepResult, JobResult
from ml_utils.output_loader import create_job_output_loader
from ml_utils.output_loader.plot import plot_sweep_summary, plot_data_log_values

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Ablations on RWKV

These experiments should answer the question: Apart from Time-Mixing (i.e. the wkv kernel), what are the other factors that contribute to the performance of the RWKV model?

### Vary num blocks

In [5]:
# Sweep definition for the num_blocks ablation, written as inline YAML and
# parsed into an OmegaConf config object (`cfg`).
# - `sweep` is a grid with a single axis, `model.kwargs.num_blocks`, taking the
#   values 2 and 12; `seeds` lists a single seed (0).
# - The `${config.experiment_data.experiment_tag}` entries are OmegaConf
#   interpolations pointing at `experiment_tag` ('ablations') further down.
# - `model.kwargs.num_blocks: 6` is presumably the base value that the sweep
#   axis replaces per run -- TODO confirm against the sweep runner.
# - The batch size (12) is given under both `trainer` and `data.kwargs`, and
#   the context length (512) under both `model.kwargs.context_len` and
#   `data.kwargs.context_length`; keep each pair in sync when editing.
# - `data.kwargs.datafile` is an absolute, machine-specific path.
config_yaml = """
run_config:
  exec_type: parallel
  hostname: dragonfly
  gpu_ids: [1]
  runs_per_gpu: 1
  use_cuda_visible_devices: True
  wandb:
    init:
      tags:
      - ${config.experiment_data.experiment_tag}_exps
      - run_handler
      notes: null
      group: ${config.experiment_data.experiment_tag}
      job_type: run_handler

seeds: [0]

sweep:
  type: grid
  axes: 
  - parameter: model.kwargs.num_blocks
    vals: [2, 12]
    
start_num: 0
config:
  experiment_data:
    entity: fslgroup
    project_name: rwkv
    experiment_tag: 'ablations'
    experiment_type: num_blocks
    experiment_name: vary_num_blocks
    experiment_dir: null
    experiment_notes: 
    seed: 0
    gpu_id: 0
   
  model:
    name: rwkv
    kwargs:
      embedding_dim: 512
      attention_dim: 512
      ffn_dim: 2048
      num_blocks: 6
      vocab_size: 6064 # will be set by data
      context_len: 512
      wkv_config: # set to null to use plain torch
        T_max: 1024
        float_mode: '16'
        device: 'cuda:0'
  
  trainer:
    n_epochs: 5 #default: 500
    val_every: 5
    save_every: 5 
    batch_size: 12
    optimizer:
      name: Adam
      kwargs:
        lr: 0.0008

  data:
    name: enwik8 
    kwargs: 
      batch_size: 12
      datafile: '/system/user/beck/pwbeck/projects/rwkv/RWKV-LM/data/enwik8'
      context_length: 512
"""
cfg = OmegaConf.create(config_yaml)

In [6]:
# Hand the sweep config to the project's REPO helper and print the value it
# returns (the recorded output is the `python run_sweep.py ...` command line).
# `override=False` presumably keeps an already existing experiment of the same
# name from being overwritten -- TODO confirm in REPO.create_experiment.
run_command = REPO.create_experiment(cfg, override=False)
print(run_command)

python run_sweep.py --config-name vary_num_blocks.yaml


In [None]:
# Ask REPO for the output loader that belongs to the experiment described by
# `cfg` and print it.
# NOTE(review): presumably only useful after the sweep has been run and has
# written its outputs -- confirm.
sweepr = REPO.get_output_loader(cfg)
print(sweepr)

### Vary activation functions (TODO: the cells below are still a copy of the `num_blocks` sweep)

In [None]:
# NOTE(review): placeholder. This section is headed "Vary activation
# functions", but the YAML below is an unchanged copy of the num_blocks sweep:
# the only sweep axis is still `model.kwargs.num_blocks` ([2, 12]) and
# `experiment_type` / `experiment_name` are still `num_blocks` /
# `vary_num_blocks`. Before running, replace the axis with the activation
# parameter and give the experiment its own type and name; otherwise it shares
# its name with the num_blocks experiment.
#
# Otherwise the layout matches the num_blocks config: a single-axis grid sweep,
# one seed (0), OmegaConf `${...}` interpolations for the wandb tags/group,
# batch size and context length each given in two places, and an absolute,
# machine-specific `datafile` path.
config_yaml = """
run_config:
  exec_type: parallel
  hostname: dragonfly
  gpu_ids: [1]
  runs_per_gpu: 1
  use_cuda_visible_devices: True
  wandb:
    init:
      tags:
      - ${config.experiment_data.experiment_tag}_exps
      - run_handler
      notes: null
      group: ${config.experiment_data.experiment_tag}
      job_type: run_handler

seeds: [0]

sweep:
  type: grid
  axes: 
  - parameter: model.kwargs.num_blocks
    vals: [2, 12]
    
start_num: 0
config:
  experiment_data:
    entity: fslgroup
    project_name: rwkv
    experiment_tag: 'ablations'
    experiment_type: num_blocks
    experiment_name: vary_num_blocks
    experiment_dir: null
    experiment_notes: 
    seed: 0
    gpu_id: 0
   
  model:
    name: rwkv
    kwargs:
      embedding_dim: 512
      attention_dim: 512
      ffn_dim: 2048
      num_blocks: 6
      vocab_size: 6064 # will be set by data
      context_len: 512
      wkv_config: # set to null to use plain torch
        T_max: 1024
        float_mode: '16'
        device: 'cuda:0'
  
  trainer:
    n_epochs: 5 #default: 500
    val_every: 5
    save_every: 5 
    batch_size: 12
    optimizer:
      name: Adam
      kwargs:
        lr: 0.0008

  data:
    name: enwik8 
    kwargs: 
      batch_size: 12
      datafile: '/system/user/beck/pwbeck/projects/rwkv/RWKV-LM/data/enwik8'
      context_length: 512
"""
cfg = OmegaConf.create(config_yaml)

In [None]:
# Hand the config to the project's REPO helper and print the value it returns.
# NOTE(review): the output stored with this cell still names
# `vary_num_blocks.yaml`, i.e. the config is the copied num_blocks sweep, not
# an activation-function sweep. What `override=False` does when an experiment
# of that name already exists is not visible here -- TODO confirm.
run_command = REPO.create_experiment(cfg, override=False)
print(run_command)

python run_sweep.py --config-name vary_num_blocks.yaml


In [None]:
# Ask REPO for the output loader that belongs to the experiment described by
# `cfg` and print it.
# NOTE(review): `cfg` in this section still carries
# `experiment_name: vary_num_blocks`, so this presumably resolves to the
# num_blocks experiment's outputs -- confirm once the config is updated.
sweepr = REPO.get_output_loader(cfg)
print(sweepr)

### Vary init (TODO: the cells below are still a copy of the `num_blocks` sweep)

In [None]:
# NOTE(review): placeholder. This section is headed "Vary init", but the YAML
# below is an unchanged copy of the num_blocks sweep: the only sweep axis is
# still `model.kwargs.num_blocks` ([2, 12]) and `experiment_type` /
# `experiment_name` are still `num_blocks` / `vary_num_blocks`. Before running,
# replace the axis with the initialisation parameter and give the experiment
# its own type and name; otherwise it shares its name with the num_blocks
# experiment.
#
# Otherwise the layout matches the num_blocks config: a single-axis grid sweep,
# one seed (0), OmegaConf `${...}` interpolations for the wandb tags/group,
# batch size and context length each given in two places, and an absolute,
# machine-specific `datafile` path.
config_yaml = """
run_config:
  exec_type: parallel
  hostname: dragonfly
  gpu_ids: [1]
  runs_per_gpu: 1
  use_cuda_visible_devices: True
  wandb:
    init:
      tags:
      - ${config.experiment_data.experiment_tag}_exps
      - run_handler
      notes: null
      group: ${config.experiment_data.experiment_tag}
      job_type: run_handler

seeds: [0]

sweep:
  type: grid
  axes: 
  - parameter: model.kwargs.num_blocks
    vals: [2, 12]
    
start_num: 0
config:
  experiment_data:
    entity: fslgroup
    project_name: rwkv
    experiment_tag: 'ablations'
    experiment_type: num_blocks
    experiment_name: vary_num_blocks
    experiment_dir: null
    experiment_notes: 
    seed: 0
    gpu_id: 0
   
  model:
    name: rwkv
    kwargs:
      embedding_dim: 512
      attention_dim: 512
      ffn_dim: 2048
      num_blocks: 6
      vocab_size: 6064 # will be set by data
      context_len: 512
      wkv_config: # set to null to use plain torch
        T_max: 1024
        float_mode: '16'
        device: 'cuda:0'
  
  trainer:
    n_epochs: 5 #default: 500
    val_every: 5
    save_every: 5 
    batch_size: 12
    optimizer:
      name: Adam
      kwargs:
        lr: 0.0008

  data:
    name: enwik8 
    kwargs: 
      batch_size: 12
      datafile: '/system/user/beck/pwbeck/projects/rwkv/RWKV-LM/data/enwik8'
      context_length: 512
"""
cfg = OmegaConf.create(config_yaml)

In [None]:
# Hand the config to the project's REPO helper and print the value it returns.
# NOTE(review): the output stored with this cell still names
# `vary_num_blocks.yaml`, i.e. the config is the copied num_blocks sweep, not
# an init sweep. What `override=False` does when an experiment of that name
# already exists is not visible here -- TODO confirm.
run_command = REPO.create_experiment(cfg, override=False)
print(run_command)

python run_sweep.py --config-name vary_num_blocks.yaml


In [None]:
# Ask REPO for the output loader that belongs to the experiment described by
# `cfg` and print it.
# NOTE(review): `cfg` in this section still carries
# `experiment_name: vary_num_blocks`, so this presumably resolves to the
# num_blocks experiment's outputs -- confirm once the config is updated.
sweepr = REPO.get_output_loader(cfg)
print(sweepr)