# Introduction

In [None]:
%cd /fsx/awesome/DPT/
# Changing directories to /fsx/awesome/DPT/ so that we can run the code from the DPT/deep-thinking/train_model.py file

## Writing the custom dataloader for `addition` task

RENTRY LINK: https://rentry.co/9966g, WHICH HAS MY VERSION FOR FFNET_1D

In [None]:
%%writefile ./deep-thinking/deepthinking/models/dt_net_1d.py
""" dt_net_1d.py
    DeepThinking 1D convolutional neural network.

    Collaboratively developed
    by Avi Schwarzschild, Eitan Borgnia,
    Arpit Bansal, and Zeyad Emam.

    Developed for DeepThinking project
    October 2021
"""
import web_pdb as pdb
import torch
import math
import torch.nn.functional as F

from torch import nn
from .blocks import BasicBlock1D as BasicBlock
from .alibi import OptimizedALiBiMultiHeadAttention as ALiBiMHSA, VanillaALiBi
from .rope import RoPE_MHA
from .flash_mha import FlashMultiHeadAttention

# Enabling SDP backend
#torch.backends.cuda.enable_flash_sdp(enabled=True)
#print(f'\n{chr(0x26A1)*20}\nFlash Attention status: {torch.backends.cuda.flash_sdp_enabled()}\n{chr(0x26A1)*20}\n')

# Ignore statements for pylint:
#     Too many branches (R0912), Too many statements (R0915), No member (E1101),
#     Not callable (E1102), Invalid name (C0103), No exception (W0702)
# pylint: disable=R0912, R0915, E1101, E1102, C0103, W0702, R0914
class NewGELU(nn.Module):
    """Tanh-approximated GELU activation.

    Applies ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise and returns a tensor of the same shape as the input.
    """

    def forward(self, x):
        # Cubic correction term, then the scaled argument fed to tanh.
        cubic_term = 0.044715 * torch.pow(x, 3.0)
        tanh_arg = math.sqrt(2.0 / math.pi) * (x + cubic_term)
        return 0.5 * x * (1.0 + torch.tanh(tanh_arg))

class AttentionBlock1D(nn.Module):
    """Residual multi-head self-attention block for the DeepThinking 1D model.

    The block normalises its input, adds a self-attention branch to the
    normalised features, normalises again, adds an MLP branch and finally
    applies the GELU activation.

    Args:
        drop_rate: dropout probability used at the end of the MLP branch.
        width: feature size of the last input dimension; the attention layer
            uses ``width // 32`` heads.
    """

    def __init__(self, drop_rate:int, width:int):
        super().__init__()
        self.width = width
        self.activation = NewGELU()

        # batch_first=True: the attention layer takes (batch, seq, width) inputs.
        head_count = self.width // 32
        self.attn_head = nn.MultiheadAttention(
            embed_dim=self.width,
            num_heads=head_count,
            dropout=0.05,
            bias=True,
            batch_first=True,
        )
        # Not referenced in forward(); kept so the parameter set stays the same.
        self.linear1 = nn.Linear(self.width, self.width)

        self.ln1 = nn.LayerNorm(self.width)
        self.ln2 = nn.LayerNorm(self.width)

        mlp_layers = [
            nn.Linear(self.width, self.width),
            self.activation,
            nn.Linear(self.width, self.width),
            nn.Dropout(drop_rate),
        ]
        self.mlp = nn.Sequential(*mlp_layers)

    def forward(self, x):
        # The residual connections are taken around the *normalised* features.
        normed = self.ln1(x)
        attn_out = self.attn_head(normed, normed, normed, need_weights=False)[0]
        hidden = self.ln2(normed + attn_out)
        return self.activation(hidden + self.mlp(hidden))

class DTNet1D(nn.Module):
    """DeepThinking 1D network: embedding -> projection -> recurrent block -> head.

    Args:
        block: unused; accepted so the factory functions keep their call shape.
        num_blocks: number of ``AttentionBlock1D`` layers in the recurrent block.
        width: width of the recurrent block when ``recall`` is True. Half of it
            (the "bottleneck") is the embedding / thought width.
        recall: when True, the embedded input is concatenated to the current
            thought before every iteration and projected back to the bottleneck.
        group_norm: unused; accepted for interface compatibility.
        **kwargs: ignored.
    """

    def __init__(self, block, num_blocks, width, recall, group_norm=False, **kwargs):
        super().__init__()

        self.width = int(width)  # width of the recurrent block input under recall
        self.bottleneck = self.width // 2  # embedding / thought width
        self.recall = recall
        self.SEQLEN = 16  # nominal input length; forward() uses the real length
        drop_rate = 0.1  # dropout rate inside the attention blocks

        # Registered unconditionally (even without recall) so the parameter
        # names of the module do not depend on the mode.
        self.reshape_layer = nn.Linear(self.width, self.bottleneck)
        # NOTE(review): padding_idx is 11 — confirm it matches the pad token
        # produced by the dataloader feeding this model.
        self.embed_layer = nn.Embedding(13, self.bottleneck, padding_idx=11)

        proj_linear = nn.Linear(self.bottleneck, self.bottleneck)
        head_linear = nn.Linear(self.bottleneck, 13)

        # With recall the recurrent block sees cat(thought, input), i.e. `width`
        # features, and must shrink them back to the bottleneck. Without recall
        # it only ever sees the bottleneck-wide thought, so the attention blocks
        # have to be built at that width (building them at `width` made every
        # forward pass fail on a shape mismatch).
        if self.recall:
            block_width = self.width
            recur_layers = [self.reshape_layer, NewGELU()]
        else:
            block_width = self.bottleneck
            recur_layers = []

        for _ in range(num_blocks):
            # Attention blocks go in front of the (optional) reshape layer.
            recur_layers.insert(0, AttentionBlock1D(drop_rate, block_width))

        self.projection = nn.Sequential(proj_linear, NewGELU())
        self.recur_block = nn.Sequential(*recur_layers)
        # NOTE(review): the head applies GELU to the 13 output scores — confirm
        # this is intended for values that are used as logits.
        self.head = nn.Sequential(head_linear, NewGELU())

    @torch.no_grad()
    def positional_encoding(self, max_seq_len, d_model, device='cuda:0'):
        '''
        Builds the sinusoidal positional table of shape (max_seq_len, d_model)
        that is added (broadcast over the batch) to the sequence embeddings.

        Column ``c`` holds ``sin(pos / 10000 ** (2 * c / d_model))`` for even
        ``c`` and ``cos(pos / 10000 ** (2 * c / d_model))`` for odd ``c``.
        The table is computed in float64 on the CPU with vectorised ops and
        then cast to the default dtype on ``device``.
        '''
        positions = torch.arange(max_seq_len, dtype=torch.float64).unsqueeze(1)
        columns = torch.arange(d_model, dtype=torch.float64)
        angles = positions / torch.pow(10000.0, 2.0 * columns / d_model)

        pe = torch.empty(max_seq_len, d_model, dtype=torch.float64)
        pe[:, 0::2] = torch.sin(angles[:, 0::2])
        pe[:, 1::2] = torch.cos(angles[:, 1::2])

        return pe.to(device=device, dtype=torch.get_default_dtype())

    def forward(self, x, iters_to_do, interim_thought=None, **kwargs):
        """Run ``iters_to_do`` iterations of the recurrent block.

        Args:
            x: integer token ids of shape (batch, seq_len).
            iters_to_do: number of recurrent iterations to run.
            interim_thought: optional thought tensor to resume from; defaults
                to the projection of the embedded input.

        Returns:
            In training mode, ``(out, interim_thought)`` for the last iteration.
            Otherwise a tensor of shape (batch, iters_to_do, seq_len, 13) with
            the head output of every iteration.
        """
        seq_len = x.size(1)
        x = self.embed_layer(x) + self.positional_encoding(seq_len, self.bottleneck, device=x.device)
        initial_thought = self.projection(x)

        if interim_thought is None:
            interim_thought = initial_thought

        all_outputs = torch.zeros((x.size(0), iters_to_do, seq_len, 13), device=x.device)

        for i in range(iters_to_do):
            if self.recall:
                # Re-inject the embedded input along the feature dimension.
                interim_thought = torch.cat([interim_thought, x], 2)

            interim_thought = self.recur_block(interim_thought)
            out = self.head(interim_thought)
            all_outputs[:, i] = out

        if self.training:
            return out, interim_thought

        return all_outputs


def dt_net_1d(width, **kwargs):
    """Build a single-block ``DTNet1D`` without recall; extra kwargs are ignored."""
    return DTNet1D(block=BasicBlock, num_blocks=1, width=width, recall=False)


def dt_net_recall_1d(width, **kwargs):
    """Build a single-block ``DTNet1D`` with recall enabled; extra kwargs are ignored."""
    return DTNet1D(block=BasicBlock, num_blocks=1, width=width, recall=True)


def dt_net_gn_1d(width, **kwargs):
    """Build a single-block ``DTNet1D`` without recall, passing ``group_norm=True``; extra kwargs are ignored."""
    return DTNet1D(block=BasicBlock, num_blocks=1, width=width, recall=False, group_norm=True)


def dt_net_recall_gn_1d(width, **kwargs):
    """Build a single-block ``DTNet1D`` with recall, passing ``group_norm=True``; extra kwargs are ignored."""
    return DTNet1D(block=BasicBlock, num_blocks=1, width=width, recall=True, group_norm=True)

In [2]:
%%writefile ./deep-thinking/deepthinking/utils/addition_data.py
"""
@author: neel04
adapted from the Deep Thinking repo. New Version
"""

import torch
import random
import numpy as np
from torch.utils.data import Dataset, DataLoader

class ArithmeticDataset(Dataset):
    """Synthetic dataset of random bit sequences paired with their running parity.

    Every item is sampled on the fly (the index is ignored): a bit sequence
    whose length is drawn from a mode-dependent inclusive range, and its
    prefix sums modulo 2. Both are padded to ``seqlen`` with ``pad_token``.
    """

    def __init__(self, mode, samples, seqlen, bits=None):
        '''
        Args:
            mode: one of "train", "val" or "test"; selects the length range.
            samples: value reported by ``len()``.
            seqlen: padded length of every returned sequence.
            bits: accepted but not used.
        '''
        assert mode in ["train", "val", "test"]

        self.mode = mode
        self.samples = samples
        self.seqlen = seqlen
        self.pad_token = 9

        # Inclusive (lower, upper) bounds for the sampled sequence length.
        if self.mode == "train":
            length_bounds = (1, 13)
        elif self.mode == "val":
            length_bounds = (13, 14)
        else:
            length_bounds = (14, 16)
        self.lower_b, self.upper_b = length_bounds

    def __len__(self):
        return self.samples

    def get_prefix_sum(self, x):
        """Return the cumulative sums of ``x`` reduced modulo 2, as a list."""
        return [running_total % 2 for running_total in np.cumsum(x)]

    def pad_sequence(self, seq, max_len):
        """Truncate ``seq`` to ``max_len`` items and right-pad it with the pad token."""
        clipped = seq[:max_len]
        filler = [self.pad_token] * (max_len - len(seq))
        return clipped + filler

    def decode(self, x):
        """Join the non-pad tokens of ``x`` into a string."""
        kept = [str(token) for token in x if token != self.pad_token]
        return "".join(kept)

    def __getitem__(self, idx):
        # Random length first, then one random bit per position.
        length = random.randint(self.lower_b, self.upper_b)
        sampled_bits = [random.randint(0, 1) for _ in range(length)]

        inputs = self.pad_sequence(sampled_bits, self.seqlen)
        targets = self.pad_sequence(self.get_prefix_sum(sampled_bits), self.seqlen)

        return torch.tensor(inputs), torch.tensor(targets)


def prepare_addition_loader(train_batch_size, test_batch_size, train_data, test_data, shuffle=False):
    """Create the train / val / test dataloaders for the addition task.

    Args:
        train_batch_size: batch size of the training loader.
        test_batch_size: batch size of the validation and test loaders.
        train_data: ignored; kept for compatibility with the other loaders.
        test_data: ignored; kept for compatibility with the other loaders.
        shuffle: forwarded to the training loader only.

    Returns:
        dict with the keys "train", "test" and "val" mapping to DataLoaders.
    """
    # Dataset sizes are fixed here; train_data / test_data are deliberately
    # not consulted (see the docstring).
    train_dataset = ArithmeticDataset(mode='train', samples=50_000, seqlen=16, bits=6)
    val_dataset = ArithmeticDataset(mode='val', samples=50_000, seqlen=16, bits=6)
    test_dataset = ArithmeticDataset(mode='test', samples=5_000, seqlen=16, bits=6)

    trainloader = DataLoader(train_dataset,
                             num_workers=2,
                             batch_size=train_batch_size,
                             shuffle=shuffle,
                             drop_last=True,
                             pin_memory=True,
                             prefetch_factor=8)

    valloader = DataLoader(val_dataset,
                             num_workers=2,
                             batch_size=test_batch_size,
                             shuffle=False,
                             drop_last=False,
                             pin_memory=True,
                             persistent_workers=True,
                             prefetch_factor=2)

    testloader = DataLoader(test_dataset,
                             num_workers=2,
                             batch_size=test_batch_size,
                             shuffle=False,
                             drop_last=False,
                             pin_memory=True,
                             persistent_workers=True,
                             prefetch_factor=4)

    loaders = {"train": trainloader, "test": testloader, "val": valloader}

    # Report success before returning; this statement used to sit after the
    # return and could never run.
    print('\nAddition dataloaders have been succesfully created!')
    return loaders

Writing ./deep-thinking/deepthinking/utils/addition_data.py


## Setting up correct imports, inserting dataloader

In [3]:
%%writefile ./deep-thinking/deepthinking/utils/tools.py
""" tools.py
    Utility functions that are common to all tasks

    Collaboratively developed
    by Avi Schwarzschild, Eitan Borgnia,
    Arpit Bansal, and Zeyad Emam.

    Developed for DeepThinking project
    October 2021
"""
import os
import logging
import random
from datetime import datetime

import torch
from icecream import ic
from torch.optim import SGD, Adam, AdamW
from torch.optim.lr_scheduler import MultiStepLR, CosineAnnealingLR
from .lion_opt import Lion, AdamOnLion 

import deepthinking.models as models
from .mazes_data import prepare_maze_loader
from .prefix_sums_data import prepare_prefix_loader
from .chess_data import prepare_chess_loader #ADDED NEW
from .addition_data import prepare_addition_loader
from .. import adjectives, names

from .warmup import ExponentialWarmup, LinearWarmup

# Ignore statements for pylint:
#     Too many branches (R0912), Too many statements (R0915), No member (E1101),
#     Not callable (E1102), Invalid name (C0103), No exception (W0702),
#     Too many local variables (R0914), Missing docstring (C0116, C0115).
# pylint: disable=R0912, R0915, E1101, E1102, C0103, W0702, R0914, C0116, C0115


def generate_run_id():
    """Return a random run identifier of the form ``"<adjective>-<name>"``."""
    # random.randint includes its upper bound, so indexing with
    # randint(0, len(seq)) can run one past the end; random.choice always
    # returns an existing element.
    hashstr = f"{random.choice(adjectives)}-{random.choice(names)}"
    return hashstr


def get_dataloaders(problem_args):
    """Build the dataloaders for the problem named by ``problem_args.name``.

    Supported names are "prefix_sums", "mazes", "chess" and "addition". The
    matching loader factory receives the train / test batch sizes from
    ``problem_args.hyp`` plus ``problem_args.train_data`` and
    ``problem_args.test_data``.

    Raises:
        ValueError: if the problem name is not one of the supported ones.
    """
    problem_name = problem_args.name

    # Pick the factory first, then make a single call with the shared kwargs.
    if problem_name == "prefix_sums":
        make_loaders = prepare_prefix_loader
    elif problem_name == "mazes":
        make_loaders = prepare_maze_loader
    elif problem_name == "chess":
        make_loaders = prepare_chess_loader
    elif problem_name == "addition":
        make_loaders = prepare_addition_loader
    else:
        raise ValueError(f"Invalid problem spec. {problem_args.name}")

    hyp = problem_args.hyp
    return make_loaders(train_batch_size=hyp.train_batch_size,
                        test_batch_size=hyp.test_batch_size,
                        train_data=problem_args.train_data,
                        test_data=problem_args.test_data)


def get_model(model, width, max_iters, in_channels=3):
    """Instantiate a network by name and print it.

    The lower-cased ``model`` string is looked up as an attribute of the
    ``models`` package and called with ``width``, ``in_channels`` and
    ``max_iters`` as keyword arguments.
    """
    factory = getattr(models, model.lower())
    net = factory(width=width, in_channels=in_channels, max_iters=max_iters)
    print(net,'\n\n')
    return net


def get_optimizer(optim_args, model_args, net, state_dict):
    """Create the optimizer, warmup scheduler and learning-rate scheduler.

    Args:
        optim_args: object exposing ``optimizer``, ``epochs``, ``lr``,
            ``lr_decay``, ``lr_schedule``, ``lr_factor``, ``warmup_period``
            and ``lr_throttle``.
        model_args: object exposing ``max_iters`` (read only when throttling).
        net: model whose parameters are optimised.
        state_dict: optimizer state to restore, or None for a fresh optimizer.

    Returns:
        Tuple ``(optimizer, warmup_scheduler, lr_scheduler)``.

    Raises:
        ValueError: for an unknown optimizer name or decay style.
    """
    optimizer_name = optim_args.optimizer.lower()
    epochs = optim_args.epochs
    lr = optim_args.lr
    lr_decay = optim_args.lr_decay
    lr_schedule = optim_args.lr_schedule
    lr_factor = optim_args.lr_factor
    warmup_period = optim_args.warmup_period

    if optim_args.lr_throttle:
        # Reducing the lr here for the recurrent layers helps with stability,
        # To date (July 21, 2021), we may only need this for maze models.
        base_params = [p for n, p in net.named_parameters() if "recur" not in n]
        recur_params = [p for n, p in net.named_parameters() if "recur" in n]
        iters = model_args.max_iters
        all_params = [{"params": base_params}, {"params": recur_params, "lr": lr / iters}]
    else:
        # Single parameter group that uses the base learning rate.
        all_params = [{"params": [p for _, p in net.named_parameters()]}]

    if optimizer_name == "sgd":
        optimizer = SGD(all_params, lr=lr, weight_decay=2e-3, momentum=0.9)
    elif optimizer_name == "adam":
        optimizer = Adam(all_params, lr=lr, weight_decay=2e-3)
    elif optimizer_name == "adamw":
        optimizer = AdamW(all_params, lr=lr, weight_decay=2e-3)
    elif optimizer_name == "lion":
        optimizer = Lion(all_params, lr=lr, weight_decay=7e-3, betas=(0.9, 0.99))
    elif optimizer_name == "adam_on_lion":
        optimizer = AdamOnLion(all_params, lr=lr, weight_decay=2e-3, betas=(0.9, 0.99))
    else:
        raise ValueError(f"{ic.format()}: Optimizer choise of {optimizer_name} not yet implmented.")

    if state_dict is not None:
        # Resuming: restore the optimizer state and skip the warmup phase.
        optimizer.load_state_dict(state_dict)
        optimizer.param_groups[0]["capturable"] = True # make optimizer capturable=True
        warmup_scheduler = ExponentialWarmup(optimizer, warmup_period=0)
        # warmup_scheduler = LinearWarmup(optimizer, warmup_period=0)
    else:
        warmup_scheduler = ExponentialWarmup(optimizer, warmup_period=warmup_period)
        # warmup_scheduler = LinearWarmup(optimizer, warmup_period=warmup_period)

    if lr_decay.lower() == "step":
        lr_scheduler = MultiStepLR(optimizer, milestones=lr_schedule,
                                   gamma=lr_factor, last_epoch=-1)
    elif lr_decay.lower() == "cosine":
        # `verbose` is deprecated in recent PyTorch releases and False was its
        # default, so it is simply no longer passed.
        lr_scheduler = CosineAnnealingLR(optimizer, epochs, eta_min=0, last_epoch=-1)
    else:
        raise ValueError(f"{ic.format()}: Learning rate decay style {lr_decay} not yet implemented.")

    return optimizer, warmup_scheduler, lr_scheduler


def load_model_from_checkpoint(problem, model_args, device, accelerator):
    """Build the model and restore it from ``model_args.model_path`` if present.

    Args:
        problem: problem name; "chess" and "addition" select 12 and 1 input
            channels respectively, anything else uses 3.
        model_args: object exposing ``model``, ``model_path``, ``width`` and
            ``max_iters``.
        device: device the network (and the loaded checkpoint) is mapped to.
        accelerator: object whose ``load_state`` is called when a checkpoint
            is restored; returned unchanged.

    Returns:
        Tuple ``(net, epoch, optimizer, accelerator)``. ``epoch`` is the
        checkpoint epoch plus one (0 without a checkpoint) and ``optimizer``
        is the saved optimizer state (None without a checkpoint).
    """
    model = model_args.model
    model_path = model_args.model_path
    width = model_args.width
    max_iters = model_args.max_iters
    epoch = 0
    optimizer = None

    in_channels = 3
    if problem == "chess":
        in_channels = 12
    elif problem == 'addition':
        in_channels = 1

    net = get_model(model, width, in_channels=in_channels, max_iters=max_iters)
    net = net.to(device)

    if model_path is not None and os.path.exists(model_path):
        logging.info(f"\n{'$'*50}\nLoading model from checkpoint {model_path}...\n{'$'*50}")
        state_dict = torch.load(model_path, map_location=device)

        # Strip the "_orig_mod." prefix (as added by wrapped/compiled modules)
        # from the parameter names. A fresh dict is built so that keys without
        # the prefix are kept; renaming in place used to delete them.
        net_state = {
            key.replace('_orig_mod.', ''): value
            for key, value in state_dict["net"].items()
        }

        net.load_state_dict(net_state)
        epoch = state_dict["epoch"] + 1
        optimizer = state_dict["optimizer"]
        # NOTE(review): model_path is appended to a fixed output root here even
        # though it was used as-is above — confirm the intended location.
        accelerator.load_state(f"/fsx/DPT/outputs/{model_path}")

    return net, epoch, optimizer, accelerator


def now():
    """Return the current local time formatted as ``YYYYMMDD HH:MM:SS``."""
    return f"{datetime.now():%Y%m%d %H:%M:%S}"

Overwriting /kaggle/working/deep-thinking/deepthinking/utils/tools.py


## Config `YAML`

Hyperparameters for the model

In [4]:
%%writefile ./deep-thinking/config/problem/hyp/addition_default.yaml
alpha: 1
clip: 2
epochs: 175
lr: 9e-5 # 5e-4 does get to 98% element-wise accuracy in 100 epochs
lr_decay: cosine
lr_factor: 0.1
lr_schedule: #  CosineAnnealingLR Doesn't use this param
  - 2
lr_throttle: False
optimizer: lion
save_period: -1
test_batch_size: 768
test_mode: default
train_batch_size: 768
train_mode: progressive
val_period: 20
warmup_period: 8 # Warmup period handed to the warmup scheduler (ExponentialWarmup in tools.py); kept low for DDP

Writing /kaggle/working/deep-thinking/config/problem/hyp/addition_default.yaml


- [ ] Look into using different loss functions

In [5]:
%%writefile ./deep-thinking/config/problem/addition.yaml
defaults:
  - hyp: addition_default
  - model: dt_net_recall_1d

name: addition
test_data: 5_000
train_data: 50_000

model:
  model_path:
  width: 256
  max_iters: 15
  test_iterations:
    low: 15
    high: 25

Writing /kaggle/working/deep-thinking/config/problem/addition.yaml


## Executing the training for Arithmetic 🚀

Shape -> `(24, 20)`
- w denotes the width of the model
- Prefix sum used width = 400, so bump 128 → 400
- iterations can be pushed to [50,100]
- Epochs → 100
- Grid search for α ∈ [0,1]
- [x]  Try L1/L2 loss

`dt_net_1d.py` has been modified
`training.py` has been modified
`testing.py` has been modified

In [None]:
# Launch `train_model.py` for the addition problem through `accelerate launch`,
# choosing the accelerate config file from the hardware that is detected.
from random import randrange
import torch
import os

%env HYDRA_FULL_ERROR=1
%cd /fsx/awesome/DPT/deep-thinking

# Pick a random port in [20000, 30000) and a random rendezvous id in [100, 999).
# Both are exported below as PORT / RDVZ_ID; only the commented-out torchrun
# command references them.
port = randrange(20_000, 30_000)
rdvz_id = randrange(100, 999)
#!python3 train_model.py problem=addition name=addition_run
# The script above would have to be launched with torchrun on a single host with 8 GPUs
%env OMP_NUM_THREADS=2
%env PORT=$port
%env RDVZ_ID=$rdvz_id
#!torchrun --nproc_per_node=1 --nnodes=1 --rdzv_id=$RDVZ_ID --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR:$PORT train_model.py problem=addition name=addition_run
# Using Hugging Face accelerate with DDP and 1 GPU
#!accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=2 train_model.py problem=addition name=addition_run

# Count the visible CUDA devices; a TPU is assumed to be present when the
# COLAB_TPU_ADDR environment variable is set.
n_gpus = torch.cuda.device_count()
print(f"Number of GPUs available: {n_gpus}")
tpu_available = 'COLAB_TPU_ADDR' in os.environ

%env NUM_GPUS=$n_gpus

# GPU first, then TPU, otherwise fall back to the CPU config.
if n_gpus > 0:
    # One process per visible GPU.
    !accelerate launch --config_file /fsx/awesome/DPT/configs/acc_config.yaml --num_processes=$NUM_GPUS train_model.py problem=addition name=addition_run
elif tpu_available:
    print("TPU is available")
    !accelerate launch --config_file /fsx/awesome/DPT/configs/acc_tpu_config.yaml train_model.py problem=addition name=addition_run
else:
    print("No GPU or TPU available")
    !accelerate launch --config_file /fsx/awesome/DPT/configs/acc_cpu_config.yaml train_model.py problem=addition name=addition_run
