# Introduction

In [None]:
%cd /fsx/awesome/DPT/
# Changing directories to /fsx/awesome/DPT/ so that we can run the code from the DPT/deep-thinking/train_model.py file

## Writing the 1D DeepThinking model and the custom dataloader for the binary addition task

In [None]:
%%writefile ./deep-thinking/deepthinking/models/dt_net_1d.py
""" dt_net_1d.py
    DeepThinking 1D convolutional neural network.

    Collaboratively developed
    by Avi Schwarzschild, Eitan Borgnia,
    Arpit Bansal, and Zeyad Emam.

    Developed for DeepThinking project
    October 2021
"""
import web_pdb as pdb
import torch
import math
import torch.nn.functional as F

from torch import nn

# Ignore statements for pylint:
#     Too many branches (R0912), Too many statements (R0915), No member (E1101),
#     Not callable (E1102), Invalid name (C0103), No exception (W0702)
# pylint: disable=R0912, R0915, E1101, E1102, C0103, W0702, R0914
class NewGELU(nn.Module):
    """GELU activation computed with the tanh approximation."""

    def forward(self, x):
        # 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))
        inner = x + 0.044715 * torch.pow(x, 3.0)
        scale = math.sqrt(2.0 / math.pi)
        return 0.5 * x * (1.0 + torch.tanh(scale * inner))

def next_power(x: int):
    """Return the smallest power of two that is greater than or equal to ``x``.

    Any ``x`` of 1 or less (including zero and negatives) yields 1.
    """
    power = 1
    while power < x:
        power <<= 1  # double until we reach or pass x
    return power

class LiteAttention(nn.Module):
    '''
    Lightweight gating layer.

    A linear projection of the input is normalised with a softmax along
    `s_dim`, and the resulting weights are multiplied element-wise
    (Hadamard product) with the input itself.
    '''
    def __init__(self, dim: int, s_dim: int = 1):
        super().__init__()
        # Projection that produces one score per input feature.
        self.attention_weights = nn.Linear(dim, dim, bias=True)
        # Softmax over `s_dim` turns the scores into normalised weights.
        self.gate = nn.Softmax(dim=s_dim)

    def forward(self, x):
        scores = self.attention_weights(x)
        weights = self.gate(scores)
        return x * weights

class AttentionBlock(nn.Module):
    """Residual block for DeepThinking built around a LiteAttention gate.

    Data flow: LayerNorm -> x + LiteAttention(x) -> LayerNorm -> x + MLP(x)
    -> NewGELU. Input and output share the same feature size (`width`).
    A `torch.nn.MultiheadAttention` gate was previously tried in place of
    LiteAttention and is currently disabled.
    """

    def __init__(self, drop_rate: float, width: int):
        super().__init__()
        self.activation = NewGELU()
        self.input_dim = width
        dim = self.input_dim

        # NOTE: submodules are created in the same order as before so that
        # parameter ordering (and therefore saved optimizer state) is unchanged.
        self.attn_gate = LiteAttention(dim, s_dim=1)
        self.ln1 = nn.LayerNorm(dim)
        self.ln2 = nn.LayerNorm(dim)

        # Two-layer feed-forward network; the activation instance is shared
        # with the block's final non-linearity.
        self.mlp = nn.Sequential(
            nn.Linear(dim, dim),
            self.activation,
            nn.Linear(dim, dim),
            nn.Dropout(drop_rate),
        )

    def forward(self, x):
        # Gated-attention sub-layer with a skip connection around the gate.
        normed = self.ln1(x)
        gated = normed + self.attn_gate(normed)

        # Feed-forward sub-layer with a skip connection around the MLP.
        normed = self.ln2(gated)
        return self.activation(normed + self.mlp(normed))

class RecurrentModule(nn.Module):
    """Stack of attention blocks that is applied repeatedly by the network.

    Args:
        num_blocks: number of `AttentionBlock`s in the stack.
        drop_rate: dropout rate forwarded to every block.
        width: feature size the blocks operate on.
        bottleneck: feature size produced by the down-projection.
        recall: when True, the block output is projected from `width`
            down to `bottleneck` and passed through NewGELU.
    """

    def __init__(self, num_blocks: int, drop_rate: float, width: int, bottleneck: int, recall: bool = True):
        super().__init__()
        self.recall: bool = recall

        self.gelu = NewGELU()
        # Down-projection (width -> bottleneck); only applied when recall is on.
        self.reshape_layer = nn.Linear(width, bottleneck)

        blocks = [AttentionBlock(drop_rate, width) for _ in range(num_blocks)]
        self.attention_blocks = nn.Sequential(*blocks)

    def forward(self, x: torch.Tensor):
        features = self.attention_blocks(x)

        if not self.recall:
            return features

        # With recall, shrink back to the bottleneck size for the next iteration.
        return self.gelu(self.reshape_layer(features))

class OutputModule(nn.Module):
    """Linear head mapping bottleneck features to target-vocabulary logits."""

    def __init__(self, bottleneck: int, tgt_vocab_size: int):
        super().__init__()
        projection = nn.Linear(bottleneck, tgt_vocab_size)
        # Kept inside a Sequential so parameter names stay `head.0.*`.
        self.head = nn.Sequential(projection)

    def forward(self, x: torch.Tensor):
        # The linear layer acts on the last axis: (..., bottleneck) -> (..., tgt_vocab_size)
        return self.head(x)

class DTNet1D(nn.Module):
    """DeepThinking 1D Network model class.

    Pipeline: token embedding + sinusoidal positional encoding -> projection
    head -> recurrent attention module applied `iters_to_do` times -> linear
    output head producing per-token logits.

    Args:
        num_blocks: number of attention blocks inside the recurrent module.
        width: width of the recurrent blocks; the embedding/bottleneck size
            is `int(width) // 2`.
        recall: when True, the embedded input is concatenated to the current
            thought before every iteration and the recurrent module projects
            the result back down to the bottleneck size.
        **kwargs: accepted for signature compatibility and ignored.
    """

    def __init__(self, num_blocks, width, recall, **kwargs):
        super().__init__()
        self.recall: bool = recall
        self.width = int(width) # width of the network layers
        # Derived from the int-cast width so that a float `width` cannot leak
        # into layer sizes.
        self.bottleneck = self.width // 2 # bottleneck width

        self.vocab_size = 9
        self.tgt_vocab_size = 3 # usually equal to vocab_size
        self.SEQLEN = 32 # length of the input sequence
        self.drop_rate = 0.1 # dropout rate

        # NOTE(review): padding_idx=8 is the '[MASK]' index in bADDataset's
        # token list, while that dataset pads with index 2 ('') — confirm
        # which index is meant to have a frozen zero embedding.
        self.embed_layer = nn.Embedding(self.vocab_size, self.bottleneck, padding_idx=8) # embedding layer for the input sequence
        self.reshape_head = nn.Sequential(
            nn.Linear(self.bottleneck, self.bottleneck),
            NewGELU())

        # With recall the blocks see cat([thought, embedded input]) = 2 * bottleneck
        # features; without it they see the bottleneck-sized thought directly.
        # Previously the blocks were always `width` wide and `recall` was not
        # forwarded, so recall=False models failed on their first forward pass.
        # NOTE: with recall=False the module's down-projection is never used,
        # which DDP may report as unused parameters.
        recur_width = 2 * self.bottleneck if self.recall else self.bottleneck
        self.recur_block = RecurrentModule(num_blocks, self.drop_rate, recur_width,
                                           self.bottleneck, recall=self.recall) # Main recurrent block

        self.out_head = OutputModule(self.bottleneck, self.tgt_vocab_size) # Output module

        # Non-persistent buffer: follows the module across devices (no
        # hard-coded 'cuda:0', so CPU-only runs work) and stays out of
        # state_dict, keeping existing checkpoints loadable.
        self.register_buffer("pos_enc", self.positional_encoding(self.bottleneck), persistent=False)

    @torch.no_grad()
    def positional_encoding(self, d_model):
        '''
        Generates the sinusoidal positional encoding of shape
        (SEQLEN, d_model); it broadcasts over the batch dimension when added
        to the sequence embeddings.
        '''
        position = torch.arange(self.SEQLEN, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe = torch.zeros(self.SEQLEN, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        return pe

    def forward(self, x, iters_to_do, interim_thought=None, **kwargs):
        """Run `iters_to_do` iterations of the recurrent module.

        Args:
            x: token indices of shape (batch, SEQLEN).
            iters_to_do: number of times the recurrent module is applied.
            interim_thought: optional (batch, SEQLEN, bottleneck) state to
                resume from; defaults to the projected input embedding.

        Returns:
            In training mode, a tuple `(out, interim_thought)` with logits of
            shape (batch, SEQLEN, tgt_vocab_size) from the last iteration.
            In eval mode, a tensor of shape
            (batch, iters_to_do, SEQLEN, tgt_vocab_size) holding the logits
            after every iteration.
        """
        x = self.embed_layer(x) + self.pos_enc.to(x.device) # (batch, SEQLEN, bottleneck)
        initial_thought = self.reshape_head(x) # (batch, SEQLEN, bottleneck)

        if interim_thought is None:
            interim_thought = initial_thought

        # Only used during testing
        all_outputs = torch.zeros((x.size(0), iters_to_do, self.SEQLEN, self.tgt_vocab_size), device=x.device) if not self.training else None

        # Funneling all branches into a single, recursive set of blocks repeated `iters_to_do` times
        for i in range(iters_to_do):
            if self.recall:
                # Re-inject the embedded input: (batch, SEQLEN, 2 * bottleneck)
                interim_thought = torch.cat([interim_thought, x], 2)

            interim_thought = self.recur_block(interim_thought) # the recursive block, bulk of the network | (batch, SEQLEN, bottleneck)

            if not self.training:
                # During testing, we need out for every iteration to append to all_outputs
                out = self.out_head(interim_thought) # (batch, SEQLEN, tgt_vocab_size)
                all_outputs[:, i] = out # storing intermediate computations for each iteration

        if self.training:
            # During training, we only need output when all the iterations are done, saving compute
            out = self.out_head(interim_thought) # (batch, SEQLEN, tgt_vocab_size)
            return out, interim_thought

        return all_outputs

def dt_net_1d(width, **kwargs):
    """Build a two-block `DTNet1D` without recall; extra kwargs are ignored."""
    return DTNet1D(num_blocks=2, width=width, recall=False)


def dt_net_recall_1d(width, **kwargs):
    """Build a two-block `DTNet1D` with recall; extra kwargs are ignored."""
    return DTNet1D(num_blocks=2, width=width, recall=True)


def dt_net_gn_1d(width, **kwargs):
    """Build a two-block `DTNet1D` without recall; extra kwargs are ignored.

    NOTE(review): currently constructs exactly the same model as
    `dt_net_1d` — confirm whether a distinct "gn" variant was intended.
    """
    return DTNet1D(num_blocks=2, width=width, recall=False)


def dt_net_recall_gn_1d(width, **kwargs):
    """Build a two-block `DTNet1D` with recall; extra kwargs are ignored.

    NOTE(review): currently constructs exactly the same model as
    `dt_net_recall_1d` — confirm whether a distinct "gn" variant was intended.
    """
    return DTNet1D(num_blocks=2, width=width, recall=True)

In [15]:
%%writefile ./deep-thinking/deepthinking/utils/addition_data.py
"""
@author: neel04
adapted from the Deep Thinking repo. New Version
"""
import torch
import random
import numpy as np
import multiprocessing
from torch.utils.data import Dataset, DataLoader

class bADDataset(Dataset):
    '''
    Generates a dataset of binary addition problems on the fly.

    Every `__getitem__` call draws a fresh random problem, so `idx` is
    ignored and `samples` only determines the reported length. A sample is a
    pair of token-id tensors of length `seqlen`: the source `"<a>+<b>"` and
    the target `"<a+b>"`, all three numbers zero-padded to a common width.

    `mode` selects the operand bit-length range: train 1-10, val 10-12,
    test 12-15.

    NOTE(review): with seqlen=32 the longest test problems (two 15-bit
    operands whose sum needs 16 bits -> 33 source tokens) lose their final
    token to the val/test truncation in `pad_sequence`.
    '''

    # (lower, upper) bounds for the number of bits in EACH operand, per mode
    _BIT_BOUNDS = {"train": (1, 10), "val": (10, 12), "test": (12, 15)}

    def __init__(self, mode, samples, seqlen):
        assert mode in ["train", "val", "test"]

        self.mode = mode
        self.samples = samples
        self.seqlen = seqlen
        self.tokens = ['0', '1', '', '+', '=', ',', 'not', 'val', '[MASK]']

        self.tok_dict = {token: index for index, token in enumerate(self.tokens)}
        self.inv_tok_dict = {index: token for index, token in enumerate(self.tokens)}

        self.pad_token = self.tok_dict[''] # 2
        self.masking_token = self.tok_dict['[MASK]']

        self.lower_b, self.upper_b = self._BIT_BOUNDS[mode]

    def __len__(self) -> int:
        return self.samples

    def tokenizer(self, x):
        '''
        Maps an iterable of token strings (e.g. the characters of "10+11")
        to their ids. A regular method rather than a lambda attribute so the
        dataset stays picklable for DataLoader workers.
        '''
        return [self.tok_dict[token] for token in x]

    def stringify(self, x: list):
        return ''.join(str(i) for i in x)

    def generate_data(self, tokenizer, n_digits: int):
        '''
        Generates a binary addition problem whose operands have at most
        `n_digits` binary digits. Returns `(src_ids, tgt_ids)` as lists.
        '''
        # randint is inclusive on both ends, so the upper bound must be
        # 2**n_digits - 1 to keep each operand within n_digits bits.
        largest = 2**n_digits - 1
        num1 = random.randint(0, largest)
        num2 = random.randint(0, largest)

        # convert the numbers to binary and create the sum in binary
        num1_bin = bin(num1)[2:]
        num2_bin = bin(num2)[2:]
        sum_bin = bin(num1 + num2)[2:]

        # left-pad the operands and the sum with zeros to a common width
        max_len = max(len(num1_bin), len(num2_bin), len(sum_bin))
        num1_bin, num2_bin, sum_bin = num1_bin.zfill(max_len), num2_bin.zfill(max_len), sum_bin.zfill(max_len)

        src_seq = f'{num1_bin}+{num2_bin}'

        return tokenizer(src_seq), tokenizer(sum_bin)

    def pad_sequence(self, seq: list, max_len: int, rndm_num: int):
        '''
        Pads the token-id list `seq` to `max_len` with the pad token.

        In train mode `rndm_num` pad tokens go on the left and the remainder
        on the right (the sequence is not truncated). In val/test mode the
        sequence is truncated to `max_len` and padded on the right only.
        '''
        if self.mode == "train":
            right = max_len - len(seq) - rndm_num
            return [self.pad_token] * rndm_num + seq + [self.pad_token] * right

        return seq[:max_len] + [self.pad_token] * (max_len - len(seq))

    def decode(self, x: torch.Tensor):
        '''
        Converts a tensor of token ids back to a string; unknown ids are
        dropped and surrounding whitespace is stripped.
        '''
        ids = x.view(-1).tolist()

        # Membership is checked on the id -> token dict (O(1) per element).
        return ''.join(self.inv_tok_dict[int(elem)] if elem in self.inv_tok_dict else '' for elem in ids).strip()

    def __getitem__(self, idx: int):
        # Python's `random` is re-seeded per DataLoader worker by PyTorch;
        # NumPy's global RNG is not on older versions, which would make every
        # worker draw the same lengths.
        n_digits = random.randint(self.lower_b, self.upper_b)
        src_seq, tgt_seq = self.generate_data(self.tokenizer, n_digits=n_digits)

        maxlen = max(len(src_seq), len(tgt_seq))
        slack = self.seqlen - maxlen
        # The same left offset is applied to source and target; 0 when the
        # problem does not fit.
        rndm_num = random.randint(0, slack) if slack >= 0 else 0

        padded_src_seq = torch.tensor(self.pad_sequence(src_seq, self.seqlen, rndm_num), dtype=torch.long)
        padded_tgt_seq = torch.tensor(self.pad_sequence(tgt_seq, self.seqlen, rndm_num), dtype=torch.long)

        return padded_src_seq, padded_tgt_seq

def prepare_addition_loader(train_batch_size, test_batch_size, train_data, test_data, shuffle=False):
    """Build the train/val/test DataLoaders for the binary addition task.

    Args:
        train_batch_size: batch size of the training loader.
        test_batch_size: batch size of the validation and test loaders.
        train_data: ignored; kept for signature compatibility with the
            other `prepare_*_loader` functions.
        test_data: ignored; kept for the same reason.
        shuffle: whether the training loader shuffles.

    Returns:
        dict with keys "train", "test" and "val" mapping to DataLoaders.
    """
    # Dataset sizes and sequence length are fixed here rather than taken
    # from `train_data` / `test_data`.
    train_dataset = bADDataset(mode='train', samples=50_000, seqlen=32)
    val_dataset = bADDataset(mode='val', samples=5_000, seqlen=32)
    test_dataset = bADDataset(mode='test', samples=5_000, seqlen=32)

    num_cores = multiprocessing.cpu_count()

    trainloader = DataLoader(train_dataset,
                             num_workers=num_cores,
                             batch_size=train_batch_size,
                             shuffle=shuffle,
                             drop_last=True,
                             pin_memory=True,
                             prefetch_factor=32)

    # Validation and test loaders share every setting except the dataset.
    eval_kwargs = dict(num_workers=num_cores,
                       batch_size=test_batch_size,
                       shuffle=False,
                       drop_last=False,
                       pin_memory=True,
                       persistent_workers=True,
                       prefetch_factor=64)

    valloader = DataLoader(val_dataset, **eval_kwargs)
    testloader = DataLoader(test_dataset, **eval_kwargs)

    loaders = {"train": trainloader, "test": testloader, "val": valloader}

    # This message used to sit after the `return` and was never printed.
    print('\nAddition dataloaders have been succesfully created!')
    return loaders

## Setting up correct imports, inserting dataloader

In [3]:
%%writefile ./deep-thinking/deepthinking/utils/tools.py
""" tools.py
    Utility functions that are common to all tasks

    Collaboratively developed
    by Avi Schwarzschild, Eitan Borgnia,
    Arpit Bansal, and Zeyad Emam.

    Developed for DeepThinking project
    October 2021
"""
import os
import logging
import random
from datetime import datetime

import torch
from icecream import ic
from torch.optim import SGD, Adam, AdamW
from torch.optim.lr_scheduler import MultiStepLR, CosineAnnealingLR
from .lion_opt import Lion, AdamOnLion 

import deepthinking.models as models
from .mazes_data import prepare_maze_loader
from .prefix_sums_data import prepare_prefix_loader
from .chess_data import prepare_chess_loader #ADDED NEW
from .addition_data import prepare_addition_loader
from .. import adjectives, names

from .warmup import ExponentialWarmup, LinearWarmup

# Ignore statements for pylint:
#     Too many branches (R0912), Too many statements (R0915), No member (E1101),
#     Not callable (E1102), Invalid name (C0103), No exception (W0702),
#     Too many local variables (R0914), Missing docstring (C0116, C0115).
# pylint: disable=R0912, R0915, E1101, E1102, C0103, W0702, R0914, C0116, C0115


def generate_run_id():
    """Return a random run identifier of the form ``"<adjective>-<name>"``.

    `random.choice` is used instead of `randint(0, len(seq))`: randint's
    upper bound is inclusive, so the old code could index one past the end
    of either sequence and raise IndexError.
    """
    return f"{random.choice(adjectives)}-{random.choice(names)}"


def get_dataloaders(problem_args):
    """Return the dataloaders for the problem named in `problem_args.name`.

    Supported names are "prefix_sums", "mazes", "chess" and "addition"; each
    loader factory receives the train/test batch sizes from
    `problem_args.hyp` plus `train_data` and `test_data`.

    Raises:
        ValueError: if the problem name is not one of the supported ones.
    """
    problem = problem_args.name

    # Pick the loader factory first; all of them take the same keyword arguments.
    if problem == "prefix_sums":
        prepare = prepare_prefix_loader
    elif problem == "mazes":
        prepare = prepare_maze_loader
    elif problem == "chess":
        prepare = prepare_chess_loader
    elif problem == "addition":
        prepare = prepare_addition_loader
    else:
        raise ValueError(f"Invalid problem spec. {problem_args.name}")

    return prepare(train_batch_size=problem_args.hyp.train_batch_size,
                   test_batch_size=problem_args.hyp.test_batch_size,
                   train_data=problem_args.train_data,
                   test_data=problem_args.test_data)


def get_model(model, width, max_iters, in_channels=3):
    """Instantiate the model factory named `model` (case-insensitive).

    The factory is looked up on `deepthinking.models` and called with
    `width`, `in_channels` and `max_iters`; the resulting network is printed
    and returned.
    """
    factory = getattr(models, model.lower())
    net = factory(width=width, in_channels=in_channels, max_iters=max_iters)
    print(net, '\n\n')
    return net


def get_optimizer(optim_args, model_args, net, state_dict, accelerator):
    """Create the optimizer, warmup scheduler and LR scheduler for `net`.

    Args:
        optim_args: config with `optimizer`, `epochs`, `lr`, `lr_decay`,
            `lr_schedule`, `lr_factor`, `warmup_period` and `lr_throttle`.
        model_args: config; only `max_iters` is read (when throttling).
        net: the model whose parameters are optimized.
        state_dict: optimizer state to resume from, or None.
        accelerator: only `num_processes` is read, to scale the learning rate.

    Returns:
        Tuple `(optimizer, warmup_scheduler, lr_scheduler)`.

    Raises:
        ValueError: for an unknown optimizer name or LR decay style.
    """
    optimizer_name = optim_args.optimizer.lower()
    epochs = optim_args.epochs
    lr = optim_args.lr * accelerator.num_processes # scale learning rate by number of processes
    lr_decay = optim_args.lr_decay.lower()
    lr_schedule = optim_args.lr_schedule
    lr_factor = optim_args.lr_factor
    warmup_period = optim_args.warmup_period

    if optim_args.lr_throttle:
        # Reducing the lr here for the recurrent layers helps with stability,
        # To date (July 21, 2021), we may only need this for maze models.
        base_params = [p for n, p in net.named_parameters() if "recur" not in n]
        recur_params = [p for n, p in net.named_parameters() if "recur" in n]
        all_params = [{"params": base_params},
                      {"params": recur_params, "lr": lr / model_args.max_iters}]
    else:
        all_params = [{"params": list(net.parameters())}]

    wd = 4e-3 # weight decay

    if optimizer_name == "sgd":
        optimizer = SGD(all_params, lr=lr, weight_decay=wd, momentum=0.9)
    elif optimizer_name == "adam":
        optimizer = Adam(all_params, lr=lr, weight_decay=wd)
    elif optimizer_name == "adamw":
        optimizer = AdamW(all_params, lr=lr, weight_decay=wd)
    elif optimizer_name == "lion":
        optimizer = Lion(all_params, lr=lr, weight_decay=wd, betas=(0.9, 0.99), use_triton=True)
    elif optimizer_name == "adam_on_lion":
        optimizer = AdamOnLion(all_params, lr=lr, weight_decay=wd, betas=(0.9, 0.99))
    elif optimizer_name == "adamw_amsgrad":
        optimizer = AdamW(all_params, lr=lr, weight_decay=wd, amsgrad=True)
    else:
        raise ValueError(f"{ic.format()}: Optimizer choise of {optimizer_name} not yet implmented.")

    if state_dict is not None:
        # Resuming: restore optimizer state and skip warmup.
        optimizer.load_state_dict(state_dict)
        optimizer.param_groups[0]["capturable"] = True # make optimizer capturable=True
        warmup_scheduler = ExponentialWarmup(optimizer, warmup_period=0)
    else:
        warmup_scheduler = ExponentialWarmup(optimizer, warmup_period=warmup_period)

    if lr_decay == "step":
        lr_scheduler = MultiStepLR(optimizer, milestones=lr_schedule,
                                   gamma=lr_factor, last_epoch=-1)
    elif lr_decay == "cosine":
        # `verbose` is deprecated in recent PyTorch; its default matches the
        # False that used to be passed explicitly.
        lr_scheduler = CosineAnnealingLR(optimizer, epochs, eta_min=0, last_epoch=-1)
    else:
        raise ValueError(f"{ic.format()}: Learning rate decay style {optim_args.lr_decay} not yet implemented.")

    return optimizer, warmup_scheduler, lr_scheduler


def load_model_from_checkpoint(problem, model_args, device, accelerator):
    """Build the model and, if a checkpoint exists, restore it.

    Args:
        problem: problem name; selects `in_channels` (12 for "chess",
            1 for "addition", otherwise 3).
        model_args: config providing `model`, `model_path`, `width` and
            `max_iters`.
        device: device the network (and checkpoint tensors) are moved to.
        accelerator: its `load_state` is called when a checkpoint is loaded.

    Returns:
        Tuple `(net, epoch, optimizer, accelerator)`. Without a checkpoint,
        `epoch` is 0 and `optimizer` is None; otherwise `epoch` is the saved
        epoch + 1 and `optimizer` is the saved optimizer state dict.
    """
    model = model_args.model
    model_path = model_args.model_path
    width = model_args.width
    max_iters = model_args.max_iters
    epoch = 0
    optimizer = None

    in_channels = {"chess": 12, "addition": 1}.get(problem, 3)

    net = get_model(model, width, in_channels=in_channels, max_iters=max_iters)
    net = net.to(device)

    if model_path is not None and os.path.exists(model_path):
        logging.info(f"\n{'$'*50}\nLoading model from checkpoint {model_path}...\n{'$'*50}")
        # NOTE: torch.load unpickles the file — only load trusted checkpoints.
        state_dict = torch.load(model_path, map_location=device)

        # Strip the "_orig_mod." prefix from parameter names. A fresh dict is
        # built because the old in-place rename deleted every key that did
        # NOT carry the prefix (it assigned and then removed the same key).
        net_state = {key.replace('_orig_mod.', ''): value
                     for key, value in state_dict["net"].items()}

        net.load_state_dict(net_state)
        epoch = state_dict["epoch"] + 1
        optimizer = state_dict["optimizer"]
        # NOTE(review): the accelerator state is read from a fixed
        # /fsx/DPT/outputs/ prefix, whereas the existence check above uses
        # `model_path` as given — confirm these refer to the same location.
        accelerator.load_state(f"/fsx/DPT/outputs/{model_path}")

    return net, epoch, optimizer, accelerator


def now():
    """Return the current local time formatted as ``YYYYMMDD HH:MM:SS``."""
    return f"{datetime.now():%Y%m%d %H:%M:%S}"

Overwriting /kaggle/working/deep-thinking/deepthinking/utils/tools.py


## Config `YAML`

Hyperparameters for the model

In [4]:
%%writefile ./deep-thinking/config/problem/hyp/addition_default.yaml
alpha: 1
clip: 2
epochs: 300 # also passed to CosineAnnealingLR as its period (see get_optimizer in tools.py)
lr: 9e-4 # AdamW -> 9e-4 is decent | Linear scaling rule - New_lr = lr * (new_batch_size / old_bsz); get_optimizer additionally multiplies this by the number of processes
lr_decay: cosine # "step" (MultiStepLR) or "cosine" (CosineAnnealingLR)
lr_factor: 0.1 # gamma for MultiStepLR
lr_schedule: # milestones for MultiStepLR; CosineAnnealingLR doesn't use this param
  - 2
lr_throttle: False # when True, parameters with "recur" in their name train at lr / max_iters
optimizer: adamw # one of: sgd, adam, adamw, lion, adam_on_lion, adamw_amsgrad
save_period: -1
test_batch_size: 512
test_mode: default
train_batch_size: 512
train_mode: progressive
val_period: 15
warmup_period: 8 # passed to ExponentialWarmup in get_optimizer (this note previously read "Cosine warmup for the first 5 epochs"); low warmup for DDP

Writing /kaggle/working/deep-thinking/config/problem/hyp/addition_default.yaml


- [ ] Look into using different loss functions

In [5]:
%%writefile ./deep-thinking/config/problem/addition.yaml
# Problem config for the binary addition task.
defaults:
  - hyp: addition_default
  - model: dt_net_recall_1d

name: addition
# NOTE: prepare_addition_loader ignores test_data / train_data and
# hard-codes its own sample counts (5_000 for val/test, 50_000 for train).
test_data: 5_000
train_data: 50_000

model:
  # Left empty: load_model_from_checkpoint then skips checkpoint loading.
  model_path:
  # DTNet1D uses width // 2 as its embedding / bottleneck size.
  width: 512
  max_iters: 15
  test_iterations:
    low: 5
    high: 10

Writing /kaggle/working/deep-thinking/config/problem/addition.yaml


## Executing the training for Arithmetic 🚀

Shape -> `(24, 20)`
- w denotes the width of the model
- Prefix sum used width = 400, so bump 128 → 400
- iterations can be pushed to [50,100]
- Epochs → 100
- Grid search for α ∈ [0,1]
- [x]  Try L1/L2 loss

`dt_net_1d.py` has been modified
`training.py` has been modified
`testing.py` has been modified

In [None]:
# Notebook cell: launches train_model.py for the addition problem through
# Hugging Face `accelerate`, picking a config file based on the available hardware.
from random import randrange
import torch
import os

# Make Hydra print full stack traces, then move into the project directory.
%env HYDRA_FULL_ERROR=1
%cd /fsx/awesome/DPT/deep-thinking

# Generate a random port between 20000 and 30000 and a random rendezvous id.
# NOTE(review): PORT / RDVZ_ID are only referenced by the commented-out
# torchrun command below — confirm whether they are still needed.
port = randrange(20_000, 30_000)
rdvz_id = randrange(100, 999)
#!python3 train_model.py problem=addition name=addition_run
# Alternative launch of the script above with torchrun on a single host (kept for reference)
%env OMP_NUM_THREADS=2
%env PORT=$port
%env RDVZ_ID=$rdvz_id
#!torchrun --nproc_per_node=1 --nnodes=1 --rdzv_id=$RDVZ_ID --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR:$PORT train_model.py problem=addition name=addition_run
# Alternative launch using Hugging Face accelerate with DDP (kept for reference)
#!accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=2 train_model.py problem=addition name=addition_run

# Detect the hardware: number of CUDA devices, and a Colab TPU via its environment variable.
n_gpus = torch.cuda.device_count()
print(f"Number of GPUs available: {n_gpus}")
tpu_available = 'COLAB_TPU_ADDR' in os.environ

# Exported so the shell command below can use one process per GPU.
%env NUM_GPUS=$n_gpus

# Prefer GPUs, then TPU, and fall back to the CPU config otherwise.
if n_gpus > 0:
    !accelerate launch --config_file /fsx/awesome/DPT/configs/acc_config.yaml --num_processes=$NUM_GPUS train_model.py problem=addition name=addition_run
elif tpu_available:
    print("TPU is available")
    !accelerate launch --config_file /fsx/awesome/DPT/configs/acc_tpu_config.yaml train_model.py problem=addition name=addition_run
else:
    print("No GPU or TPU available")
    !accelerate launch --config_file /fsx/awesome/DPT/configs/acc_cpu_config.yaml train_model.py problem=addition name=addition_run
