<a href="https://colab.research.google.com/github/pc-Jiang/math_in_deep_learning/blob/main/DSC_291_SP23.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DSC 291 Mathematics in Deep Learning, SP23
Advisor: Mikhail Belkin

Author: Pengcen Jiang, Bin Wang

## Import packages and set random seed.



In [50]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import math
import numpy as np
import pandas as pd
from collections import OrderedDict
from copy import deepcopy

import matplotlib.pyplot as plt
%matplotlib inline
import sklearn.metrics
import seaborn as sns
import random

# Pick the compute device once, at import time, for the whole notebook.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    DEVICE = torch.device("cuda")
    # Kept as a plain string on the CUDA path, exactly as before.
    MAP_LOC = "cuda:0"
else:
    DEVICE = torch.device("cpu")
    MAP_LOC = torch.device('cpu')

def set_seed(seed = 1234):
    """Seed every random number generator used in this notebook.

    Seeds NumPy, Python's ``random`` module, and PyTorch (CPU and the current
    CUDA device), and exports ``PYTHONHASHSEED`` so that repeated runs draw
    the same random numbers.

    Args:
        seed: integer seed shared by all generators.
    """
    # NOTE(review): the interpreter presumably reads PYTHONHASHSEED only at
    # start-up, so this mainly affects processes launched afterwards - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # The generators are independent, so the seeding order is irrelevant.
    seeders = (np.random.seed, random.seed,
               torch.manual_seed, torch.cuda.manual_seed)
    for seed_fn in seeders:
        seed_fn(seed)


set_seed()

## Define basic blocks and functions. 

### Define configurations class. 

In [37]:
class LearnFixedPointConfig:
    """Plain attribute container holding all hyperparameters of an experiment.

    Attributes fall into three groups: model settings, training settings and
    settings for sampling the fixed point. Every value is an ordinary
    instance attribute, so it can be overridden after construction.
    """

    def __init__(self):
        # Groups are applied in this exact order so that the attribute order
        # (visible through ``str(config)``) stays stable.
        model_params = dict(
            hidden_size=100,
            nonlinearity=None,
            dim_in=10,
            batch_size=1,
            initialization=None,
            use_bias=True,
            time_steps=10,
            ct=False,
            tau=0,
        )
        training_params = dict(
            lr=0.001,
            optimizer_type='SGD',
            momentum=0,
            wdecay=0,
            num_ep=1000,
            add_noise=False,
        )
        data_params = dict(
            std_fp=1,
            mean_fp=0,
        )
        for group in (model_params, training_params, data_params):
            vars(self).update(group)

    def update(self, new_config):
        """Copy every attribute of ``new_config`` onto this config."""
        vars(self).update(vars(new_config))

    def __str__(self):
        """Return the attribute dictionary rendered as a string."""
        return str(vars(self))

### Define RNN class. 

In [39]:
class IdentityAct:
    """Callable stand-in for an activation that leaves its input untouched."""

    def __call__(self, tensor):
        # No computation at all: the very same object is handed back.
        return tensor


class VanillaRNNCell(nn.Module):
    """Single-step vanilla RNN cell followed by a linear read-out.

    One call to ``forward`` advances the hidden state by one time step::

        h_new = act(W_ih x + W_hh h + b)                        # ct is False
        h_new = (1 - 1/tau) * h + (1/tau) * act(W_ih x + W_hh h + b)  # ct True
        out   = W_ho h_new + b_out

    ``x`` is ``(N, input_size)`` and ``h`` is ``(N, hidden_size)``, with N the
    batch size. The read-out has the same width as the input because both
    sizes are taken from ``config.dim_in``.

    Args:
        config: object exposing ``dim_in``, ``hidden_size``, ``nonlinearity``
            (None, "tanh" or "relu"), ``use_bias``, ``initialization``
            (None or 'uniform'), ``ct`` and ``tau``.

    Raises:
        ValueError: if ``config.ct`` is true while ``config.tau`` is 0, since
            the leaky update needs ``1 / tau``.
        RuntimeError: if the initialization or nonlinearity name is unknown.
    """

    def __init__(self, config):
        super(VanillaRNNCell, self).__init__()
        self.input_size = config.dim_in
        self.output_size = config.dim_in
        self.hidden_size = config.hidden_size
        self.nonlinearity = config.nonlinearity
        self.use_bias = config.use_bias
        self.initialization = config.initialization
        self.ct = config.ct
        self.tau = config.tau

        # Fail at construction time instead of with a ZeroDivisionError in
        # the middle of the first forward pass.
        if self.ct and self.tau == 0:
            raise ValueError(
                "tau must be non-zero when ct is enabled, got {}".format(
                    self.tau))

        self.weight_ih = nn.Parameter(
            torch.zeros((self.hidden_size, self.input_size)))
        self.weight_hh = nn.Parameter(
            torch.zeros((self.hidden_size, self.hidden_size)))
        self.weight_ho = nn.Parameter(
            torch.zeros((self.output_size, self.hidden_size)))
        # Biases are column vectors so they broadcast over the batch
        # dimension of the (N, size, 1) activations used in forward().
        self.bias = nn.Parameter(torch.zeros(self.hidden_size, 1))
        self.bias_out = nn.Parameter(torch.zeros(self.output_size, 1))
        self.reset_parameters()

        if self.nonlinearity is None:
            self.act = IdentityAct()
        elif self.nonlinearity == "tanh":
            self.act = torch.tanh
        elif self.nonlinearity == "relu":
            self.act = F.relu
        else:
            raise RuntimeError("Unknown nonlinearity: {}".format(
                self.nonlinearity))

    def reset_parameters(self):
        """Initialize the parameters and freeze the biases if they are unused.

        With ``initialization=None`` all parameters keep their zero values.
        With ``'uniform'`` every trainable parameter is drawn from
        U(-1/sqrt(hidden_size), 1/sqrt(hidden_size)). When ``use_bias`` is
        false, both bias vectors stay at zero and stop receiving gradients.

        Raises:
            RuntimeError: if ``self.initialization`` is not None or 'uniform'.
        """
        stdv = 1.0 / math.sqrt(self.hidden_size)

        if self.initialization not in (None, 'uniform'):
            raise RuntimeError("Unknown initialization: {}".format(self.initialization))

        for name, weight in self.named_parameters():
            if 'bias' in name and not self.use_bias:
                weight.requires_grad = False
            elif self.initialization == 'uniform':
                nn.init.uniform_(weight, -stdv, stdv)

    def forward(self, inp, hidden_in):
        """Advance the cell by one time step.

        Args:
            inp: input tensor of shape (N, input_size).
            hidden_in: previous hidden state of shape (N, hidden_size).

        Returns:
            A list ``[hidden_out, output]`` with shapes (N, hidden_size) and
            (N, output_size).
        """
        # Turn each row into a column vector so the weight matrices can be
        # applied from the left with a batched matmul.
        inp = torch.unsqueeze(inp, 2)
        hidden_in = torch.unsqueeze(hidden_in, 2)

        activated = self.act(
            torch.matmul(self.weight_ih, inp)
            + torch.matmul(self.weight_hh, hidden_in)
            + self.bias)

        if self.ct:
            # Leaky update: blend the old state with the new activation.
            alpha = 1 / self.tau
            hidden_out = (1 - alpha) * hidden_in + alpha * activated
        else:
            hidden_out = activated

        output = torch.matmul(self.weight_ho, hidden_out) + self.bias_out
        hidden_out = torch.squeeze(hidden_out, 2)
        output = torch.squeeze(output, 2)

        return [hidden_out, output]

    def init_hidden(self, batch_s):
        """Return an all-zero hidden state of shape (batch_s, hidden_size).

        The state is created on the device (and with the dtype) of the
        recurrent weights, so it always matches the model's parameters.
        """
        return torch.zeros(batch_s, self.hidden_size,
                           device=self.weight_hh.device,
                           dtype=self.weight_hh.dtype)

### Define the function to get the fixed point. 

In [19]:
def get_fixed_point(config):
    """Sample a random target fixed point.

    Args:
        config: object exposing ``dim_in`` (vector length), ``std_fp`` and
            ``mean_fp`` (scale and shift applied to the standard normal draw).

    Returns:
        Tensor of shape (1, dim_in) placed on ``DEVICE``.
    """
    # Drawn from a standard normal first, then moved to the target device.
    standard_normal = torch.randn(1, config.dim_in).to(DEVICE)
    return standard_normal * config.std_fp + config.mean_fp

def add_noise(fixed_point, mean=0, std=0.1):
    """Return ``fixed_point`` perturbed by Gaussian noise.

    Args:
        fixed_point: tensor to perturb; it is not modified in place.
        mean: constant offset added to every element.
        std: scale applied to the standard normal noise.

    Returns:
        A new tensor with the same shape as ``fixed_point``.
    """
    # The noise follows the input tensor's own device rather than a
    # module-level default, so the addition never mixes devices.
    noise = torch.randn(fixed_point.size()).to(fixed_point.device)
    return fixed_point + noise * std + mean

### Define the training function

In [43]:
def train_fixed_point(config, model, fixed_point):
    """Train ``model`` to reproduce ``fixed_point`` at every time step.

    In each epoch the model is unrolled for ``config.time_steps`` steps from
    a fresh hidden state, always fed the fixed point (optionally perturbed by
    noise), and the MSE between its output and the clean fixed point is
    summed over the steps before one optimizer update is taken.

    Args:
        config: object exposing ``optimizer_type`` ('Adam' or 'SGD'), ``lr``,
            ``wdecay``, ``momentum`` (SGD only), ``num_ep``, ``batch_size``,
            ``time_steps`` and ``add_noise``.
        model: module called as ``model(inp, hidden)`` returning
            ``[hidden, output]`` and providing ``init_hidden(batch_size)``.
        fixed_point: target tensor; it is never modified or rebound here.

    Returns:
        Tuple ``(model, optimizer, task_loss_list, ep_list)`` where
        ``task_loss_list`` holds the per-epoch loss averaged over time steps
        and ``ep_list`` the matching 1-based epoch numbers.

    Raises:
        NotImplementedError: if ``config.optimizer_type`` is not supported.
    """
    # Only parameters that still require gradients are optimized (frozen
    # biases are skipped).
    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    if config.optimizer_type == 'Adam':
        optimizer = torch.optim.Adam(trainable_params,
                                     lr=config.lr,
                                     weight_decay=config.wdecay)
    elif config.optimizer_type == 'SGD':
        optimizer = torch.optim.SGD(trainable_params,
                                    lr=config.lr,
                                    momentum=config.momentum,
                                    weight_decay=config.wdecay)
    else:
        raise NotImplementedError('optimizer not implemented')

    criterion = nn.MSELoss()
    task_loss_list = []
    ep_list = []

    model.train()
    for ep in range(1, config.num_ep + 1):
        hidden = model.init_hidden(config.batch_size)
        optimizer.zero_grad()
        loss = 0.0
        for _ in range(config.time_steps):
            # Draw fresh noise around the clean fixed point on every step.
            # Rebinding `fixed_point` to its noisy version would let the noise
            # accumulate and make the training target drift over time.
            if config.add_noise:
                inp = add_noise(fixed_point)
            else:
                inp = fixed_point
            hidden, output = model(inp, hidden)
            loss = loss + criterion(output, fixed_point)

        loss.backward()
        optimizer.step()

        mean_loss = (loss / config.time_steps).item()
        task_loss_list.append(mean_loss)
        ep_list.append(ep)
        if ep % 100 == 0:
            print('TRAIN | Epoch: {}/{} | Loss: {:.2f}'.format(ep, config.num_ep, mean_loss))

    return model, optimizer, task_loss_list, ep_list


### Define the function to get different configs

In [57]:
def vary_config(base_config, config_ranges, mode,
                num_seed=1, default_name=False):
    """Return a table of configurations derived from a base configuration.

    adapted from https://github.com/gyyang/olfaction_evolution

    Args:
        base_config: a base configuration object; it is deep-copied for every
            generated config and never modified
        config_ranges: a dictionary of hyperparameters values
            config_ranges = {
                'hp1': [hp1_val1, hp1_val2, ...],
                'hp2': [hp2_val1, hp2_val2, ...],
            }
        mode: str, can take 'combinatorial' (every combination of values) or
            'sequential' (the i-th value of every hyperparameter together;
            stops at the shortest value list)
        num_seed: int, number of random seeds; every setting is repeated once
            per seed in ``range(num_seed)``
        default_name: bool, if True each config is named by its zero-padded
            row index, otherwise by its hyperparameter values and seed
    Return:
        config_df: a pandas data frame of configs,
            each row is a config, each column is a variation of parameter,
            plus a 'seed' column and a 'config' column holding the objects.
            Every config also gets ``seed`` and ``model_name`` attributes.
    Raises:
        AssertionError: if 'seed' is a key of config_ranges or a value list
            is empty
        ValueError: if mode is neither 'combinatorial' nor 'sequential'
    """
    assert 'seed' not in config_ranges.keys(), 'seed cannot be specified in config range'

    keys = list(config_ranges.keys())
    dims = [len(config_ranges[k]) for k in keys]

    # For every setting, work out which entry of each value list is used.
    if mode == 'combinatorial':
        # config_ranges should not have repetitive values
        n_max = int(np.prod(dims))
        assert n_max > 0
        index_sets = [np.unravel_index(i, shape=dims) for i in range(n_max)]
    elif mode == 'sequential':
        # config_ranges values should have equal length,
        # otherwise this will only loop through the shortest one
        n_max = int(np.min(dims))
        assert n_max > 0
        index_sets = [(i,) * len(keys) for i in range(n_max)]
    else:
        raise ValueError('Unknown mode {}'.format(str(mode)))

    # Characters that are not suitable for a path are dropped or replaced.
    path_safe = str.maketrans({',': None, "'": None,
                               ' ': '_', '[': '_', ']': '_', '.': '_'})

    attribute_dict = {key: [] for key in keys}
    attribute_dict['seed'] = []
    attribute_dict['config'] = []

    for seed in range(num_seed):
        for indices in index_sets:
            new_config = deepcopy(base_config)
            new_config.seed = seed

            name = 'model'
            for key, index in zip(keys, indices):
                val = config_ranges[key][index]
                setattr(new_config, key, val)
                attribute_dict[key].append(val)
                # Name from the value actually stored on the config; values
                # read back from the data frame may have been dtype-coerced
                # (e.g. None in a numeric column becomes NaN).
                name += '_' + str(key) + str(val)

            if default_name:
                row_index = len(attribute_dict['config'])
                new_config.model_name = str(row_index).zfill(6)
            else:
                new_config.model_name = (name.translate(path_safe)
                                         + '_s' + str(seed))

            attribute_dict['seed'].append(seed)
            attribute_dict['config'].append(new_config)

    return pd.DataFrame(attribute_dict)

def configs_df_unpack(configs_df):
    """
    Extract the config objects stored in an experiment table.
    args:
        configs_df: pandas.DataFrame with a 'config' column that contains all
            configs in an experiment
    return:
        config_list: the entries of the 'config' column as a plain list,
            in row order
    """
    return configs_df['config'].tolist()

## Train the network. 

In [60]:
def learn_fixed_point():
    """Build the experiment grid for the fixed-point learning task.

    Starts from the default ``LearnFixedPointConfig`` and crosses three
    input dimensions with two nonlinearities and two initializations, using
    a single seed.

    Returns:
        Data frame of configurations as produced by ``vary_config``.
    """
    sweep = OrderedDict([
        ('dim_in', [10, 50, 100]),
        ('nonlinearity', [None, 'relu']),
        ('initialization', [None, 'uniform']),
    ])
    return vary_config(LearnFixedPointConfig(),
                       sweep,
                       mode='combinatorial',
                       num_seed=1)

In [61]:
# Train one freshly initialized network per configuration in the grid.
configs_df = learn_fixed_point()
configs = configs_df_unpack(configs_df)
for config in configs:
    print(config)
    # The fixed point and the hidden state are created on DEVICE, so the
    # model parameters have to live there too; otherwise the first matmul
    # fails with a device mismatch whenever CUDA is available.
    model = VanillaRNNCell(config).to(DEVICE)
    fixed_point = get_fixed_point(config)
    print('Fixed point {} \n'.format(fixed_point))
    model, optimizer, task_loss_list, ep_list = train_fixed_point(config, model, fixed_point)
    print('-------------------------------------------------------------------')

{'hidden_size': 100, 'nonlinearity': None, 'dim_in': 10, 'batch_size': 1, 'initialization': None, 'use_bias': True, 'time_steps': 10, 'ct': False, 'tau': 0, 'lr': 0.001, 'optimizer_type': 'SGD', 'momentum': 0, 'wdecay': 0, 'num_ep': 1000, 'add_noise': False, 'std_fp': 1, 'mean_fp': 0, 'seed': 0, 'model_name': 'model_dim_in10_nonlinearityNone_initializationNone_s0'}
Fixed point tensor([[ 0.1442, -0.8658,  0.0773, -1.7251,  0.5583,  1.3073,  0.2807,  0.4634,
          1.8128, -0.8622]]) 

TRAIN | Epoch: 100/1000 | Loss: 0.68
TRAIN | Epoch: 200/1000 | Loss: 0.46
TRAIN | Epoch: 300/1000 | Loss: 0.30
TRAIN | Epoch: 400/1000 | Loss: 0.20
TRAIN | Epoch: 500/1000 | Loss: 0.14
TRAIN | Epoch: 600/1000 | Loss: 0.09
TRAIN | Epoch: 700/1000 | Loss: 0.06
TRAIN | Epoch: 800/1000 | Loss: 0.04
TRAIN | Epoch: 900/1000 | Loss: 0.03
TRAIN | Epoch: 1000/1000 | Loss: 0.02
-------------------------------------------------------------------
{'hidden_size': 100, 'nonlinearity': None, 'dim_in': 10, 'batch_size'