In [1]:
import sys
# Put the local 'structural_bloc_circ_utils' folder on the module search path;
# presumably this is where the project modules imported by later cells live.
sys.path.append('structural_bloc_circ_utils')

In [2]:
import ast
import datetime
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pytz
import time

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

In [3]:
from StructuralBlocCircModel import StructuralBlocCircModel

### Test block-circulant matrix multiplication

In [4]:
from bloc_circ_linear_layer import BlocCircLinear

In [5]:
# Sanity check: the layer's forward pass must agree with an explicit product
# against the full matrix that the layer reports for itself.

# Dimensions of the BlocCircLinear layer under test
row_dim = 8
col_dim = 12
n_blocks = 4
layer = BlocCircLinear(row_dim=row_dim, col_dim=col_dim, n_blocks=n_blocks)

# Random input batch: batch_size samples with col_dim features each
batch_size = 3
x = torch.randn(batch_size, col_dim)

# Path 1: the layer's own forward pass
y_forward = layer(x)

# Path 2: build the full matrix A and multiply explicitly
# (x is a batch of row vectors, hence the transpose of A on the right)
A = layer.get_full_circ_mat()
y_manual = x @ A.T

# Show both results
print("Result from forward method:", y_forward, sep="\n")
print("\nResult from manual multiplication with full matrix A:", y_manual, sep="\n")

# Compare the two paths up to a small absolute tolerance
results_agree = torch.allclose(y_forward, y_manual, atol=1e-6)
print("\nThe results match!" if results_agree else "\nThere is a discrepancy between the results.")


Result from forward method:
tensor([[ 5.5332,  0.8648,  3.4750,  1.9541, -5.0767, -5.2086,  0.9053,  3.0476],
        [ 3.9707, -2.1824,  1.4220,  3.8938, -8.7572,  0.2643,  0.2022, -1.6428],
        [-2.0725,  4.2697,  3.9691, -5.7047, -4.0770,  1.2386, -1.6909,  0.7616]],
       grad_fn=<ViewBackward0>)

Result from manual multiplication with full matrix A:
tensor([[ 5.5332,  0.8648,  3.4750,  1.9541, -5.0767, -5.2086,  0.9053,  3.0476],
        [ 3.9707, -2.1824,  1.4220,  3.8938, -8.7572,  0.2643,  0.2022, -1.6428],
        [-2.0725,  4.2697,  3.9691, -5.7047, -4.0770,  1.2386, -1.6909,  0.7616]],
       grad_fn=<MmBackward0>)

The results match!


### train and save models

In [4]:
# Model configurations, looked up by config name (e.g. 'config2') in the training cell.
config_path = 'configs__StructuralCircDiagModels__bias=False_nonlinearity=SiLU__every_layer_except_last.json'
with open(config_path) as config_file:
    bloc_circ_configs = json.load(config_file)

In [5]:
# Hyperparameters shared by every training run in this notebook.
# To train on another MNIST-like dataset, set dataset to 'MNIST' or 'Kuzushiji-MNIST'.
training_config = dict(
    dataset='fashion-MNIST',
    starting_learning_rate=0.005,
    weight_decay=2e-5,
    scheduler='cosine annealing',
    T_max=200,
    eta_min=1e-05,
    stopping_epoch=200,
    train_loader_batch_size=512,
)

In [6]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [7]:
def _load_train_and_test(dataset_cls):
    """Return the (train, test) splits of `dataset_cls`, stored under /tmp as tensors."""
    return tuple(
        dataset_cls(
            root="/tmp",
            train=is_train,
            transform=torchvision.transforms.ToTensor(),
            download=True,
        )
        for is_train in (True, False)
    )


# All three datasets are loaded up front; the training cell picks one by name.
train_MNIST, test_MNIST = _load_train_and_test(torchvision.datasets.MNIST)

train_fashion_MNIST, test_fashion_MNIST = _load_train_and_test(torchvision.datasets.FashionMNIST)

train_Kuzushiji_MNIST, test_Kuzushiji_MNIST = _load_train_and_test(torchvision.datasets.KMNIST)

In [None]:
import random

# Draw eight seeds uniformly from the unsigned 32-bit range [0, 2**32 - 1].
n_seeds, seed_upper_bound = 8, 2**32
list_rand_seeds = [random.randrange(seed_upper_bound) for _ in range(n_seeds)]
print(list_rand_seeds)

In [None]:
EST_time_zone = pytz.timezone('US/Eastern')

# Flattened 28x28 images have 784 features; 16 zeros are appended so that each
# input vector has 800 entries.
N_PAD_FEATURES = 16


def _flatten_and_pad(images, n_pad=N_PAD_FEATURES):
    """Flatten each image in the batch to a vector and append `n_pad` zeros to it."""
    flat_images = images.view(images.size(0), -1)
    return nn.functional.pad(flat_images, (0, n_pad))


list_config_names = ['config2', 'config3', 'config4', 'config5', 'config6', 'config7']

# Train one model per (configuration, random seed) pair. After each run the
# results row is appended to a CSV file and the model/optimizer states are saved.
for config_name in list_config_names:
    for rand_seed in list_rand_seeds:
        model_config = bloc_circ_configs[config_name]

        # Everything that describes this run; the results are added further down
        # and the whole dict becomes one row of the results spreadsheet.
        dict_to_save = {
            'model_name': model_config['model_name'],
            'dataset': training_config['dataset'],
            'layer_sizes': model_config['layer_sizes'],
            'config_name': model_config['config_name'],
            'n_blocks_list': model_config['n_blocks_list'],
            'if_bias': model_config['if_bias'],
            'nonlinear_activation': model_config['nonlinear_activation'],
            'starting_learning_rate': training_config['starting_learning_rate'],
            'weight_decay': training_config['weight_decay'],
            'scheduler': training_config['scheduler'],
            'T_max': training_config['T_max'],
            'eta_min': training_config['eta_min'],
            'stopping_epoch': training_config['stopping_epoch'],
            'train_loader_batch_size': training_config['train_loader_batch_size'],
            'random_seed': rand_seed
        }

        print(dict_to_save)

        # The final metrics are taken from the last epoch, so at least one is required.
        if dict_to_save['stopping_epoch'] < 1:
            raise ValueError("stopping_epoch must be at least 1")

        ### output locations
        # Built with os.path.join so the separator is right on every platform, and
        # created up front so a missing folder fails before training, not after it.
        dir_results_root = f"structural_bloc_circ_dataset={dict_to_save['dataset']}"
        dir_sbc_model_folder = os.path.join(
            dir_results_root,
            f"models__dataset={dict_to_save['dataset']}_"
            f"bias={dict_to_save['if_bias']}_"
            f"nonlinearity={dict_to_save['nonlinear_activation']}"
        )
        os.makedirs(dir_sbc_model_folder, exist_ok=True)

        df_path = os.path.join(dir_results_root,
                               f"test_results__SBCmodels__dataset={dict_to_save['dataset']}"
                               f"_bias={dict_to_save['if_bias']}"
                               f"_nonlinearity={dict_to_save['nonlinear_activation']}.csv")
        # Re-read the spreadsheet on every run so rows written earlier are kept.
        if os.path.exists(df_path):
            df_results = pd.read_csv(df_path)
        else:
            df_results = pd.DataFrame()

        if dict_to_save['dataset'] == 'MNIST':
            train_data = train_MNIST
            test_data = test_MNIST
        elif dict_to_save['dataset'] == 'fashion-MNIST':
            train_data = train_fashion_MNIST
            test_data = test_fashion_MNIST
        elif dict_to_save['dataset'] == 'Kuzushiji-MNIST':
            train_data = train_Kuzushiji_MNIST
            test_data = test_Kuzushiji_MNIST
        else:
            raise ValueError("Unknown dataset name")

        # Seed before building the loaders and the model so that shuffling and
        # weight initialisation are tied to this run's seed.
        torch.manual_seed(dict_to_save['random_seed'])
        np.random.seed(dict_to_save['random_seed'])

        train_loader = DataLoader(dataset=train_data, shuffle=True, batch_size=dict_to_save['train_loader_batch_size'])
        test_loader = DataLoader(dataset=test_data, shuffle=False, batch_size=1000)

        # Initialize the model
        sbc_model = StructuralBlocCircModel(
            layer_sizes=dict_to_save['layer_sizes'],
            n_blocks_list=dict_to_save['n_blocks_list'],
            if_bias=dict_to_save['if_bias'],
            nonlinearity=dict_to_save['nonlinear_activation']
        )
        sbc_model.to(device)

        # Initialize the loss function and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.AdamW(sbc_model.parameters(),
                                lr=dict_to_save['starting_learning_rate'],
                                weight_decay=dict_to_save['weight_decay'])

        if dict_to_save['scheduler'] == 'cosine annealing':
            scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=dict_to_save['T_max'], eta_min=dict_to_save['eta_min'])
        else:
            raise ValueError("scheduler is unspecified")

        t_start = time.time()

        # Recorded once every 10 epochs (see below)
        train_losses = []
        train_accuracies = []

        for epoch in range(dict_to_save['stopping_epoch']):
            epoch_start_time = time.time()  # Start timing the epoch

            sbc_model.train()
            running_loss = 0.0
            correct_train = 0
            total_train = 0

            for images, labels in train_loader:
                images, labels = images.to(device), labels.to(device)
                images = _flatten_and_pad(images)

                optimizer.zero_grad()
                outputs = sbc_model(images)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()

                # Running training accuracy: predicted class = index of the largest logit
                predicted = outputs.argmax(dim=1)
                total_train += labels.size(0)
                correct_train += (predicted == labels).sum().item()

            # The learning-rate schedule advances once per epoch
            scheduler.step()

            # Save loss and accuracy only every 10 epochs
            if (epoch + 1) % 10 == 0:
                train_loss = running_loss / len(train_loader)
                train_accuracy = 100 * correct_train / total_train

                train_losses.append(train_loss)
                train_accuracies.append(train_accuracy)

                print(f"Epoch [{epoch+1}/{dict_to_save['stopping_epoch']}], Training Loss: {train_loss:.4f}, Training Accuracy: {train_accuracy:.2f}%")

                elapsed_time = time.time() - epoch_start_time
                print(f"Time for Epoch [{epoch+1}/{dict_to_save['stopping_epoch']}]: {elapsed_time:.2f} seconds")

        # Final metrics: the training numbers come from the last epoch's running
        # totals; the test accuracy from a single pass over the test set.
        final_train_loss = running_loss / len(train_loader)
        final_train_accuracy = 100 * correct_train / total_train

        sbc_model.eval()
        correct_test = 0
        total_test = 0
        with torch.no_grad():
            for images, labels in test_loader:
                images, labels = images.to(device), labels.to(device)
                images = _flatten_and_pad(images)

                outputs = sbc_model(images)
                predicted = outputs.argmax(dim=1)
                total_test += labels.size(0)
                correct_test += (predicted == labels).sum().item()

        final_test_accuracy = 100 * correct_test / total_test
        print(f"Final Training Loss: {final_train_loss:.4f}, Final Training Accuracy: {final_train_accuracy:.2f}%, Final Test Accuracy: {final_test_accuracy:.2f}%")

        # Total wall-clock time of the run (training plus the final evaluation)
        t_end = time.time()

        # The timestamp makes the saved file names unique per run
        current_time_est = datetime.datetime.now(EST_time_zone)
        time_str = current_time_est.strftime("%Y-%m-%d_%H-%M-%S")

        dict_to_save['time'] = time_str
        dict_to_save['train_losses'] = ', '.join(f"{loss_value:.8f}" for loss_value in train_losses) # save in string format
        dict_to_save['train_accuracies'] = ', '.join(f"{accuracy:.4f}" for accuracy in train_accuracies) # save in string format
        dict_to_save['train_accuracy'] = final_train_accuracy
        dict_to_save['test_accuracy'] = final_test_accuracy
        dict_to_save['time_consumption(s)'] = t_end - t_start

        # Training-accuracy curve, one point per recorded epoch (10, 20, ...)
        plt.figure(figsize=(10, 5))
        plt.plot(list(range(10, dict_to_save['stopping_epoch']+1, 10)), train_accuracies, label='training accuracy')
        plt.xlabel('epoch')
        plt.ylabel('training accuracy (%)')
        plt.title(f"{dict_to_save['config_name']}, rand_seed={dict_to_save['random_seed']}")
        plt.legend()
        plt.show()

        # save result to spreadsheet (written after every run so a crash loses at most one row)
        df_results = pd.concat([df_results, pd.DataFrame([dict_to_save])], ignore_index=True)
        df_results.to_csv(df_path, index=False)

        ### save the model
        checkpoint_name = f"{dict_to_save['model_name']}__data={dict_to_save['dataset']}__{time_str}.pt"
        torch.save(sbc_model.state_dict(), os.path.join(dir_sbc_model_folder, checkpoint_name))
        torch.save(optimizer.state_dict(), os.path.join(dir_sbc_model_folder, f"optimizer__{checkpoint_name}"))
