In [1]:
import numpy as np
import scipy
from chronos import BaseChronosPipeline
import torch
from torch import nn
import matplotlib.pyplot as plt
import json

In [4]:
class Custom_linear(nn.Module):
    """Bias-free linear layer that also records input statistics.

    The module behaves exactly like ``nn.Linear(..., bias=False)`` using the
    supplied ``weights``. As a side effect every forward pass adds the
    un-normalised second-moment matrix ``X^T X`` of the incoming activations
    (flattened to ``(-1, in_features)``) to ``auto_cor_matrix``.

    Attributes:
        layer: the wrapped ``nn.Linear`` that owns ``weights``.
        c: number of forward calls seen so far (calls, not rows).
        auto_cor_matrix: float32 ``(in_features, in_features)`` accumulator.

    Args:
        weights: ``nn.Parameter`` of shape ``(out_features, in_features)``.
    """

    def __init__(self, weights):
        super().__init__()
        out_features, in_features = weights.shape
        self.layer = nn.Linear(in_features = in_features,
                               out_features = out_features, bias = False)
        self.layer.weight = weights
        self.c = 0
        # Size and device follow the weights instead of a fixed 768 / 'cuda'.
        # Kept as a plain attribute (not a buffer) so that casting the model
        # to a low-precision dtype never degrades the accumulator.
        self.auto_cor_matrix = torch.zeros(in_features, in_features,
                                           dtype = torch.float32,
                                           device = weights.device)

    def forward(self, x):
        """Accumulate ``X^T X`` for ``x`` and return ``layer(x)``."""
        self.c += 1

        with torch.no_grad():
            # reshape (not view) so non-contiguous activations are accepted.
            # Cast to float32 *before* the product: forming it in bf16/fp16
            # would round every entry before it reaches the accumulator.
            tmp = x.detach().reshape(-1, x.shape[-1]).to(torch.float32)
            if self.auto_cor_matrix.device != tmp.device:
                # The accumulator is not moved by Module.to(); follow the data.
                self.auto_cor_matrix = self.auto_cor_matrix.to(tmp.device)
            self.auto_cor_matrix += tmp.T @ tmp

        return self.layer(x)
        
def process_linear_layer(linear_layer):
    """Wrap a linear layer in a statistics-collecting ``Custom_linear``.

    The parameter handed to ``Custom_linear`` is the *last* one yielded by
    ``linear_layer.parameters()`` (``None`` if the layer has no parameters).
    """
    params = list(linear_layer.parameters())
    weights = params[-1] if params else None
    return Custom_linear(weights)

def process_model(model):
    """Replace self-attention projections with statistics-collecting wrappers.

    For block indices 0..11, the ``q``, ``k``, ``v`` and ``o`` attributes of
    ``block[i].layer[0].SelfAttention`` are replaced in place, first in
    ``model.encoder`` and then in ``model.decoder``, by the result of
    ``process_linear_layer``. The model is mutated; nothing is returned.
    """
    stack_names = ('encoder', 'decoder')
    projection_names = ('q', 'k', 'v', 'o')

    for idx in range(12):
        for stack_name in stack_names:
            for proj in projection_names:
                attention = getattr(model, stack_name).block[idx].layer[0].SelfAttention
                setattr(attention, proj,
                        process_linear_layer(getattr(attention, proj)))

In [20]:
class Reduced_Linear(nn.Module):
    """Rank-``rank`` factorisation ``linear2(linear1(x))`` of a dense layer.

    The weight ``W`` (shape ``(out_features, in_features)``) is taken from the
    last parameter of ``dense_linear_layer`` and approximated by two bias-free
    linear layers.

    Args:
        dense_linear_layer: module whose last parameter is the weight matrix.
            For any ``type`` other than ``'matrix'`` it must also expose
            ``auto_cor_matrix`` (accumulated ``X^T X`` of its inputs, shape
            ``(in_features, in_features)``) and ``c`` (the count it is
            normalised by).
        rank: requested rank; clamped to ``min(out_features, in_features)``.
        type: ``'matrix'`` for a plain truncated SVD of ``W``. Any other value
            selects the activation-aware variant, which truncates the SVD of
            ``W @ R^{1/2}`` with ``R = auto_cor_matrix / c + 0.1 * I`` and
            folds ``R^{-1/2}`` back into the first factor.

    Raises:
        ValueError: if the layer has no parameters, or (activation-aware
            variant) if ``c`` is zero or ``auto_cor_matrix`` has the wrong
            shape.
    """

    def __init__(self, dense_linear_layer, rank = 1000, type = 'matrix'):
        super(Reduced_Linear, self).__init__()

        params = list(dense_linear_layer.parameters())
        if not params:
            raise ValueError("dense_linear_layer has no parameters to factorise")
        matrix = params[-1].detach()
        device = matrix.device
        out_features, in_features = matrix.shape

        # A rank above min(out, in) cannot be represented by the SVD factors
        # and previously made copy_ fail (e.g. the default 1000 on 768x768).
        rank = min(rank, out_features, in_features)

        if type == 'matrix':
            U, S, Vh = torch.linalg.svd(matrix.float(), full_matrices=False)
            first = S[:rank, None] * Vh[:rank]
            second = U[:, :rank]
        else:
            if not dense_linear_layer.c:
                raise ValueError("no calibration passes recorded (c == 0)")
            stats = dense_linear_layer.auto_cor_matrix
            if tuple(stats.shape) != (in_features, in_features):
                raise ValueError(
                    "auto_cor_matrix must have shape (in_features, in_features)"
                )

            # Work on CPU in float64: the matrices are small and the inverse
            # square root is sensitive to rounding.
            ridge = 0.1
            R_x = stats.detach().double().cpu() / dense_linear_layer.c
            R_x = (R_x + R_x.T) / 2 + ridge * torch.eye(in_features, dtype=torch.float64)

            # R_x is symmetric positive definite, so its principal square root
            # and inverse square root come from one eigendecomposition.
            eigvals, eigvecs = torch.linalg.eigh(R_x)
            root = eigvals.clamp_min(ridge).sqrt()
            R_sq = (eigvecs * root) @ eigvecs.T
            R_fract_inv = (eigvecs / root) @ eigvecs.T

            # The statistics describe the layer *inputs*, so they must weight
            # the input (column) side of W: minimise ||(W - W') R^{1/2}||_F.
            Q = matrix.double().cpu() @ R_sq
            U, S, Vh = torch.linalg.svd(Q, full_matrices=False)
            first = ((S[:rank, None] * Vh[:rank]) @ R_fract_inv).float()
            second = U[:, :rank].float()

        self.linear1 = nn.Linear(in_features=in_features, out_features=rank, bias=False)
        self.linear2 = nn.Linear(in_features=rank, out_features=out_features, bias=False)

        with torch.no_grad():
            self.linear1.weight.copy_(first)
            self.linear2.weight.copy_(second)

        # Live on the same device as the layer being replaced.
        self.linear1 = self.linear1.to(device)
        self.linear2 = self.linear2.to(device)

    def forward(self, x):
        """Apply the two low-rank factors in sequence."""
        out = self.linear1(x)
        out = self.linear2(out)
        return out

In [7]:
def reduce_model(model, rank, type):
    """Swap encoder self-attention projections for low-rank replacements.

    For encoder block indices 0..11 the ``q``, ``k``, ``v`` and ``o``
    attributes of ``block[i].layer[0].SelfAttention`` are replaced in place by
    ``Reduced_Linear(<old layer>, rank, type)``. The block index is printed
    before each block is processed. The decoder is not touched.
    """
    projection_names = ('q', 'k', 'v', 'o')

    for idx in range(0, 12):
        print(idx)
        for proj in projection_names:
            attention = model.encoder.block[idx].layer[0].SelfAttention
            dense = getattr(attention, proj)
            setattr(attention, proj, Reduced_Linear(dense, rank, type))
        # The decoder self-attention projections are deliberately left dense.

In [8]:
def evaluate_method(data_train,data_test, method, rank):
    """Forecast with a freshly loaded, low-rank-reduced Chronos-Bolt model.

    Steps: load ``amazon/chronos-bolt-base`` in bfloat16 on CUDA, wrap its
    attention projections with ``process_model``, run one prediction on
    ``data_train`` (its output is discarded; the pass feeds the wrappers),
    reduce the model with ``reduce_model(..., rank, method)``, cast it to
    float32 and predict again.

    Args:
        data_train: context tensor passed to ``pipeline.predict``.
        data_test: only ``data_test.shape[1]`` is used, as the horizon.
        method: forwarded to ``reduce_model`` as its ``type`` argument.
        rank: forwarded to ``reduce_model``.

    Returns:
        NumPy array: index 4 along axis 1 of the second forecast
        (presumably the median quantile — TODO confirm against the pipeline).
    """
    horizon = data_test.shape[1]

    pipeline = BaseChronosPipeline.from_pretrained(
        "amazon/chronos-bolt-base",
        device_map="cuda",  # use "cpu" for CPU inference and "mps" for Apple Silicon
        torch_dtype=torch.bfloat16,
    )

    def run_forecast():
        return pipeline.predict(context=data_train.to('cuda'),
                                prediction_length=horizon)

    # Calibration pass: result unused, run only for the wrappers' side effects.
    process_model(pipeline.inner_model)
    run_forecast()

    reduce_model(pipeline.inner_model, rank, method)
    pipeline.inner_model.float()

    reduced_forecast = run_forecast()
    return reduced_forecast.float().cpu().numpy()[:,4]

In [21]:
# Build sliding-window context/target pairs from the taxi dataset.
# The file is read line by line; each line is parsed as JSON and its
# 'target' entry is turned into a tensor.
data_train = []
data_test = []
with open('taxi_data/taxi_30min/train/train.json', 'r') as f:
    for l in f:
        x = torch.tensor(json.loads(l)['target'])
        cur_train = []
        cur_test = []
        # Earlier variant (two fixed windows per series), kept for reference:
        #cur_train = [x[:512], x[512:1024]]
        #cur_test = [x[512:512+10], x[1024:1024+10]]
        # Ten windows per series, shifted by one step each, starting at offset 10:
        # 512 context values followed by the next 10 values as the target.
        # NOTE(review): assumes every series has at least 541 values so that
        # all slices are full length — TODO confirm.
        for i in range(10):
            cur_train.append(
                x[i+10:i+10+512]
            )
            cur_test.append(
                x[i+10+512:i+10+512+10]
            )
        # One (windows, 512) / (windows, 10) tensor per series.
        data_train.append(torch.cat(cur_train).view(-1,512))
        data_test.append(torch.cat(cur_test).view(-1,10))

In [22]:
# Merge the per-series window tensors into one (N, 512) context tensor and
# one (N, 10) target tensor. The isinstance guards make the cell safe to
# re-run: once the names hold tensors, torch.cat would otherwise reject them.
if isinstance(data_train, list):
    data_train = torch.cat(data_train).view(-1, 512)
if isinstance(data_test, list):
    data_test = torch.cat(data_test).view(-1, 10)

In [35]:
# Forecast with the plain-SVD ('matrix') reduction at rank 150.
# Note: this rebinds `f`, which the data-loading cell used as its file handle.
f = evaluate_method(data_train, data_test, 'matrix', 150)

0
1
2
3
4
5
6
7
8
9
10
11


In [36]:
# RMSE of each forecast window (mean over axis 1), averaged over all windows.
np.mean(np.sqrt(np.mean((data_test.numpy() - f)**2, axis = 1)))

1.7865189