# Compiler

## Network container class

In [26]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

class Program:
    """Structured container for the weights and biases of a layered network.

    ``topology`` lists the layer widths; layer ``l`` maps ``topology[l]``
    inputs to ``topology[l + 1]`` outputs, so ``W[l]`` has shape
    ``(topology[l + 1], topology[l])`` and ``b[l]`` has ``topology[l + 1]``
    entries once initialised. ``activation_functions_list`` is stored as given,
    one entry per layer. Together they play the role of a program's source code.
    """

    def __init__(self, topology , activation_functions_list):
        self.topology = topology
        self.activation_functions_list = activation_functions_list
        # One placeholder per layer transition until weights are assigned.
        n_transitions = len(topology[:-1])
        self.W = [0] * n_transitions
        self.b = [0] * n_transitions

    def random_init_weights(self):
        """Fill ``W`` with sparse non-negative random weights and ``b`` with normal biases."""
        widths = zip(self.topology[:-1], self.topology[1:])
        for layer, (n_in, n_out) in enumerate(widths):
            # Squared normal samples: every weight starts out non-negative.
            weights = np.random.normal(size=(n_out, n_in)) ** 2
            n_rows, n_cols = weights.shape
            n_cells = n_rows * n_cols
            # Zero randomly drawn cells (drawn with replacement, so the same
            # cell may be hit more than once) to add some sparsity.
            for _ in range(n_cells - int(np.sqrt(n_cells))):
                row = np.random.choice(n_rows)
                col = np.random.choice(n_cols)
                weights[row, col] = 0.
            self.W[layer] = weights

            self.b[layer] = np.random.normal(size=n_out)

    def print(self):
        """Print the shapes of the weight matrix and bias vector of every layer."""
        for i in range(len(self.topology[:-1])):
            print("layer %d->%d "%(i,i+1))
            w_shape = str(self.W[i].shape)
            b_shape = str(self.b[i].shape)
            print("W.shape = %s \t b.shape = %s" % (w_shape, b_shape))

    def visualize_weights(self):
        """Show, for every layer, an image of which weights are strictly positive."""
        for weights in self.W:
            print(weights.shape)
            plt.figure(figsize=(10,10))
            positive_mask = weights > 0.
            plt.imshow(positive_mask)

## Sparsification algorithm

## IR production

### Lightweight trees representation

In [28]:
class c:
    """Lightweight n-ary tree node used to build IR statements.

    The first argument is the node name (``id``); any further positional
    arguments are the node's sons, which must themselves be ``c`` nodes.
    A node without sons is a leaf (e.g. ``ADD``, ``MUL``, a number, ...).
    """

    def __init__(self, ID, *args):
        self.id   = ID
        self.sons = list(args)

    def print(self, level = 0):
        """Print the subtree, one node per line, indented by one tab per depth level."""
        print( ("\t" * level) + str(self.id) )
        for s in self.sons:
            s.print(level + 1)

    def __str__(self):
        """Return the compact form ``ID(son1,son2,...)`` (just ``ID`` for a leaf)."""
        if not self.sons:
            return str(self.id)
        # Join on position, not on object comparison: the previous
        # ``s != self.sons[-1]`` test dropped the comma whenever the same
        # node object appeared both earlier in the list and as the last son.
        return str(self.id) + "(" + ",".join(str(s) for s in self.sons) + ")"

    def flatten(self):
        """Return the ids of the subtree in pre-order (node first, then each son)."""
        ret = [self.id]
        for s in self.sons:
            ret += s.flatten()
        return ret

### IR production function

In [185]:
def _temp(index):
    """Build the IR subtree TEMP(index)."""
    return c("TEMP", c(index))


def _const(value):
    """Build the IR subtree CONST(value)."""
    return c("CONST", c(value))


def IR(program, compile_time_data = True):
    """Lower every layer of ``program`` to a flat list of IR statement trees.

    For each layer the emitted sequence is:

    * ``COMMENT(START)``
    * ``MOVE(TEMP(acc_i), CONST(0))`` for every output row ``i``
    * ``MOVE(TEMP(acc_i), BINOP(ADD, TEMP(acc_i), BINOP(MUL, CONST(w_ij), TEMP(j))))``
      for every non-zero weight ``w_ij`` (zero weights emit nothing)
    * ``MOVE(TEMP(i), BINOP(ADD, TEMP(acc_i), CONST(b_i)))`` for every row
    * ``MOVE(TEMP(i), CALL(f, TEMP(i)))`` for every row, ``f`` being the
      layer's entry in ``program.activation_functions_list``
    * ``COMMENT(END)``

    where ``acc_i = i + R_offset``.

    Parameters
    ----------
    program : object exposing ``topology``, ``W``, ``b`` and
        ``activation_functions_list``; ``W[layer]`` must be a 2-D numpy array.
    compile_time_data : currently unused, kept for interface compatibility.

    Returns
    -------
    list of ``c`` trees, in execution order.
    """
    ret = list()

    for layer, t in enumerate(program.topology[1:]):
        ret.append(c("COMMENT", c("START")))

        f = program.activation_functions_list[layer]
        weights = program.W[layer]
        biases = program.b[layer]
        n_rows, n_cols = weights.shape
        n_out = len(biases)

        # Accumulators live at TEMP(R_offset + i). The layer reads its inputs
        # from TEMP(0..n_cols-1), so the offset has to clear the input range
        # too: with the former ``R_offset = t`` a layer with more inputs than
        # outputs (e.g. 30 -> 1) zeroed an input temp before it was consumed.
        R_offset = max(t, n_cols)

        # R[i] = 0
        for i in range(n_out):
            ret.append(c("MOVE", _temp(i + R_offset), _const(0)))

        # R[i] += W[i, j] * y[j]; zero weights contribute nothing, so sparsity
        # directly reduces the number of emitted statements.
        for i in range(n_rows):
            for j in range(n_cols):
                if weights[i, j] != 0:
                    product = c("BINOP", c("MUL"), _const(weights[i, j]), _temp(j))
                    total = c("BINOP", c("ADD"), _temp(i + R_offset), product)
                    ret.append(c("MOVE", _temp(i + R_offset), total))

        # y[i] = R[i] + b[i]. Biases are embedded as constants: their number
        # does not scale quadratically, unlike the weights.
        for i in range(n_out):
            biased = c("BINOP", c("ADD"), _temp(i + R_offset), _const(biases[i]))
            ret.append(c("MOVE", _temp(i), biased))

        # y[i] = f(y[i])
        for i in range(n_out):
            ret.append(c("MOVE", _temp(i), c("CALL", c(f), _temp(i))))

        ret.append(c("COMMENT", c("END")))
    return ret


## Register and memory allocation

### Temporary variables statistics class

In [186]:
class TemporaryVariablesStatistics:
    """Usage counter for temporary variables (one instance per IR block)."""

    def __init__(self):
        # temp variable -> number of times it has been seen
        self.temp_usage_map = {}

    def increment(self,temp_variable):
        """Record one more use of ``temp_variable``."""
        self.temp_usage_map[temp_variable] = self.temp_usage_map.get(temp_variable, 0) + 1

    def get_data(self):
        """Return the underlying ``{temp: usage}`` dictionary (not a copy)."""
        return self.temp_usage_map

    def vectorize(self):
        """Return a 2-column array ``[temp | usage]`` sorted by decreasing usage.

        An empty counter yields an empty ``(0, 2)`` array instead of raising
        on the column access.
        """
        if not self.temp_usage_map:
            return np.empty((0, 2), dtype=int)
        arr = np.array([[temp, usage] for temp, usage in self.temp_usage_map.items()])
        # Reverse an ascending argsort to obtain decreasing usage.
        return arr[arr[:, 1].argsort()[::-1]]


### Register allocation data class

In [187]:
class RegisterAllocationData:
    """Mapping from temporary variables to the register assigned to each."""

    def __init__(self):
        # temp variable -> register
        self.temp_reg_map = dict()

    def insert(self,temp_variable, register):
        """Assign ``register`` to ``temp_variable``, replacing any previous assignment."""
        self.temp_reg_map.update({temp_variable: register})

    def get_data(self):
        """Return the underlying ``{temp: register}`` dictionary (not a copy)."""
        return self.temp_reg_map

    def print(self):
        """Print one ``temp <tab> register`` line per assignment."""
        for temp, register in self.temp_reg_map.items():
            print(temp , "\t", register )

    def contains(self,tmp_name):
        """Tell whether ``tmp_name`` has been assigned a register."""
        return tmp_name in self.temp_reg_map.keys()

### Memory allocation data class

In [188]:
class MemoryAllocationData:
    """Mapping from temporary variables to the memory address assigned to each."""

    def __init__(self):
        # temp variable -> address
        self.temp_mem_map = dict()

    def insert(self,temp_variable, address):
        """Assign ``address`` to ``temp_variable``, replacing any previous assignment."""
        self.temp_mem_map.update({temp_variable: address})

    def get_data(self):
        """Return the underlying ``{temp: address}`` dictionary (not a copy)."""
        return self.temp_mem_map

    def get_variables_list(self):
        """Return a new list with the temporary variables, in insertion order."""
        return [variable for variable in self.temp_mem_map]

    def print(self):
        """Print one ``temp <tab> address`` line per assignment."""
        for temp, address in self.temp_mem_map.items():
            print(temp , "\t", address )

### Block Signals

In [189]:
class BlockSignals:
    """Per-tick usage signals of the memory-resident temporaries of one block.

    Every temporary known to ``memory_allocation_object`` at construction time
    gets a list that grows by one entry per ``add_tick`` call: ``1.`` when the
    temporary was used at that tick, ``0`` otherwise.
    """

    def __init__(self, memory_allocation_object):
        self.memory_allocation_object = memory_allocation_object
        # Start every known temporary with an empty signal.
        self.temp_signals_map = {
            temp: [] for temp in memory_allocation_object.get_data()
        }

    def add_tick(self, temp_variables):
        """Append one tick, marking the temporaries listed in ``temp_variables``.

        Names that are not memory-resident temporaries are ignored.
        """
        used_here = np.intersect1d(
            temp_variables,
            self.memory_allocation_object.get_variables_list(),
        )
        # First push a 0 for everybody ...
        for temp in self.memory_allocation_object.get_variables_list():
            self.temp_signals_map[temp].append(0)
        # ... then overwrite it with a 1 for the temporaries used at this tick.
        for temp in used_here:
            self.temp_signals_map[temp][-1] = 1.

    def get_data(self):
        """Return the underlying ``{temp: [signal per tick]}`` dictionary (not a copy)."""
        return self.temp_signals_map

### Embedding

### Theoretical argument : memory allocation

Suppose that we have a distance between the temporary variables used within each <b>matrix multiplication</b> block. <br>
This gives a collection of distance matrices $\{D_{1,2},D_{2,3},...,D_{N-1,N}\}$, one per block. <br>
We can then formulate the following optimization problem: <br><br>
$
    \text{Find $\{P_{1,2},P_{2,3},...,P_{N-1,N}\}$ permutations of 
    $\{ [1,n_{1,2}], [1,n_{2,3}],... ,[1,n_{N-1,N}] \}$ }
$ <br>
$
\text{such that
    $D(P_{i,i+1}) \sim D_{i,i+1} \ \ \ \forall i$
}
$ <br>
$
\text{Subject to
    $ (P_{i,i+1})_{\text{output (only memory)}} = (P_{i+1,i+2})_{\text{input (only memory)}}  \ \ \forall i$
}
$

### Proposed algorithm

$
\text{$P_0 \leftarrow$ optimal Permutation$(Temp_{\ell_0,\ell_1})$}\\
\text{${\bf for } \ \ \ i \geq 1,\ (\ell_i,\ell_{i+1}) \in temps $ : } \\
\hspace{2em} \text{$ P_i \leftarrow $ optimal Permutation$(Temp_{\ell_i,\ell_{i+1}})$ subject to $P_i^{input} = P_{i-1}^{output}$ }
$

### Constrained Permutation Class

In [39]:
class constrainedPermutation:
    """A permutation of ``0..n-1`` in which some positions are pinned.

    ``constraint`` is a vector of ``-1`` except at the fixed points of the
    permutation, e.g. ``-1 -1 4 3 -1`` reads as "all the permutations of
    ``01234`` whose third element is 4 and whose fourth element is 3".
    Any array-like is accepted; it is stored as a numpy array.

    Parameters
    ----------
    constraint : array-like of numbers, ``-1`` marking a free position.
    trivial_init : when true, start from the identity with the pinned values
        swapped into place; otherwise start from a copy of ``init_vector``.
    init_vector : initial permutation, required when ``trivial_init`` is false.

    Raises
    ------
    ValueError : if ``trivial_init`` is false and ``init_vector`` is ``None``.
    """

    def __init__(self, constraint, trivial_init = True, init_vector = None):
        # asarray keeps an ndarray as-is and makes the boolean masks below
        # work for plain lists too (``list != -1`` is a scalar, not a mask).
        self.constraint = np.asarray(constraint)
        if trivial_init:
            self.value = np.arange(len(self.constraint))
            for i in np.flatnonzero(self.constraint != -1):
                # Swap the required value from wherever it currently sits
                # into position i.
                j = np.argmax(self.value == self.constraint[i])
                self.value[i], self.value[j] = self.value[j], self.value[i]
        else:
            if init_vector is None:
                raise ValueError("init_vector is required when trivial_init is False")
            self.value = np.array(init_vector)

    def get_neighbor(self):
        """Return a new permutation with two random free positions swapped.

        The two positions are drawn independently, so they may coincide and
        the neighbor may equal the current permutation. When no position is
        free the neighbor is an unchanged copy.
        """
        feasible = np.flatnonzero(self.constraint == -1)
        ret = self.value.copy()
        if len(feasible) > 0:
            a = np.random.choice(len(feasible))
            b = np.random.choice(len(feasible))
            pos_a, pos_b = feasible[a], feasible[b]
            ret[pos_a], ret[pos_b] = ret[pos_b], ret[pos_a]
        return constrainedPermutation(self.constraint, False, ret)

    def get_indexes(self):
        """Return the permutation as a numpy array (not a copy)."""
        return self.value

    def print(self):
        """Print the permutation vector."""
        print(self.value)
# Empirical check of get_neighbor: average, position by position, the
# neighbors of the trivially initialised permutation whose positions 2, 3
# and 4 are pinned to the values 2, 3 and 4.
ret = []
for i in range(10000):
    ret += [constrainedPermutation(np.array([-1.,-1.,2,3,4,-1,-1,-1])).get_neighbor().value]
ret = np.array(ret)
np.mean(ret, axis = 0)

array([1.5267, 2.1128, 2.    , 3.    , 4.    , 4.4904, 5.1339, 5.7362])

### Allocator class

In [190]:
from sklearn.manifold import MDS
class Allocator:
    """Assign the temporaries of every IR block to registers or memory slots.

    The IR is split into blocks by its ``COMMENT(START)`` markers. For each
    block the most used temporaries receive the names in ``register_names``;
    the remaining ones are placed in memory, ordered along a 1-D MDS embedding
    of a distance computed from when each temporary is used.

    After construction:

    * ``register_allocation_data[k]`` is the RegisterAllocationData of block k
    * ``memory_allocation_data[k]``   is the MemoryAllocationData of block k

    NOTE(review): MDS is created without a ``random_state``, so the memory
    layout may differ from run to run.
    """

    def __init__(self, ir, register_names):
        self.register_allocation_data = []  # register allocations for every matrix multiplication operation
        self.memory_allocation_data   = []  #   memory allocations for every matrix multiplication operation
        # ------------------------------
        # Register allocation subroutine
        temp_statistics = self.most_used_temps(ir)   # one statistics object per block
        self.register_allocation_and_memory_alloc_init(temp_statistics, register_names)
        # ------------------------------
        # Memory allocation subroutine
        signals = self.compute_signals(ir)
        distances_and_mappings = self.compute_signals_distance_matrix(signals)
        embeddings = self.compute_embeddings(distances_and_mappings)
        self.memory_allocation(embeddings)

    @staticmethod
    def _temps_in_statement(ir_instruction):
        """Return the values that directly follow a "TEMP" id in the flattened statement."""
        flat = ir_instruction.flatten()
        return [val for tag, val in zip(flat[:-1], flat[1:]) if tag == "TEMP"]

    ########################################################################################
    # Register allocation
    ########################################################################################

    def most_used_temps(self, ir):
        """Count, block by block, how often every temporary appears.

        IN   : an intermediate representation (list of statement trees)
        OUT  : a list of TemporaryVariablesStatistics objects, one per block
        """
        statistics_per_block = list()

        for ir_instruction in ir:
            if ir_instruction.id == "COMMENT":
                # A START marker opens a new block; other comments are skipped.
                if ir_instruction.sons[0].id == "START":
                    statistics_per_block.append(TemporaryVariablesStatistics())
                continue
            for t in self._temps_in_statement(ir_instruction):
                statistics_per_block[-1].increment(t)
        return statistics_per_block

    def register_allocation_and_memory_alloc_init(self, statistics_list , register_names):
        """Give registers to the most used temporaries and open memory slots for the rest.

        IN  : the statistics list obtained from most_used_temps, the register names
        OUT : 0; the results are appended to ``register_allocation_data`` and
              ``memory_allocation_data`` (one entry per block)
        """
        for stat in statistics_list:
            # [temp | usage] table sorted by decreasing usage
            arr = stat.vectorize()

            # Registers are handed out from the top of the table; zip stops at
            # whichever of the two runs out first.
            reg_data = RegisterAllocationData()
            temp_var_count = 0
            for row, r in zip(arr, register_names):
                reg_data.insert(row[0], r)
                temp_var_count += 1
            self.register_allocation_data.append(reg_data)

            # Whatever did not get a register gets a memory slot, with -1 as
            # a placeholder address until memory_allocation() assigns one.
            mem_data = MemoryAllocationData()
            for t in range(temp_var_count, len(arr)):
                mem_data.insert(arr[t,0], -1)
            self.memory_allocation_data.append(mem_data)

        return 0

    ########################################################################################
    # Memory allocation
    ########################################################################################

    def compute_signals(self, ir):
        """Build one BlockSignals per block, with one tick per IR statement.

        COMMENT statements contain no TEMP, so they add an all-zero tick to
        the block that is current when they are met.
        """
        signals_per_block = list()

        curr_alloc_block = 0

        for ir_instruction in ir:
            if ir_instruction.id == "COMMENT" and ir_instruction.sons[0].id == "START":
                curr_memory_alloc_block = self.memory_allocation_data[curr_alloc_block]
                signals_per_block.append(BlockSignals(curr_memory_alloc_block))
                curr_alloc_block += 1

            signals_per_block[-1].add_tick(self._temps_in_statement(ir_instruction))
        return signals_per_block

    def compute_signals_distance_matrix(self, signals):
        """Compute, for every block, a distance matrix between its memory temporaries.

        The distance between two temporaries is the average, over the ticks at
        which one is used, of the gap to the nearest tick at which the other is
        used, averaged over both directions.

        IN   : a list of BlockSignals
        OUT  : (Ds, mappings) where Ds[k] is the matrix of block k and
               mappings[k] maps each row index of Ds[k] to its temporary
        """
        mappings = list()          # mapping between the rows of the matrix and the temp_var
        Ds = list()                # list of matrices

        for signals_block in signals:
            data = signals_block.get_data()
            names = list(data)
            # Tick indices at which each temporary is used, computed once per
            # temporary instead of once per pair.
            ticks = [
                np.arange(len(data[name]))[np.array(data[name]) == 1.]
                for name in names
            ]
            D = np.zeros((len(names), len(names)))
            mapping = dict(enumerate(names))
            for i, v_a in enumerate(ticks):
                for j, v_b in enumerate(ticks):
                    a_to_b = np.mean([ np.min(np.abs(s_1 - v_b)) for s_1 in v_a])
                    b_to_a = np.mean([ np.min(np.abs(s_2 - v_a)) for s_2 in v_b])
                    D[i,j] = 0.5 * (a_to_b + b_to_a)
            Ds.append(D)
            mappings.append(mapping)
        return Ds, mappings

    def compute_embeddings(self, distance_matrices_and_mappings):
        """Embed the temporaries of every block on a line with MDS.

        IN   : the (Ds, mappings) pair from compute_signals_distance_matrix
        OUT  : one entry per block: an array [ temp_var | position ] sorted by
               position, or an empty list when the block has no memory temporaries
        """
        distance_matrices = distance_matrices_and_mappings[0]
        mappings          = distance_matrices_and_mappings[1]
        embeddings = list()
        for mapping,D in zip(mappings,distance_matrices):
            if len(D) == 0:
                embeddings.append([])
                continue
            embedding = MDS( dissimilarity = 'precomputed',n_components = 1).fit_transform(D)
            # [ temp_var | embedding of the temp_var ]
            X = np.array([ [ mapping[row], position[0] ] for row, position in enumerate(embedding)])
            X = X[ np.argsort(X[:,1]) ]                           # order them by position
            embeddings.append(X)
        return embeddings

    def memory_allocation(self, embeddings):
        """Use the rank of each temporary along its block's embedding as its address."""
        for m,e in zip(self.memory_allocation_data,embeddings):
            for i,temp in enumerate(e):
                m.insert(int(temp[0]),i)

## Code generator

## Assembler

## Orchestrator function

In [195]:
def compiler(file_name, registers, sparsify = False):
    """Generate a random network, lower it to IR and print pseudo-assembly for it.

    Parameters
    ----------
    file_name : currently unused; no file is read, the network is generated
        randomly for debugging purposes.
    registers : sequence of register numbers. ``registers[2:]`` is handed to
        the allocator; the emitted text uses ``$0`` and ``$1`` as scratch.
    sparsify : currently unused.

    Returns
    -------
    None; the generated instructions are printed.

    NOTE(review): only the accumulator reset and the multiply-accumulate
    statements are translated. The bias addition and the activation CALL
    statements of the IR match neither pattern below and emit nothing.
    """
    # debug, we don't actually read a file but we generate it randomly
    rete = Program([5,30,30,1], ["RELU","RELU","LINEAR"])
    rete.random_init_weights()
    intermediate_representation = IR(rete)
    allocator = Allocator(intermediate_representation , registers[2:])

    cursor_allocator = 0    # index of the block whose allocation tables are in use

    for ir_statement in intermediate_representation:
        flattened = ir_statement.flatten()

        if flattened[0] == "COMMENT":
            if flattened[1] == "END":
                print("\n # NUOVO BLOCCO \n")
                cursor_allocator += 1
            continue
        if flattened[0] != "MOVE":
            continue

        in_registers = allocator.register_allocation_data[cursor_allocator]
        reg_of       = in_registers.get_data()
        address_of   = allocator.memory_allocation_data[cursor_allocator].get_data()

        if flattened[3] == "CONST":
            # MOVE(TEMP(tmp_name), CONST(val)): set the temporary variable
            tmp_name = flattened[2]
            val      = flattened[4]
            if in_registers.contains(tmp_name):
                print("MOV TO $%d THE VALUE #%f" % (reg_of[tmp_name], val))
            else:
                # The variable is in memory. The constant is printed with %g,
                # so a zero still reads "VALUE 0" while any other constant is
                # no longer silently replaced by 0.
                print("STORE THE VALUE %g IN ADDRESS %d" % (val, address_of[tmp_name]))

        elif flattened[3] == "BINOP" and len(flattened) == 13:
            # MOVE(TEMP(dest), BINOP(ADD, TEMP(dest), BINOP(MUL, CONST(w), TEMP(src))))
            neuron_dest   = flattened[2]
            neuron_source = flattened[12]
            weight        = flattened[10]

            # $0 <- weight * source
            if in_registers.contains(neuron_source):
                print("MULT $%d BY #%f AND SAVE IT IN $0" % (reg_of[neuron_source], weight))
            else:
                print("LOAD IN REGISTER $0 THE ADDRESS %d"  % address_of[neuron_source])
                print("MULT $0 BY #%f AND SAVE IT IN $0" % (weight))

            # dest <- dest + $0
            if in_registers.contains(neuron_dest):
                print("ADD $0 to $%d" % (reg_of[neuron_dest]))
            else:
                address = address_of[neuron_dest]
                print("LOAD IN REGISTER $1 THE ADDRESS %d" % address)
                print("ADD $0 to $1")
                print("STORE $1 IN ADDRESS %d" % address)
    
# Demo run: the file name is a dummy value, registers are numbered 0..9.
compiler(file_name = 0, registers = np.arange(10))

STORE THE VALUE 0 IN ADDRESS 0
STORE THE VALUE 0 IN ADDRESS 1
STORE THE VALUE 0 IN ADDRESS 2
STORE THE VALUE 0 IN ADDRESS 3
STORE THE VALUE 0 IN ADDRESS 4
STORE THE VALUE 0 IN ADDRESS 5
STORE THE VALUE 0 IN ADDRESS 6
STORE THE VALUE 0 IN ADDRESS 7
STORE THE VALUE 0 IN ADDRESS 8
STORE THE VALUE 0 IN ADDRESS 9
MOV TO $9 THE VALUE #0.000000
STORE THE VALUE 0 IN ADDRESS 10
STORE THE VALUE 0 IN ADDRESS 11
STORE THE VALUE 0 IN ADDRESS 12
MOV TO $8 THE VALUE #0.000000
STORE THE VALUE 0 IN ADDRESS 13
STORE THE VALUE 0 IN ADDRESS 14
STORE THE VALUE 0 IN ADDRESS 17
STORE THE VALUE 0 IN ADDRESS 16
STORE THE VALUE 0 IN ADDRESS 18
STORE THE VALUE 0 IN ADDRESS 19
STORE THE VALUE 0 IN ADDRESS 15
STORE THE VALUE 0 IN ADDRESS 21
STORE THE VALUE 0 IN ADDRESS 22
STORE THE VALUE 0 IN ADDRESS 23
STORE THE VALUE 0 IN ADDRESS 24
STORE THE VALUE 0 IN ADDRESS 25
STORE THE VALUE 0 IN ADDRESS 20
MOV TO $7 THE VALUE #0.000000
STORE THE VALUE 0 IN ADDRESS 26
MULT $6 BY #0.247561 AND SAVE IT IN $0
LOAD IN REGISTER 

# Test

## Unicorn emulator

<hr>

In [109]:
# Scratch check: `in` applied to a dict tests its keys; 1 is not a key here,
# so the expression evaluates to False.
1 in {2:3,5:5}

False