# Compiler

## Network container class

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

class Program:
    # this class contains in a structured manner matrices and biases
    # which encode the semantics of the program network, in the same fashion
    # of the source code of a program
    def __init__(self, topology , activation_functions_list):
        self.topology = topology
        self.activation_functions_list = activation_functions_list
        self.W = [ 0 for t in topology[:-1]]
        self.b = [ 0 for t in topology[:-1]]
    def random_init_weights(self):
        for layer in range(len(self.topology[:-1])):
            self.W[layer] = np.random.normal( size = (self.topology[layer + 1] , self.topology[layer] ) )**2
            size = self.W[layer].shape[0] * self.W[layer].shape[1] 
            for i in range(size - int(np.sqrt(size))):
                r = np.random.choice(len(self.W[layer]))
                c = np.random.choice(len(self.W[layer].T))
                self.W[layer][r,c] = 0.   # adding some sparsity
                
            self.b[layer] = np.random.normal( size = self.topology[layer + 1] )
    def print(self):
        for i,layer in enumerate(self.topology[:-1]):
            print("layer %d->%d "%(i,i+1))
            print("W.shape = %s \t b.shape = %s" % (str(self.W[i].shape),str(self.b[i].shape)))
    def visualize_weights(self):
        for w in self.W:
            print(w.shape)
            plt.figure(figsize=(10,10))
            plt.imshow(w > 0.)        

## Sparsification algorithm

## IR production

### Lightweight trees representation

In [2]:
class c:
    # every element is a node in the tree.
    # the first argument denotes the name of the node ; the possible next arguments are a list of the sons.
    # Note that nodes without sons are simply leaf (eg. ADD, MUL,...)
    def __init__(self, ID, *args):
        self.id   = ID
        self.sons = list()
        for arg in args:
            self.sons.append(arg)
    def print(self,level = 0):
        print( ("\t" * level) + str(self.id) )
        for s in self.sons:
            s.print(level + 1)
    def __str__(self):
        ret = str(self.id)
        if len(self.sons) > 0:
            ret += "("
            for s in self.sons:
                ret += str(s)
                if s != self.sons[-1]:
                    ret += ','
            ret += ")"
        return ret
    def flatten(self):
        ret = []
        ret += [self.id]
        for s in self.sons:
            ret += s.flatten()
        return ret

### IR production function

In [3]:
def IR(program, compile_time_data = True):
    # for every layer
    ### print("y = input")
    ret = list()
    
    for layer,t in enumerate(program.topology[1:]):
        IR_instruction = c("COMMENT", c("START"))
        ### print(IR_instruction)
        ret.append(IR_instruction)
        
        f = program.activation_functions_list[layer]
        R_offset = t - 1
        for i in range(len(program.b[layer])): # for every row
            #print("R[%d] = 0" % (i))
            #print("MOVE( TEMP(%d) , CONST(0) )" % (i + R_offset))
            IR_instruction = c(
                    "MOVE",
                     c("TEMP" , c(i + R_offset) ),
                     c("CONST", c(0))
            )
            ### print(str(IR_instruction))
            ret.append(IR_instruction)
            
            
        for i in range(len(program.W[layer])): # for every row
            for j in range(len(program.W[layer].T)): # for every column
                if( program.W[layer][i,j] != 0):
                    # if there is zero is useless to compute the contribution
                    #print("->MOVE(TEMP(%d),BINOP(ADD,TEMP(%d),BINOP(MUL,CONST(%f),TEMP(%d)))) " % (i + R_offset,i + R_offset, program.W[layer][i,j],i )) # COMPILE DATA HYPOTHESIS
                    IR_instruction = c(
                        "MOVE",
                        c("TEMP",
                            c(i + R_offset)
                        ),
                        c("BINOP",
                            c("ADD"),
                            c("TEMP",
                                c(i + R_offset)
                            ),
                            c("BINOP",
                                c("MUL"),
                                c("CONST",
                                     c(program.W[layer][i,j])
                                 ),
                                c("TEMP",
                                 c(j)
                                 )
                            )
                        )
                    )
                    #print(IR_instruction)
                    ret.append(IR_instruction)
                else:
                    0
                    ### print("# here there was a 0 so we exploit sparsity ")
                    
        for i in range(len(program.b[layer])): # for every row
            # a priori in compile time since the amount of "repetitions" doens't scale quadratically , in opposite to weights
            #print("->MOVE(TEMP(%d),BINOP(ADD,TEMP(%d),CONST(%f)))" % (i, i + R_offset , program.b[layer][i]) )
            IR_instruction = c("MOVE",
                                c("TEMP",
                                 c(i)
                                 ),
                                c("BINOP",
                                 c("ADD"),
                                 c("TEMP",
                                   c(i + R_offset)
                                  ),
                                 c("CONST",
                                  c(program.b[layer][i])
                                  )
                                 )
                                )
            ### print((IR_instruction))
            ret.append(IR_instruction)
        for i in range(len(program.b[layer])): # for every row
            #print("y[%d] = f(y[%d])" % (i,i))
            #print("MOVE( TEMP(%d) , CALL( %s, TEMP(%d) ) )" % (i,f,i) )
            IR_instruction = c("MOVE",
                c("TEMP",
                     c(i)
                 ),
                c("CALL",
                     c(f),
                     c("TEMP",
                          c(i)
                      )
                 )
            )
            ### print(IR_instruction)
            ret.append(IR_instruction)
        IR_instruction = c("COMMENT", c("END"))
        ### print(IR_instruction)
        ret.append(IR_instruction)
    return ret

## Register and memory allocation

### Temporary variables statistics class

In [4]:
class TemporaryVariablesStatistics:
    def __init__(self):
        self.temp_usage_map = {}
    def increment(self,temp_variable):
        old_value = self.temp_usage_map.get(temp_variable)
        if old_value == None:
            old_value = 0
        self.temp_usage_map[temp_variable] = old_value + 1
    def get_data(self):
        return self.temp_usage_map
    def vectorize(self):
        arr = []
        for s in self.get_data():
            arr.append( [ s, self.get_data()[s]] )
        arr = np.array(arr)                                                              # builds a tempstable [ temp | usage ]
        arr = arr[ arr[:,1].argsort()[-1::-1] ]                                          # sort the tempstable by usage  (decreasing)
        return arr


### Register allocation data class

In [5]:
"sx".startswith("x")

False

In [6]:
class RegisterAllocationData:
    def __init__(self):
        self.temp_reg_map = {}
    
    def get(self,temp_variable):
        return self.temp_reg_map[temp_variable]
    
    def insert(self,temp_variable, register):
        self.temp_reg_map[temp_variable] = register
    def get_data(self):
        return self.temp_reg_map
    
    def rename(self,old_reg_name, new_reg_name):
        for t in self.temp_reg_map:
            if self.temp_reg_map[t] == old_reg_name:
                self.temp_reg_map[t] = new_reg_name
    
    def get_unitialized_temps(self):
        # returns the list of temps that have a register starting with "register_"
        ret = list()
        for s in self.temp_reg_map:
            if self.temp_reg_map[s].startswith("register_"):
                ret.append(s)
        return ret
    
    def get_initialized_registers(self):
        # returns a list of the register ACTUALLY used (no place holder)
        ret = list()
        for s in self.temp_reg_map:
            if not self.temp_reg_map[s].startswith("register_"):
                ret.append(self.temp_reg_map[s])
        return ret
    
    def get_variables_list(self):
        ret = list()
        for s in self.temp_reg_map:
            ret.append(s)
        return ret
    
    def get_input_temps(self, prev_layer_size):
        all_vars = np.array(self.get_variables_list())
        ret      = list()
        for var in all_vars:
            if var < prev_layer_size:
                ret.append([var, self.temp_reg_map[var]])
        return np.array(ret)
    
    def get_output_temps(self, prev_layer_size):
        all_vars = np.array(self.get_variables_list())
        ret      = list()
        for var in all_vars:
            if var >= prev_layer_size:
                ret.append([var, self.temp_reg_map[var]])
        return np.array(ret)
    
    def print(self):
        for t in self.temp_reg_map:
            print(t , "\t", self.temp_reg_map[t] )
    def contains(self,tmp_name):
        return tmp_name in self.temp_reg_map

### Memory allocation data class

In [7]:
class MemoryAllocationData:
    def __init__(self):
        self.temp_mem_map = {}
        
    def insert(self,temp_variable, address):
        self.temp_mem_map[temp_variable] = address
        
        
    def batch_set(self, list_of_temps, list_of_addresses):
        for tmp_id, mem_addr in zip(list_of_temps,list_of_addresses):
            self.temp_mem_map[tmp_id] = mem_addr
        
    def get_data(self):
        return self.temp_mem_map
    def get_variables_list(self):
        ret = list()
        for s in self.temp_mem_map:
            ret.append(s)
        return ret
    
    def get_input_temps(self, prev_layer_size):
        all_vars = np.array(self.get_variables_list())
        return all_vars[all_vars < prev_layer_size]
    
    def get_output_temps(self,prev_layer_size):
        all_vars = np.array(self.get_variables_list())
        return all_vars[all_vars >= prev_layer_size]
    
    def print(self):
        for t in self.temp_mem_map:
            print(t , "\t", self.temp_mem_map[t] )

### Block Signals

In [8]:
class BlockSignals:
    def __init__(self, memory_allocation_object):
        # initialize an empty dictionary starting from the variables name
        self.memory_allocation_object = memory_allocation_object
        self.temp_signals_map = {}
        for t in memory_allocation_object.get_data():
            self.temp_signals_map[t] = []
        
    def add_tick(self, temp_variables):
        temp_variables          = np.intersect1d(temp_variables, self.memory_allocation_object.get_variables_list())
        all_temporary_variables = self.memory_allocation_object.get_variables_list()
        # push 0 in the lists of unused temps and 1 in the list of the used temp
        for t in all_temporary_variables:
            self.temp_signals_map[t].append(0)
        for t in temp_variables:
            self.temp_signals_map[t][-1] = 1.
            
    def get_data(self):
        return self.temp_signals_map

### Interface Communication between blocks

In [9]:
class MemoryToRegisterFlow:
    # contains the information about the movement of information from a memory
    # cell to a register from a matrix mult to the next one
    def __init__(self, mem_address, register):
        self.mem_address = mem_address
        self.register    = register
    def print(self):
        print("M2R flow\t%s\t->\t%s" % (self.mem_address,self.register))
class RegisterToMemoryFlow:
    # contains the information about the movement of information from a register 
    # to a memory cell from a matrix mult to the next one
    def __init__(self, register, mem_address):
        self.mem_address = mem_address
        self.register    = register
    def print(self):
        print("R2M flow\t%s\t->\t%s" % (self.register,self.mem_address))

class RegisterRenameFlow:
    # contains the information about the movement of information from a register 
    # to a memory cell from a matrix mult to the next one
    def __init__(self, register_placeholder, register_name):
        self.register_placeholder = register_placeholder
        self.register_name        = register_name
    def print(self):
        print("R2R flow\t%s\t->\t%s" % (self.register_placeholder,self.register_name))

        
class InterfaceCommunication:
    # contains the list of movements "flows" between two matrix mult blocks
    def __init__(self):
        self.reg2memFlows = list()
        self.mem2regFlows = list()
        self.reg2regFlows = list()
    def insert(self,flow):
        if type(flow).__name__ == "RegisterToMemoryFlow":
            self.reg2memFlows.append(flow)
        else:
            if type(flow).__name__ == "MemoryToRegisterFlow":
                self.mem2regFlows.append(flow)
            else:
                if type(flow).__name__ == "RegisterRenameFlow":
                    self.reg2regFlows.append(flow)
                    
    def getReg2MemFlows(self):
        return self.reg2memFlows
    def getMem2RegFlows(self):
        return self.mem2regFlows
    def getReg2RegFlows(self):
        return self.reg2regFlows
    
    def print(self):
        reg2mem = self.getMem2RegFlows()
        mem2reg = self.getReg2MemFlows()
        reg2reg = self.getReg2RegFlows()
        for rm in reg2mem:
            rm.print()
        for mr in mem2reg:
            mr.print()
        for rr in reg2reg:
            rr.print()
        
# interface[layer_1,layer_2] = InterfaceCommunication()
# interface[layer_1,layer_2].insert( RegisterToInputFlow('AX', 0x1234))
# interface[layer_1,layer_2].insert( RegisterToInputFlow('BX', 0x1235))
# interface[layer_1,layer_2].insert( RegisterToInputFlow(0x1236, 'AX'))

A = InterfaceCommunication()
A.insert( RegisterToMemoryFlow("AX",0) )
A.insert( RegisterToMemoryFlow("BX",1) )
A.insert( MemoryToRegisterFlow(2, "AX") )
A.insert( RegisterRenameFlow("register_0", "AX") )
A.print()

M2R flow	2	->	AX
R2M flow	AX	->	0
R2M flow	BX	->	1
R2R flow	register_0	->	AX


### Theoretical argument : memory allocation

Suppose that we have some kind of distance between temporary variables in a <b>matrixmult</b>. <br>
Therefore we have a collection of distance matrices $\{D_{1,2},D_{2,3},...,D_{N-1,N}\}$ <br>
We can formulate the following optimization problem <br><br>
$
    \text{Find $\{P_{1,2},P_{2,3},...,P_{N-1,N}\}$ permutations of 
    $\{ [1,n_{1,2}], [1,n_{2,3}],... ,[1,n_{N-1,N}] \}$ }
$ <br>
$
\text{such that
    $D(P_{i,i+1}) \sim D_{i,i+1} \ \ \ \forall i$
}
$ <br>
$
\text{Subject to
    $ (P_{i,i+1})_{\text{output (only memory)}} = (P_{i+1,i+2})_{\text{input (only memory)}}  \ \ \forall i$
}
$

### Proposed algorithm

$
\text{$P_0 \leftarrow$ Permutation$(Temp_{\mathcal l,\mathcal l + 1})$ optimal}\\
\text{${\bf for } \ \ \ i,(layer,layer+1) \in temps $ : } \\
\hspace{2em} \text{$ P_i \leftarrow $ Permutation$(Temp_{\mathcal l_i,\mathcal l_{i+1}})$ optimal subject to $P_i^{input} = P_{i-1}^{output}$ }
$

### Constrained Permutation Class

In [10]:
class constrainedPermutation:
    def __init__(self, constraint, trivial_init = True, init_vector = None):
        # constraint = vector of -1, with the exception of the "fixed points" of the permutations
        #              e.g.   -1 -1 4 3 -1
        #              reads as "all the permutations of 01234" such that the third element is 4 and the fourth is 3
        self.constraint = constraint
        if trivial_init == True:
            self.value      = np.arange(len(constraint))
            for i in np.arange(len(constraint))[constraint != -1]:
                # find the value at the position i
                to_swap_A = i                                      # position 1
                to_swap_B = np.argmax(self.value == constraint[i]) # position 2
                tmp       = self.value[to_swap_A]
                self.value[to_swap_A] = self.value[to_swap_B]
                self.value[to_swap_B] = tmp
        else:
            self.value = init_vector.copy()
    
    def get_neighbor(self):
        feasible = np.arange(len(self.constraint))[self.constraint == -1]
        old = self.value[feasible]
        a        = np.random.choice(len(feasible))
        b        = np.random.choice(len(feasible))
        temp     = feasible[a]
        feasible[a] = feasible[b]
        feasible[b] = temp
        ret = self.value.copy()
        ret[feasible] = old
        return constrainedPermutation(self.constraint,False,ret)
    
    def get_indexes(self):
        return self.value
    
    def distance_matrix(self):
        return np.array([
            [
                    np.linalg.norm(a - b)
                for b in self.value
            ]
            for a in self.value
        ])
    
    def print(self):
        print(self.value)
ret = list()
for i in range(10000):
    ret.append( constrainedPermutation(np.array([-1.,-1.,2,3,4,-1,-1,-1])).get_neighbor().value )
ret = np.array(ret)
ret.mean(axis = 0)

array([1.5513, 2.1007, 2.    , 3.    , 4.    , 4.5185, 5.1325, 5.697 ])

In [11]:
def affinity(permutation, distance_matrix):
    # compute the permutation distance matrix
    d_hat = permutation.distance_matrix()
    return np.linalg.norm( d_hat - distance_matrix.argsort(axis = 1) )

### Allocator class

In [12]:
def compute_temporary_mapping_table_and_table_inv(temps_input, temps_input_name_at_previous_block):
    temporary_mapping_table =  {}
    temporary_mapping_table_inv = {}
    for a,b in zip(temps_input,temps_input_name_at_previous_block):
        temporary_mapping_table[b] = a
        temporary_mapping_table_inv[a] = b   
    return temporary_mapping_table,temporary_mapping_table_inv

In [13]:
def compute_output_prev_block_register_mapping(temps_in_reg_output, regs_output):
    print("OUTPUT TEMPORARIES IN PREVIOUS LAYER IN REGISTERS")
    output_prev_block_register_mapping = {}
    print("output_{i-1}\tREG")
    for tro in np.c_[temps_in_reg_output, regs_output]:
        print("%s\t\t%s" % (tro[0],tro[1]))
        output_prev_block_register_mapping[int(tro[0])] = tro[1]
    return output_prev_block_register_mapping

In [14]:
def print_table(temps_in_mem_input, 
                mems_input,
                temps_in_mem_output,
                mems_output,
                temps_in_reg_input,
                regs_input,
                temps_in_reg_output,
                regs_output,
                i
               ):
                print("### [%d] INPUT TEMPS MEMORY TABLE ###" % 0)
                for r in np.c_[temps_in_mem_input, mems_input]:
                    print("%d\t%d" % (r[0],r[1]))
                print("### [%d] OUTPUT TEMPS MEMORY TABLE ###" % 0)
                for r in np.c_[temps_in_mem_output, mems_output]:
                    print("%d\t%d" % (r[0],r[1]))
                print("### [%d] INPUT TEMPS REGISTER TABLE ###" % 0)
                for r in np.c_[temps_in_reg_input, regs_input]:
                    print("%s\t%s" % (r[0],r[1]))
                print("### [%d] OUTPUT TEMPS REGISTER TABLE ###" % 0)
                for r in np.c_[temps_in_reg_output, regs_output]:
                    print("%s\t%s" % (r[0],r[1]))
            

In [17]:
from sklearn.manifold import MDS
class Allocator:
    def __init__(self, ir, program, register_names):
        self.register_allocation_data = []  # register allocations for every matrix multiplication operation
        self.memory_allocation_data   = []  #   memory allocations for every matrix multiplication operation
        self.interfaces               = {}
        self.register_names           = register_names
        self.program                  = program
        # ------------------------------
        # Register allocation subroutine
        temp_statistics = self.most_used_temps(ir)   # produces a list of register statistics objects
        self.register_allocation_and_memory_alloc_init(temp_statistics, register_names)
        self.memory_allocation(
                self.compute_signals_distance_matrix(
                    self.compute_signals(ir)
                )
        )
    ########################################################################################
    # Register allocation
    ########################################################################################
    
    def most_used_temps(self, ir):
        # IN   : takes as input an intermediate representation
        # OUT  : produces a list of TemporaryVariableStatistics objects, one for each matrix mult
        statistics_per_block = list()
        
        for ir_instruction in ir:                                                            # iterate over the IR statements
            if(ir_instruction.id == "COMMENT"):                                              # 
                if(ir_instruction.sons[0].id == "START"):
                    statistics_per_block.append(TemporaryVariablesStatistics())              # i create a temporaryvariablestatistcs
            else:
                unrolled_ir = ir_instruction.flatten()                                       # unroll the statemenet
                temps_in_statement   = list()                                                # container for temps variables in the current statement
                for u,val in zip(unrolled_ir[:-1],unrolled_ir[1:]):  
                    if u == "TEMP": 
                        temps_in_statement.append(val)                                       
                                                                                             # now "temps_in_statement" contains only the values of the temporary variables
                for t in temps_in_statement:                                                 # count the usage of each temporal 
                    statistics_per_block[-1].increment(t)                                       # the current temporary variable statistics is updated 
        return statistics_per_block
    
    def register_allocation_and_memory_alloc_init(self, statistics_list , register_names):
        # IN  : a statistics list obtained from  most_used_temps , register names
        # OUT : a registerAllocation object
        
        # convert the dictionary to an array
        temp_stats_per_block = list()
        first_block = True
        for stat in statistics_list:
            # I transform the dictitonary in a sorted-by-usage vector
            arr = stat.vectorize()
            
            # I initialize a Registerallocation  object
            reg_data = RegisterAllocationData()   
            
            # for every register i take an element, starting from the beginning, of the array
            temp_var_count = 0
            for r_id,r in enumerate(register_names):
                if temp_var_count >= len(arr):
                    break
                if first_block:     # registers are decided a priori only in the first block
                    reg_data.insert(arr[temp_var_count,0],r)                                      # i add as a used register the temporary variables with more usage
                else:
                    reg_data.insert(arr[temp_var_count,0],"register_%d" % r_id)
                temp_var_count += 1
            self.register_allocation_data.append(reg_data)                                    # i append the register allocation data obtained to the list of RAD
            
            # I also initialize the "slots" for the memory allocation data
            mem_data = MemoryAllocationData()                                                 
            for t in range(temp_var_count, len(arr)):
                mem_data.insert(arr[t,0], -1)                                                 # initializa with -1
            self.memory_allocation_data.append(mem_data)                                      # i add them to the list
            first_block = False
        return 0
    
    ########################################################################################
    # Memory allocation
    ########################################################################################

    def compute_signals(self, ir):
        # takes as input an intermediate repr and a register allocation output
        signals_per_block = list()
 
        curr_alloc_block = 0
    
        for ir_instruction in ir:
            if(ir_instruction.id == "COMMENT"):
                if(ir_instruction.sons[0].id == "START"):
                    curr_memory_alloc_block = self.memory_allocation_data[curr_alloc_block]
                    signals_per_block.append(BlockSignals(curr_memory_alloc_block))                       
                    curr_alloc_block += 1

            unrolled_ir = ir_instruction.flatten()
            temps_in_statement   = list()
            for u,val in zip(unrolled_ir[:-1],unrolled_ir[1:]):
                if u == "TEMP":
                    temps_in_statement.append(val)      
                    
            signals_per_block[-1].add_tick(temps_in_statement)
        return signals_per_block
    
    def compute_signals_distance_matrix(self, signals):
        # IN   : takes as input a collection of TempVarSignals
        # OUT  : produces a 
        
        mappings = list()          # mapping between the rows of the matrix and the temp_var
        Ds = list()                # list of matrices
        
        for signals_block in signals:
            D = np.zeros((len(signals_block.get_data()),len(signals_block.get_data())))
            mapping = {}
            for i,a in enumerate(signals_block.get_data()):
                mapping[i] = a
                for j,b in enumerate(signals_block.get_data()):  
                    v_a = np.arange(len(signals_block.get_data()[a]))[np.array(signals_block.get_data()[a]) == 1.]
                    v_b = np.arange(len(signals_block.get_data()[b]))[np.array(signals_block.get_data()[b]) == 1.]
                    distanza = 0.5 * (np.mean([ np.min(np.abs(s_1 - v_b)) for s_1 in v_a]) + np.mean([ np.min(np.abs(s_2 - v_a)) for s_2 in v_b]))
                    D[i,j] = distanza
            Ds.append(D)
            mappings.append(mapping)
        return Ds, mappings
    
    def anneal(self,constrainedperm, Ds, mapping):
        # Given an initial guess permutation and params of the signals in the current block
        # computes the most "affine" permutations of temporary variables in memory
        
        # INPUT  : initial guess, signals data
        # OUTPUT : an optimized guess for the mapping between temporary variables and addresses
        
        # stupid stub function
        ret = constrainedPermutation(
            constrainedperm.constraint,
            False,
            constrainedperm.value
        )
        
        for i in range(100):
            ret = ret.get_neighbor()
            
        return ret
    
    
    
    def density_optimizer_memory_subset_for_output(self,memory,constraint_vector, input_size, output_size):
        # Given a memory object, a constraint vector , the size of the input and the output size
        # computes the optimal position in memory of the temporary variables in order to maximize
        # the density of the vector in memory, and therefore the locality
        
        # INPUT  : see above
        # OUTPUT : an initial guess for the annealer
        
        memory_mask = np.arange(len(memory))[ [ not( x in constraint_vector) for x in np.arange(len(memory))] ]
         
        rows = list()
        densities = list()
        
        for j in range( (len(memory) - input_size) - output_size + 1):
            row = np.zeros(len(memory))
            
            row[ constraint_vector[constraint_vector != -1].astype(int)] = -1.
            row[memory_mask[j:j+output_size]] = 1.
            density = lambda r : (r != 0 )[ (r != 0).argmax() : (len(r) - (r != 0)[-1::-1].argmax())].mean()

            rows.append(row)
            densities.append(density(row))
            
        densities = np.array(densities)
        rows      = np.array(rows)
        
        # choose the best
        configuration = rows[densities.argmax()]
        
        print(configuration)
        
        # set the output starting from the constraint vector
        perm_vector = constraint_vector.copy()
        print(perm_vector)
        perm_vector[np.arange(len(perm_vector))[perm_vector == -1]] = np.arange(len(memory))[configuration == 1]
        
        return perm_vector

####################################################################################################################
#####                  #############################################################################################
#####   huge function  #############################################################################################
#####                  #############################################################################################
####################################################################################################################

    def memory_allocation(self, DS_MAPPINGS):
        # I load the data about the signals to perform annealing
        Ds       = DS_MAPPINGS[0]
        mappings = DS_MAPPINGS[1]
        
        
        # i define a fake memory just for debug
        memory = np.arange(
            np.max(
                [
                       np.max(len(m.get_variables_list()))
                    for m in self.memory_allocation_data 
                ]
            )
        )
        
        
        # D, mapping for the first block
        D       = Ds[0]
        mapping = mappings[0]
        
        
        # load the temps in the first matrix mult
        temps_in_mem = self.                     \
                       memory_allocation_data[0].\
                       get_variables_list()      # list of all temporary variables in memory at first block
        
        # optimize the map between the temps in memory and the first addresses of memory
        permutation = constrainedPermutation(np.ones(len(temps_in_mem))  * -1)
        permutation = self.anneal(permutation, D, mapping)
                                                                        #   Visualization of the rapresentation of the map to optimize
            
                                                                        #   Temps           Allocated       Memory
                                                                        #    _                _
                                                                        #   |_|------------->|_|--------------|_| 
                                                                        #   |_|----+   +---->|_|------+
                                                                        #   |_|----|---+ +-->|_|---+  |
                                                                        #   |_|--+ +-----|-->|_|-+ |  +------>|_|
                                                                        #        +-------+       | +--------->|_|
                                                                        #     \____________/     |
                                                                        #           |            +----------->|_|
                                                                        #      represented by
                                                                        #      a permutation
                                                        
        
        # defining the first table
        temps_in_mem_input  =  self.\
                               memory_allocation_data[0].\
                               get_input_temps(self.program.topology[0])     # list of all temporary INPUT variables in memory at first block
        
        temps_in_mem_output =  self.\
                               memory_allocation_data[0].\
                               get_output_temps(self.program.topology[0])    # list of all temporary OUTPUT variables in memory at first block
        
        mems_input          =  permutation.value[ : len(temps_in_mem_input)] # subset of the map referred to the input variables
        mems_output         =  permutation.value[ len(temps_in_mem_input) :] # subset of the map referred to the output variables
        
        if( len(self.register_allocation_data[0].get_input_temps(self.program.topology[0])) > 0 ):
            temps_in_reg_input  =  self.register_allocation_data[0].get_input_temps(self.program.topology[0])[:,0]
            regs_input          =  self.register_allocation_data[0].get_input_temps(self.program.topology[0])[:,1]
        else:
            temps_in_reg_input  = np.array([])
            regs_input          = np.array([])
        
        if(len(self.register_allocation_data[0].get_output_temps(self.program.topology[0])) > 0):
            temps_in_reg_output =  self.register_allocation_data[0].get_output_temps(self.program.topology[0])[:,0]
            regs_output         =  self.register_allocation_data[0].get_output_temps(self.program.topology[0])[:,1]
        else:
            temps_in_reg_output = np.array([])
            regs_output         = np.array([])
        
                                                                        #  table for binding temporary variables to mem addresses
            
                                                                        #     TEMP_IN_MEM    MEMS
                                                                        #   +------------+-----------+  ---
                                                                        #   |  tmp_0     |           |     \
                                                                        #   |  tmp_...   |           |     |--- input
                                                                        #   |  tmp_M     |           |     | 
                                                                        #   +------------+-----------+  __/
                                                                        #   |  tmp_M+1   |           |    \
                                                                        #   |  ...       |           |     |---  output
                                                                        #   +------------+-----------+  __|
                                                                        

            
        #################################################################################################################    
        
        # print the first table
        print_table( 
                temps_in_mem_input,                                    # temporary variable in the input stored in memory
                mems_input,                                            # addresses of the previous
                temps_in_mem_output,                                   # temporary variable in the output stored in memory
                mems_output,                                           # addresses of the previous
                temps_in_reg_input,                                    # temporary variable in the input stored in registers
                regs_input,                                            # register names of the previous
                temps_in_reg_output,                                   # temporary variable in the output stored in registers
                regs_output,                                           # register names of the previous
                0
        )
        print("\n\n\n------------------------------------\n\n\n")
        
        
        #################################################################################################################    
      
        for i in range(1, len(self.program.topology) - 1):
            
            # IDs of temporaries for the input
            temps_input         =  np.arange(self.program.topology[i])
            
            # list of IDs of temporaries for the input IN MEMORY
            temps_in_mem_input  =  self.memory_allocation_data[i].get_input_temps(self.program.topology[i])
            
            # list of IDs of temporaries for the input IN REGISTERS
            temps_in_reg_input  =  self.register_allocation_data[i].get_input_temps(self.program.topology[i])
            
            # Initialize the interface between the current block and the previous one
            self.interfaces[i-1,i] = InterfaceCommunication()
           
            # i get the temp_ids of temps_in_mem_input at the previous matrix multiplication
                
                                                #      number of temps in previous block related to input
                                                #   ______|_________________
                                                # /                         \
            temps_input_name_at_previous_block = self.program.topology[i - 1] +  temps_input 
            #\______________________________/                                  # \_________/
            #             |                                                    #      |
            #      name of the curring                                         #    temps in current block related to input
            #      input temps when they existed
            #      in the previous block
            
            
            ###################################################################################################################
            # Building Tables
            ###################################################################################################################

            
            # MAPPING OF TEMPORARY OUTPUTS IN PREVIOUS BLOCK TO TEMPORARY INPUT IN CURRENT ONE
            temporary_mapping_table, temporary_mapping_table_inv  = compute_temporary_mapping_table_and_table_inv(
                temps_input, 
                temps_input_name_at_previous_block
            )      
            
            # MAPPING OF TEMPORARY OUTPUTS IN PREVIOUS BLOCK TO REGISTERS
            output_prev_block_register_mapping = compute_output_prev_block_register_mapping(
                temps_in_reg_output, 
                regs_output
            )
           
            
            print("OUTPUT TEMPORARIES IN PREVIOUS LAYER IN MEMORY")
            print("output_{i-1}\tADDRESS")
            output_prev_block_memory_mapping = {}
            for tro in np.c_[temps_in_mem_output, mems_output]:
                print("%s\t\t%s" % (tro[0],tro[1]))
                output_prev_block_memory_mapping[tro[0]] = tro[1]
            print(output_prev_block_memory_mapping)
            print("")
            
            print("INPUT TEMPORARIES IN CURRENT BLOCK IN MEMORY AND THEIR RENAMING IN PREVIOUS BLOCK")
            print("input_{i}\toutput_{i-1}")
            input_in_memory_curr_block_output_prev_block_mapping = {}
            for timi in temps_in_mem_input:
                print("%s\t\t%s"%(timi,temporary_mapping_table_inv[timi]))
                input_in_memory_curr_block_output_prev_block_mapping[timi] = temporary_mapping_table_inv[timi]
            print("")

            
            print("INPUT TEMPORARIES IN CURRENT BLOCK IN REGISTERS,THEIR REGISTERS AND THEIR RENAMING IN PREVIOUS BLOCK")
            print("input_{i}\tREG\t\t\toutput_{i-1}")
            input_in_registers_curr_block_output_prev_block_mapping = {}
            for timi in temps_in_reg_input:
                print("%s\t\t%s\t\t%s"%(timi[0],timi[1],temporary_mapping_table_inv[int(timi[0])]))
                input_in_registers_curr_block_output_prev_block_mapping[timi[0]] = ( timi[1],temporary_mapping_table_inv[int(timi[0])])
      
            ###################################################################################################################
            # Interface Flows
            ###################################################################################################################

            print("\n\n\n------------------------------------\n\n\n")
                
            print("WE HAVE 2 KIND OF MOVEMENTS + REGISTER_ASSIGNMENT:")
            print("\t\t MEM TO REG ")
            # we find the intersection between OUTPUTS_IN_MEMORY_PREVIOUS_BLOCK and INPUTS_IN_REGISTER_CURRENT_BLOCK
            print("\t\t REG TO MEM ")
            print("\t\t REG TO REG (replacing the place holders with actual registers keeping count of previous block)")

            print("")
            
            print("MEMORY TO REGISTERS")
            print("build the table of memory addresses and registers where they should be moved")
            
            print("# IReg,Regs <- find input temps which are in registers")
            Ireg = list()
            Oreg = list()
            Regs = list()
            for ireg in input_in_registers_curr_block_output_prev_block_mapping:
                Ireg.append(ireg)
                Oreg.append(input_in_registers_curr_block_output_prev_block_mapping[ireg][1])
                Regs.append(input_in_registers_curr_block_output_prev_block_mapping[ireg][0])
            print("# Oreg_mem <- select the ones that are in memory")
            for reg,oreg in zip(Regs,Oreg):
                if oreg in output_prev_block_memory_mapping:
                    print(oreg,reg)
                    self.interfaces[i-1,i].insert( 
                                MemoryToRegisterFlow(
                                           output_prev_block_memory_mapping[oreg], reg 
                                ) 
                    )
            # for each o,r in Oreg_mem,Regs
            #      flows.insert( REGTOMEMORYMOVEMENT o,r )
            
            
            print("")
            
            print("REGISTER TO MEMORY")
            print("build the table of REGISTERS and EMPTY MEMORY ADDRESSES where they should be moved")
           
            Ireg = list()
            Oreg = list()
            
            for ireg in input_in_memory_curr_block_output_prev_block_mapping:
                Ireg.append(ireg)
                Oreg.append(input_in_memory_curr_block_output_prev_block_mapping[ireg])
            
            for reg,oreg in zip(Ireg,Oreg):
                print(oreg, oreg in output_prev_block_register_mapping )
                if oreg in output_prev_block_register_mapping:
                    self.interfaces[i-1,i].insert( 
                                RegisterToMemoryFlow(
                                           output_prev_block_register_mapping[oreg], oreg
                                ) 
                    )
            
            print("REGISTER TO REGISTER")
            print("build the table of REGISTER_PLACEHOLDER AND REGISTERS")
           
            print("# IReg,Regs <- find input temps which are in registers")
            Ireg = list()
            Oreg = list()
            Regs = list()
            for ireg in input_in_registers_curr_block_output_prev_block_mapping:
                Ireg.append(ireg)
                Oreg.append(input_in_registers_curr_block_output_prev_block_mapping[ireg][1])
                Regs.append(input_in_registers_curr_block_output_prev_block_mapping[ireg][0])
            
            for ireg,oreg in zip(Ireg,Oreg):
                print("$$$",oreg)
                if oreg in output_prev_block_register_mapping:
                    self.interfaces[i-1,i].insert( 
                                RegisterRenameFlow(
                                           input_in_registers_curr_block_output_prev_block_mapping[ireg][0],
                                           output_prev_block_register_mapping[oreg]
                                ) 
                    )
            
            self.interfaces[i-1,i].print()
            

            print("Use the extracted information from the interface communication")
            print("impose the registers for REGISTER-REGISTER boundary condition")
            old_name_reg_new_name_reg_dict = {}
            for rr in self.interfaces[i-1,i].getReg2RegFlows():
                old_name = rr.register_placeholder
                new_name = rr.register_name
                # change it in the RegisterAllocationData
                self.register_allocation_data[i].rename(old_name, new_name)
                # save the mapping between old name and new name to modify the flows
                old_name_reg_new_name_reg_dict[old_name] = new_name 
                
            print("assign sequentially other registers")
            
            registri_liberi = np.setdiff1d( 
                                   self.register_names,
                                   self.register_allocation_data[i].get_initialized_registers()                                   
                              )
            for temp,freereg in zip( 
                            self.register_allocation_data[i].get_unitialized_temps(),
                            registri_liberi
            ):
                # i save the placeholder name for the register
                register_placeholder_temp = self.register_allocation_data[i].get(temp)
                # update the register_allocation_data object
                self.register_allocation_data[i].insert(temp, freereg)
                # save the mapping between the placeholder name and the real register
                old_name_reg_new_name_reg_dict[register_placeholder_temp] = freereg
            
            # update the memoryToRegister flows
            print(old_name_reg_new_name_reg_dict)
            for mr in self.interfaces[i-1,i].getMem2RegFlows():
                mr.register = old_name_reg_new_name_reg_dict[mr.register]
            
            self.interfaces[i-1,i].print()
            print("------")
            self.register_allocation_data[i].print()
            return 
        
        
        
            
            mems_input         =  np.zeros(len(temps_in_mem_input))
            # map the previous output in memory to current input in memory
            #      
            
            mems_input          =  mems_output.copy()
            temps_in_mem_output =  self.memory_allocation_data[i].get_output_temps(self.program.topology[i])
            mems_output         =  np.ones(len(temps_in_mem_output)) *-1
            
            print(len(temps_in_mem_input), len(mems_input), len(temps_in_mem_output), len(mems_output))
            # print the first table
            print("### [%d] INPUT TEMPS TABLE ###" % i)
            for r in np.c_[temps_in_mem_input, mems_input]:
                print("%d\t%d" % (r[0],r[1]))
            print("### [%d] OUTPUT TEMPS TABLE ###" % i)
            for r in np.c_[temps_in_mem_output, mems_output]:
                print("%d\t%d" % (r[0],r[1]))
            
            
            print(len(temps_in_mem_input),len(temps_in_mem_output))
            
            # define the constraint vector
            constraint_vector   = np.r_[mems_input, mems_output]
            
            # compute the density-wise optimal position of the mapping of the whole output subset of temps
            permutation_vector  = self.density_optimizer_memory_subset_for_output(memory,
                                                                             constraint_vector,
                                                                             len(temps_in_mem_input),
                                                                              len(temps_in_mem_output)
                                                                            )
            print(permutation_vector)
            permutation         = constrainedPermutation(
                    constraint_vector,
                    False,
                    permutation_vector
            )

            
compiler(0, ['a','b','c','d','e']) # QUESTA ROBA NON FUNZIONA TOGLIERE E' SOLO PER FARE DEBUG ORA
            

### [0] INPUT TEMPS MEMORY TABLE ###
1	5
0	4
### [0] OUTPUT TEMPS MEMORY TABLE ###
12	2
13	1
14	14
10	10
11	7
18	3
17	15
7	9
3	0
4	13
5	8
6	12
8	6
15	11
### [0] INPUT TEMPS REGISTER TABLE ###
2	c
### [0] OUTPUT TEMPS REGISTER TABLE ###
9	d
16	e



------------------------------------



OUTPUT TEMPORARIES IN PREVIOUS LAYER IN REGISTERS
output_{i-1}	REG
9		d
16		e
OUTPUT TEMPORARIES IN PREVIOUS LAYER IN MEMORY
output_{i-1}	ADDRESS
12		2
13		1
14		14
10		10
11		7
18		3
17		15
7		9
3		0
4		13
5		8
6		12
8		6
15		11
{12: 2, 13: 1, 14: 14, 10: 10, 11: 7, 18: 3, 17: 15, 7: 9, 3: 0, 4: 13, 5: 8, 6: 12, 8: 6, 15: 11}

INPUT TEMPORARIES IN CURRENT BLOCK IN MEMORY AND THEIR RENAMING IN PREVIOUS BLOCK
input_{i}	output_{i-1}
7		10
5		8
1		4
2		5
6		9
0		3
4		7
3		6
8		11

INPUT TEMPORARIES IN CURRENT BLOCK IN REGISTERS,THEIR REGISTERS AND THEIR RENAMING IN PREVIOUS BLOCK
input_{i}	REG			output_{i-1}
9		register_0		12



------------------------------------



WE HAVE 2 KIND OF MOVEMENTS + REGISTER

## Code generator

## Assembler

## Orchestrator function

In [16]:
def compiler(file_name, registers, sparsify = False):
    # debug, we don't actually read a file but we generate it randomly
    rete = Program([3,10,10,1], ["RELU","RELU","LINEAR"])
    rete.random_init_weights()
    intermediate_representation = IR(rete)
    #for i in intermediate_representation:
    #    print(i)
    allocator = Allocator(intermediate_representation , rete , registers[2:])
    asm_code = []
    
    #for m in allocator.memory_allocation_data:
    #    m.print()
    #    print("----")
    
    cursor_allocator = 0
   
    for ir_statement in intermediate_representation:
        flattened = ir_statement.flatten()
        #print(flattened)
        if(flattened[0] == "COMMENT"):
            if(flattened[1] == "END"):
                #print("\n # NUOVO BLOCCO \n")
                cursor_allocator += 1
        
        if(flattened[0] == "MOVE" 
           and 
           flattened[3] == "CONST"
          ): # set the temporary variable
            tmp_name = flattened[2]
            val       = flattened[4]
            # we have to understand if tmp_name is a register or not
            if(allocator.register_allocation_data[cursor_allocator].contains(tmp_name)):
                0
                #print("MOV TO $%d THE VALUE #%f" % (
                #      allocator.register_allocation_data[cursor_allocator].get_data()[tmp_name],
                #      val)
                #     )
            else:
                # variable is in memory
                address = allocator.memory_allocation_data[cursor_allocator].get_data()[tmp_name]
                #print("STORE THE VALUE 0 IN ADDRESS %d" % address)
                
                
        if(flattened[0] == "MOVE"
           and
           flattened[3] == "BINOP"
           and 
           len(flattened ) == 13
          ): # addition and multiply
            neuron_dest   = flattened[2]
            neuron_source = flattened[12]
            weight        = flattened[10]
            
            
            if(allocator.register_allocation_data[cursor_allocator].contains(neuron_source)):
                0
                #print("MULT $%d BY #%f AND SAVE IT IN $0" % (
                #      allocator.register_allocation_data[cursor_allocator].get_data()[neuron_source],
                #      weight)
                #     )
            else:
                # variable is in memory
                address = allocator.memory_allocation_data[cursor_allocator].get_data()[neuron_source]
                #print("LOAD IN REGISTER $0 THE ADDRESS %d"  % allocator.memory_allocation_data[cursor_allocator].get_data()[neuron_source])
                #print("MULT $0 BY #%f AND SAVE IT IN $0" % (
                #      weight)
                #     )
            if(allocator.register_allocation_data[cursor_allocator].contains(neuron_dest)):
                0
                #print("ADD $0 to $%d" % (
                #      allocator.register_allocation_data[cursor_allocator].get_data()[neuron_dest])
                #     )
            else:
                # variable is in memory
                address = allocator.memory_allocation_data[cursor_allocator].get_data()[neuron_dest]
                #print("LOAD IN REGISTER $1 THE ADDRESS %d" % address)                
                #print("ADD $0 to $1")
                #print("STORE $1 IN ADDRESS %d" % address)
    
compiler(0, ['a','b','c','d','e'])

### [0] INPUT TEMPS MEMORY TABLE ###
2	5
1	9
### [0] OUTPUT TEMPS MEMORY TABLE ###
15	7
16	8
18	14
14	6
13	3
12	10
11	12
10	15
3	11
7	2
4	1
5	0
6	4
8	13
### [0] INPUT TEMPS REGISTER TABLE ###
0	c
### [0] OUTPUT TEMPS REGISTER TABLE ###
9	d
17	e



------------------------------------



OUTPUT TEMPORARIES IN PREVIOUS LAYER IN REGISTERS
output_{i-1}	REG
9		d
17		e
OUTPUT TEMPORARIES IN PREVIOUS LAYER IN MEMORY
output_{i-1}	ADDRESS
15		7
16		8
18		14
14		6
13		3
12		10
11		12
10		15
3		11
7		2
4		1
5		0
6		4
8		13
{15: 7, 16: 8, 18: 14, 14: 6, 13: 3, 12: 10, 11: 12, 10: 15, 3: 11, 7: 2, 4: 1, 5: 0, 6: 4, 8: 13}

INPUT TEMPORARIES IN CURRENT BLOCK IN MEMORY AND THEIR RENAMING IN PREVIOUS BLOCK
input_{i}	output_{i-1}
4		7
2		5
0		3
1		4
5		8
8		11
6		9
7		10
3		6

INPUT TEMPORARIES IN CURRENT BLOCK IN REGISTERS,THEIR REGISTERS AND THEIR RENAMING IN PREVIOUS BLOCK
input_{i}	REG			output_{i-1}
9		register_1		12



------------------------------------



WE HAVE 2 KIND OF MOVEMENTS + REGISTER

# Test

## Unicorn emulator

<hr>

In [72]:
1 in {1:3,5:5}

True