# Monte Carlo simulation of NAE hash collisions

### Simulate simultaneous hash collisions using only the IP and Port NAEs, with the number of distinct values present in the CICIDS2017 dataset: 8330 distinct values for the (source and destination) IP NAE and 62434 distinct values for the (source and destination) Port NAE. The simulation is done with a maximum hash value of 3000.

## Procedure
### Step 1
Create a matrix (draw_matrix) with one sample per row and 8 columns. The first 4 columns hold the first draw (source and destination addresses)
 and the second draw (source and destination addresses) for the IP NAE; the following 4 columns hold the same four values for the Port NAE.
### Step 2
 Apply a hash function to draw_matrix using the maximum hash value (k). This produces the hashed_matrix.
### Step 3
 Detect a hit (collision) if the first and second draws for the source have the same hash value and, at the same time,
 the first and second draws for the destination also have the same hash value. This procedure produces a matrix of two columns, one for the
 hits on the IP NAE and one for the hits on the Port NAE. This is the hit_matrix.
### Step 4
 Calculate the probability of a simultaneous collision/hit on the source and destination addresses for the IP NAE, for the Port NAE, and for both IP and Port at the same time.

In [None]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [None]:
%%R
library(digest)

hex_to_int <- function(h) {
  # Convert a hexadecimal string (upper or lower case) to its numeric value.
  # Each character is mapped to its digit value and weighted by the matching
  # power of 16, most significant digit first. Characters that are not valid
  # hexadecimal digits yield NA.
  hex_alphabet <- c(0L:9L, letters[1L:6L])
  digits <- strsplit(tolower(h), "")[[1L]]
  digit_values <- match(digits, hex_alphabet) - 1L
  place_values <- 16^rev(seq_along(digits) - 1)
  sum(digit_values * place_values)
}

# Seed the random number generator once for the whole simulation. The seed is
# kept in `w` so that a run can be reproduced by fixing this value.
w <- sample(1:100000, 1)
set.seed(w)

source_and_dest_draw <- function(num_max) {
  # Draw a (source, destination) pair of distinct integers from 1..num_max.
  #
  # Args:
  #   num_max: size of the population to draw from; must be at least 2
  #            because the two values are sampled without replacement.
  # Returns:
  #   An integer vector of length 2: c(source, destination).
  #
  # The generator is deliberately NOT re-seeded here. R's RNG advances its own
  # internal state on every call, so consecutive draws already differ;
  # re-seeding per call only adds cost and overrides any seed set by the
  # caller.
  stopifnot(is.numeric(num_max), length(num_max) == 1L, num_max >= 2)
  sample.int(num_max, 2L, replace = FALSE)
}

hash_all <- function(vals) {
  # Hash every element of `vals` with hash_element().
  #
  # Args:
  #   vals: vector of values to hash (may be empty).
  # Returns:
  #   A numeric vector with one hashed value per element of `vals`, in the
  #   same order. The range of the values is whatever hash_element() produces.
  #
  # vapply() is used instead of sapply() so the result is always a numeric
  # vector, including for empty input (where sapply() would return a list).
  vapply(vals, hash_element, numeric(1))
}


hash_element <- function(val, max_hash = k) {
  # Hash a single value and reduce it modulo `max_hash`.
  #
  # Args:
  #   val:      the value to hash.
  #   max_hash: modulus applied to the hash; defaults to the global `k`, which
  #             therefore must be defined before the function is called
  #             without this argument.
  # Returns:
  #   A number in the range [0, max_hash - 1] (0 is a possible result, since
  #   the hash is reduced with %%).
  hex_hash <- digest(val, algo = "xxhash32")
  hex_to_int(hex_hash) %% max_hash
}


In [None]:
%%R
# Simulation parameters
k <- 3000          # maximum hash value (hashes are reduced modulo k)
num_ip <- 8330     # number of unique IP elements
num_port <- 62434  # number of unique Port elements
# num_ip_port <- 218406  # number of unique IP&Port elements (not used below)

In [None]:
%%R

N_samples <- 1e+7   # number of samples taken for the simulation

# Column layout shared by draw_matrix and hashed_matrix: for each NAE
# (IP, then Port) the first draw (source, destination) followed by the second
# draw (source, destination).
nae_columns <- c("ip_s_1", "ip_d_1", "ip_s_2", "ip_d_2",
                 "port_s_1", "port_d_1", "port_s_2", "port_d_2")

# Preallocate the result matrices with their final storage type so the first
# row assignment does not force a type conversion of the whole matrix.
hit_matrix <- matrix(NA, nrow = N_samples, ncol = 2,
                     dimnames = list(NULL, c("ip_hit", "port_hit")))
hashed_matrix <- matrix(NA_real_, nrow = N_samples, ncol = 8,
                        dimnames = list(NULL, nae_columns))
draw_matrix <- matrix(NA_integer_, nrow = N_samples, ncol = 8,
                      dimnames = list(NULL, nae_columns))

# seq_len() keeps the loop empty when N_samples is 0 (1:0 would count down).
for (i in seq_len(N_samples)) {
  # Step 1. Two (source, destination) draws for the IP NAE
  ip_draw <- c(source_and_dest_draw(num_ip), source_and_dest_draw(num_ip))
  # Step 2. Hash the four IP values
  ip_hashed <- hash_all(ip_draw)
  # Step 3. Hit only if both the source hashes and the destination hashes match
  ip_hit <- (ip_hashed[1] == ip_hashed[3]) && (ip_hashed[2] == ip_hashed[4])

  # Steps 1-3 repeated for the Port NAE
  port_draw <- c(source_and_dest_draw(num_port), source_and_dest_draw(num_port))
  port_hashed <- hash_all(port_draw)
  port_hit <- (port_hashed[1] == port_hashed[3]) &&
    (port_hashed[2] == port_hashed[4])

  #ip_port_draw = c( source_and_dest_draw(num_ip_port), source_and_dest_draw(num_ip_port) )       # Step 1. For IP_Port addresses
  #ip_port_hashed = hash_all(ip_port_draw)                                                        # Step 2. For IP_Port addresses
  #ip_port_hit = (ip_port_hashed[1]==ip_port_hashed[3]) & (ip_port_hashed[2]==ip_port_hashed[4])  # Step 3. For IP_Port addresses

  # Store this sample as one row of each result matrix (IP values first,
  # then Port values, matching the column names).
  hit_matrix[i, ] <- c(ip_hit, port_hit)
  hashed_matrix[i, ] <- c(ip_hashed, port_hashed)
  draw_matrix[i, ] <- c(ip_draw, port_draw)
}


# Step 4. A total hit requires a hit on every simulated NAE in the same sample;
# the mean of the logical vector is the estimated probability.
vector_total_hits <- hit_matrix[, "ip_hit"] & hit_matrix[, "port_hit"]
prob_total_hit <- mean(vector_total_hits)
print(paste0("Probability total hit : ", prob_total_hit))


[1] "Probability total hit : 0"


In [None]:
%%R

# Collision probabilities for the IP and Port NAEs taken separately: the
# fraction of samples flagged as a hit in the corresponding hit_matrix column.
prob_ip_hit <- mean(hit_matrix[, "ip_hit"])
prob_port_hit <- mean(hit_matrix[, "port_hit"])
print(paste0("Probability IP hit : ", prob_ip_hit))
print(paste0("Probability Port hit : ", prob_port_hit))

[1] "Probability IP hit : 2e-07"
[1] "Probability Port hit : 2e-07"


In [None]:
%%R
# Theoretical probability for a single NAE: the source hashes and the
# destination hashes must both collide, each with probability 1/k (assuming
# uniformly distributed, independent hash values).
print((1 / k)^2)
# Theoretical probability for all 3 NAEs (three independent single-NAE hits).
# NOTE(review): the simulation only draws the IP and Port NAEs (the IP&Port
# draw is commented out), so the matching two-NAE value would be (1 / k)^4.
print((1 / k)^6)

[1] 1.111111e-07
[1] 1.371742e-21


In [None]:
%%R
hit_matrix[1:20,]

      ip_hit port_hit
 [1,]  FALSE    FALSE
 [2,]  FALSE    FALSE
 [3,]  FALSE    FALSE
 [4,]  FALSE    FALSE
 [5,]  FALSE    FALSE
 [6,]  FALSE    FALSE
 [7,]  FALSE    FALSE
 [8,]  FALSE    FALSE
 [9,]  FALSE    FALSE
[10,]  FALSE    FALSE
[11,]  FALSE    FALSE
[12,]  FALSE    FALSE
[13,]  FALSE    FALSE
[14,]  FALSE    FALSE
[15,]  FALSE    FALSE
[16,]  FALSE    FALSE
[17,]  FALSE    FALSE
[18,]  FALSE    FALSE
[19,]  FALSE    FALSE
[20,]  FALSE    FALSE


In [None]:
%%R
draw_matrix[1:20,]

      ip_s_1 ip_d_1 ip_s_2 ip_d_2 port_s_1 port_d_1 port_s_2 port_d_2
 [1,]   4277   6101   1475   3542    51524    24047    44229    38960
 [2,]   1728   5855   2219   7702    28318    55430    12060    48848
 [3,]   5726   6583   1623   6986    30708     1821    60306    61032
 [4,]   3279   2082   6362   5759     7353    37262    49491    19239
 [5,]   3658    182   3470   2239     9530    15573    44032     9275
 [6,]   5987   5225   3491   2750    30590    34133    45354     1349
 [7,]   6285   4548   5056    602    26689    18134    33434    41391
 [8,]   3928   5813   3203   6950     2946     3109    29243    45347
 [9,]   7930   3715   5473   6064     5217     8218    18131     5763
[10,]   3238    959   1473   2737    16786    56559    40156    55779
[11,]   5450   7080   4303   7287    51988    56731    39183     1336
[12,]   4613   2709   6586   4468    10457    58840    47672    18889
[13,]   3873   1941   6048   4928    23602    43875      168    10515
[14,]   3488   1609 

In [None]:
%%R
hashed_matrix[1:20,]

      ip_s_1 ip_d_1 ip_s_2 ip_d_2 port_s_1 port_d_1 port_s_2 port_d_2
 [1,]   2884    738   2824   2541     2067     2204      697     1339
 [2,]    326   1996    549   2716      722     1570      231     2237
 [3,]   2642   2948   1159   2524      686     1740     1488     2818
 [4,]   2998   2629   1928   1128     1602     1767     1945     2248
 [5,]    419   1624   2364   1860      888     2392     1910     1440
 [6,]   1575    378   2004    360     2732     2882     1000     2260
 [7,]    439   1800   1303   2183     2040     1286     1751      541
 [8,]    316    442   1388    820     2439      285     2784     2941
 [9,]   1272   2302   2838    741      851     1532      323      322
[10,]    449   2444   2506   2408     1177     2618     1508       30
[11,]    508    107   1671    309        0     1606      700     1008
[12,]   1954   2746   1656   1430     1861     1719     1150     2960
[13,]   2203   2621    265    949     1597      515      169      967
[14,]   1228    823 