In [None]:
%load_ext rpy2.ipython

  from pandas.core.index import Index as PandasIndex


In [None]:
%%R
library(digest)

# Simulation parameters.
k <- 3000             # hash modulus: hash_element() reduces every hash modulo k
num_ip <- 8330        # number of unique IP elements
num_port <- 62434     # number of unique port elements
num_ip_port <- 218406 # number of unique IP&port elements

hex_to_int = function(h) {
  # Convert a hexadecimal string to its numeric value.
  #
  # Args:
  #   h: character string of hex digits (case-insensitive). Only the first
  #      element of h is used.
  # Returns:
  #   The value as a double; NA if h contains a character that is not a hex digit.
  hex_digits = c(as.character(0L:9L), letters[1L:6L])
  chars = strsplit(tolower(h), "")[[1L]]
  # match() gives the 1-based position in hex_digits, so subtract 1 for the digit value.
  digit_values = match(chars, hex_digits) - 1L
  # The leftmost character is the most significant digit.
  exponents = rev(seq_along(chars)) - 1
  sum(digit_values * 16^exponents)
}

#Procedure
# Step 1
# Create a matrix (draw_matrix) with one sample per row and 12 columns. The first 4 columns are the IP first-draw source and destination addresses
# followed by the IP second-draw source and destination addresses; the next 4 columns are the Port first-draw source and destination addresses
# followed by the Port second-draw source and destination addresses; and the last 4 columns are the IP&Port first-draw source and destination addresses
# followed by the IP&Port second-draw source and destination addresses.
# Step 2
# Apply a hash function to draw_matrix using the maximum hash value (k). This produces the hashed_matrix.
# Step 3
# Detect a hit if the first and second draws of the source have the same hash value and, at the same time,
# the first and second draws of the destination also have the same hash value. This step produces a matrix with three columns, one each for the
# hits on ip, port and ip_port addresses. This is the hit_matrix.
# Step 4
# Calculate the probability of a simultaneous collision/hit in the ip, port and ip&port for the source and destination addresses.

# Seed the random number generator once, from a recorded value, so a run can be
# reproduced by re-using the same w. Re-seeding before every draw is not needed:
# each call to the sampler advances the generator state on its own, independently
# of the system clock.
w = sample(1:100000, 1)
set.seed(w)

source_and_dest_draw <- function(num_max) {
  # Draw a (source, destination) pair of distinct integers from the range [1, num_max].
  #
  # Args:
  #   num_max: size of the population to draw from; must be at least 2.
  # Returns:
  #   Integer vector of length 2 (source, destination), drawn without replacement.
  # Raises:
  #   An error if num_max is smaller than 2, because two distinct values cannot be drawn.
  #
  # sample.int() is used instead of sample(1:num_max, ...) so that num_max = 0 is
  # rejected rather than silently sampling from c(1, 0).
  return(sample.int(num_max, 2, replace=FALSE))
}

hash_all <- function(vals) {
  # Hash every element of vals with hash_element().
  #
  # Args:
  #   vals: vector of values to hash (may be empty).
  # Returns:
  #   Numeric vector with one hashed value per element of vals. The range of the
  #   values is whatever hash_element() produces (it reduces each hash modulo k).
  #
  # vapply() is used instead of sapply() so the result is always a numeric vector,
  # including for empty input, where sapply() would return an empty list.
  return(vapply(vals, hash_element, numeric(1)))
}


hash_element <- function(val) {
  # Hash a single value and reduce it modulo the global k.
  #
  # Args:
  #   val: the value to hash.
  # Returns:
  #   hex_to_int() of the xxhash32 digest of val, modulo k; for a positive k the
  #   result therefore lies in the range [0, k-1].
  hex_digest <- digest(val, algo='xxhash32')
  hashed <- hex_to_int(hex_digest) %% k
  return(hashed)
}


In [3]:
%%R
start.time <- Sys.time()

N_samples = 1e+7   # number of samples taken for the simulation

# Column layout shared by draw_matrix and hashed_matrix: for each address kind
# (ip, port, ip_port) the first-draw source and destination, followed by the
# second-draw source and destination.
address_colnames <- c("ip_s_1", "ip_d_1", "ip_s_2","ip_d_2","port_s_1", "port_d_1", "port_s_2","port_d_2", "ip_port_s_1", "ip_port_d_1", "ip_port_s_2","ip_port_d_2")

# Pre-allocate the result matrices with their final storage type so that the
# first row assignment does not force a type conversion of the whole matrix.
hit_matrix = matrix(NA,nrow=N_samples,ncol=3)
colnames(hit_matrix) <- c("ip_hit", "port_hit", "ip_port_hit")
hashed_matrix = matrix(NA_real_,nrow=N_samples,ncol=12)
colnames(hashed_matrix) <- address_colnames
draw_matrix = matrix(NA_integer_,nrow=N_samples,ncol=12)
colnames(draw_matrix) <- address_colnames

simulate_address_kind <- function(num_elements) {
  # Run steps 1-3 of the procedure for one kind of address.
  #
  # Args:
  #   num_elements: number of unique elements of this kind to draw from.
  # Returns:
  #   A list with
  #     draw   - the 4 drawn values (source 1, dest 1, source 2, dest 2),
  #     hashed - the hashed value of each draw, in the same order,
  #     hit    - TRUE when both sources share a hash value and both
  #              destinations share a hash value.
  draw = c( source_and_dest_draw(num_elements), source_and_dest_draw(num_elements) ) # Step 1
  hashed = hash_all(draw)                                                            # Step 2
  hit = (hashed[1]==hashed[3]) && (hashed[2]==hashed[4])                             # Step 3
  list(draw=draw, hashed=hashed, hit=hit)
}

# seq_len() keeps the loop empty when N_samples is 0 (1:0 would iterate over 1 and 0).
for (i in seq_len(N_samples)){
  ip = simulate_address_kind(num_ip)             # IP addresses
  port = simulate_address_kind(num_port)         # Port addresses
  ip_port = simulate_address_kind(num_ip_port)   # IP_Port addresses

  #build the final matrices, one row per sample, columns in the order ip, port, ip_port
  hit_matrix[i,] = c(ip$hit, port$hit, ip_port$hit)
  hashed_matrix[i,] = c(ip$hashed, port$hashed, ip_port$hashed)
  draw_matrix[i,] = c(ip$draw, port$draw, ip_port$draw)
}


# Step 4. Calculate the probability of a simultaneous hit for all NAEs.
# A sample is a total hit only when all three columns are TRUE; the vectorised
# `&` gives the same result as apply(hit_matrix, 1, all) without a per-row call.
vector_total_hits = hit_matrix[,1] & hit_matrix[,2] & hit_matrix[,3]
prob_total_hit = mean(vector_total_hits)        # Step 4
print(paste0("Probability total hit : ",prob_total_hit))

end.time <- Sys.time()
time.taken <- end.time - start.time
print(time.taken)


[1] "Probability total hit : 0"
Time difference of 3.403605 hours


In [12]:
%%R

# Collision probabilities for IP, Port and IP&Port taken separately.
# Each column of hit_matrix is logical, so its mean is the fraction of samples
# that produced a hit (column 1 = IP, column 2 = Port, column 3 = IP&Port).
prob_ip_hit <- mean(hit_matrix[, 1])
prob_port_hit <- mean(hit_matrix[, 2])
prob_ip_port_hit <- mean(hit_matrix[, 3])

print(paste0("Probability IP hit : ", prob_ip_hit))
print(paste0("Probability Port hit : ", prob_port_hit))
print(paste0("Probability IP&Port hit : ", prob_ip_port_hit))

[1] "Probability IP hit : 1e-07"
[1] "Probability Port hit : 1e-07"
[1] "Probability IP&Port hit : 1e-07"


In [13]:
%%R
# Theoretical hit probability for a single NAE: the source pair and the
# destination pair must both collide (assuming each pair collides with
# probability 1/k).
print((1/k)^2)
# Theoretical probability of a simultaneous hit on all 3 NAEs
# (three independent single-NAE hits).
print((1/k)^6)

[1] 1.111111e-07
[1] 1.371742e-21


In [14]:
%%R
# Preview the first 20 rows of the hit matrix (one logical column per address kind).
hit_matrix[1:20,]

      ip_hit port_hit ip_port_hot
 [1,]  FALSE    FALSE       FALSE
 [2,]  FALSE    FALSE       FALSE
 [3,]  FALSE    FALSE       FALSE
 [4,]  FALSE    FALSE       FALSE
 [5,]  FALSE    FALSE       FALSE
 [6,]  FALSE    FALSE       FALSE
 [7,]  FALSE    FALSE       FALSE
 [8,]  FALSE    FALSE       FALSE
 [9,]  FALSE    FALSE       FALSE
[10,]  FALSE    FALSE       FALSE
[11,]  FALSE    FALSE       FALSE
[12,]  FALSE    FALSE       FALSE
[13,]  FALSE    FALSE       FALSE
[14,]  FALSE    FALSE       FALSE
[15,]  FALSE    FALSE       FALSE
[16,]  FALSE    FALSE       FALSE
[17,]  FALSE    FALSE       FALSE
[18,]  FALSE    FALSE       FALSE
[19,]  FALSE    FALSE       FALSE
[20,]  FALSE    FALSE       FALSE


In [15]:
%%R
# Preview the first 20 rows of the raw draws.
draw_matrix[1:20,]

      ip_s_1 ip_d_1 ip_s_2 ip_d_2 port_s_1 port_d_1 port_s_2 port_d_2
 [1,]   2420   3055   1241   1886    19995    61066    40579    54757
 [2,]   5064   6340   4896   5255    31908    32658    34423    27154
 [3,]   5399   1856   1438   1106    54535    56599    37661    22154
 [4,]   5030   7775    265   2823     2322      334    56246    54395
 [5,]   7826   5367   7437   6639    51363    50255    31723     3346
 [6,]   5306   6980   2197   7870    43130    27998    10742    27223
 [7,]   2746   5614     65   2780    54664    37078    46249    52158
 [8,]   7521   2735   5402    861    33880     3433    25791     1086
 [9,]   5016   1154   3288   3685    33598    30563    16059    49801
[10,]   2452   6059   5026   8271    24900    48954    61803    43161
[11,]   5873   3316   2994   4003    31819    20409    28270    40334
[12,]   1385   2459   8228   2021    47876    36972    47143     3484
[13,]   2341   4662    503   1198    31193    39993     4960    18796
[14,]   5271   1423 

In [16]:
%%R
# Preview the first 20 rows of the hashed draws.
hashed_matrix[1:20,]

      ip_s_1 ip_d_1 ip_s_2 ip_d_2 port_s_1 port_d_1 port_s_2 port_d_2
 [1,]   1133    563   2637    857     2361     2339     1860      384
 [2,]    440   2137   2892   2691     1919       74      264     1378
 [3,]   1236   2876    410     51     2478     1892     2223      872
 [4,]   2559   2992   1814    289     1088      930     1968     2171
 [5,]   1121   1623    242    474      802       51     1487      383
 [6,]    522   1269   1658    966     2292      290     1989      303
 [7,]   2494    211    666   2412     2078      218     1177      205
 [8,]   2283   2605   2065   2114     1220     2498     1465     1337
 [9,]   2287   2555   2871   1892     1597      254      242     2128
[10,]   1709   1228    187   2343     1218     1562     2387      443
[11,]    374   1151   1535   1542      910     1057     2050     2212
[12,]   2807   1613   2892   2978     2587      500       54      381
[13,]   2713   2164   1956   1299      473     2917      717      468
[14,]    910   1114 

In [28]:
# Store all  matrices

In [22]:
# Mount Google Drive at /content/drive (this only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [23]:
# set working directory
import os

# Relative paths used by the write.csv() cells resolve against this directory.
# A single leading slash is used: a path starting with "//" is
# implementation-defined on POSIX systems.
os.chdir("/content/drive/My Drive/Colab Notebooks/")

# Check current working directory.
retval = os.getcwd()
print("Current working directory %s" % retval)

Current working directory /content/drive/My Drive/Colab Notebooks


In [25]:
%%R
# Save the hit matrix as a CSV file; the relative path resolves against the current working directory.
write.csv(hit_matrix, file='hit_matrix.csv')

In [26]:
%%R
# Save the hashed draws as a CSV file; the relative path resolves against the current working directory.
write.csv(hashed_matrix, file='hashed_matrix.csv')

In [27]:
%%R
# Save the raw draws as a CSV file; the relative path resolves against the current working directory.
write.csv(draw_matrix, file='draw_matrix.csv')