# DeadSpace
This notebook processes ETH `to_address` values queried up to block 10,000,000 and computes the Chi-Squared and Kolmogorov-Smirnov test statistics for each address to determine which addresses are not randomly generated (outliers).

Based on an initial analysis of the outliers in the Chi and KS values calculated for a sample of addresses (see the box plots below), we set the filters to chi > 80 and ks > 0.75.

<img src="./media/chi_boxplot.png" alt="chi_boxplot" style="width: 50%;"/>
<img src="./media/ks_boxplot.png" alt="ks_boxplot" style="width: 50%;"/>

In [None]:
import time
from multiprocessing import Pool
import pandas as pd
import scipy
import os
from scipy.stats import kstest
from scipy.stats import chisquare
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Outlier thresholds read by etl(): an address is kept when its statistic is
# strictly greater than the corresponding threshold.
# Threshold for the Kolmogorov-Smirnov statistic returned by ks_str().
ks_filter = 0.75
# Threshold for the chi-squared statistic returned by chi_str().
chi_filter = 80

In [None]:
#Chi-Squared randomness test
def chi_str(add_str: str):
  """Run a chi-squared goodness-of-fit test on the characters of a string.

  The observed count of every distinct character in ``add_str`` is compared
  with a uniform expectation over those same distinct characters
  (``len(add_str) / number_of_distinct_characters`` each).

  Args:
    add_str: The string (e.g. an address) to evaluate.

  Returns:
    A ``(statistic, pvalue)`` tuple taken from ``scipy.stats.chisquare``.
  """
  # Distinct characters; a single set keeps observed/expected values aligned.
  symbols = set(add_str)

  # How often each distinct character actually occurs.
  observed = [add_str.count(symbol) for symbol in symbols]

  # Uniform expectation; evaluated per symbol so that an empty string never
  # triggers a division by zero.
  expected = [len(add_str) / len(symbols) for _ in symbols]

  result = chisquare(observed, f_exp=expected)
  return result.statistic, result.pvalue


In [None]:
#Kolmogorov-Smirnov randomness Test
def ks_str(add_str: str):
  """Run a Kolmogorov-Smirnov test on a string's character positions.

  Every character is mapped to the index of its first occurrence in
  ``add_str`` divided by the string length; the resulting sample is tested
  against the ``'uniform'`` distribution with ``scipy.stats.kstest``.

  Args:
    add_str: The string (e.g. an address) to evaluate.

  Returns:
    A ``(statistic, pvalue)`` tuple taken from ``scipy.stats.kstest``.
  """
  length = len(add_str)

  # Remember where each character shows up for the first time.
  first_position = {}
  for position, symbol in enumerate(add_str):
    first_position.setdefault(symbol, position)

  # One normalised first-occurrence position per character of the string.
  sample = [first_position[symbol] / length for symbol in add_str]

  result = kstest(sample, 'uniform')
  return result.statistic, result.pvalue

In [None]:
def etl(file: str):
  """Extract, score and filter the ``to_address`` values of one input file.

  The gzip archive ``/storage/ETH_addresses/<file>`` is decompressed, ``jq``
  extracts ``.to_address`` from it, each address is scored with ``chi_str``
  and ``ks_str``, and the rows exceeding ``chi_filter`` or ``ks_filter`` are
  written to ``/storage/ETH_randomness/<file>.csv``.

  Args:
    file: Name of the archive inside ``/storage/ETH_addresses/``.

  Returns:
    A ``(file, duration)`` tuple where ``duration`` is the elapsed time in
    seconds, as expected by the loop that consumes the pool results.

  Raises:
    subprocess.CalledProcessError: If ``gunzip`` or ``jq`` exits with a
      non-zero status.
  """
  # Local import so the block does not depend on a new file-level import.
  import subprocess

  start_t = time.perf_counter()

  source_path = '/storage/ETH_addresses/' + file
  raw_path = '/storage/tmp/' + file + '_out.json'
  addresses_path = '/storage/tmp/' + file + '_addresses.json'

  # Unzip file and extract the addresses. Argument lists (no shell) keep file
  # names with spaces or shell metacharacters from breaking the commands.
  try:
    with open(raw_path, 'w') as raw_out:
      subprocess.run(['gunzip', '-k', '-c', source_path], stdout=raw_out, check=True)
    with open(addresses_path, 'w') as addresses_out:
      subprocess.run(['jq', '.to_address', raw_path], stdout=addresses_out, check=True)
  finally:
    # The decompressed dump is only an intermediate artefact.
    if os.path.exists(raw_path):
      os.remove(raw_path)

  # Read file into lines[], dropping the JSON quotes, blank lines and the
  # `null` entries jq prints for records without a to_address (including a
  # trailing one, which has no newline after it).
  with open(addresses_path, "r") as read_file:
    cleaned = (line.strip().replace('"', '') for line in read_file.read().split("\n"))
    lines = [line for line in cleaned if line and line != 'null']

  # PD DataFrame
  df = pd.DataFrame({'Address': lines})

  if df.empty:
    # Nothing to score: still emit a CSV with the usual columns.
    df = df.reindex(columns=['Address', 'chi', 'chi_p', 'ks', 'ks_p'])
  else:
    # evaluate randomness of each address
    df[['chi', 'chi_p']] = df.apply(lambda row: chi_str(row['Address']), axis=1, result_type='expand')
    df[['ks', 'ks_p']] = df.apply(lambda row: ks_str(row['Address']), axis=1, result_type='expand')

    # Apply filter values: chi outliers first, then KS outliers, keeping the
    # first occurrence of every duplicated row.
    df = pd.concat([df[df['chi'] > chi_filter], df[df['ks'] > ks_filter]], axis=0)
    df = df.drop_duplicates()

  # Save output
  df.to_csv('/storage/ETH_randomness/' + file + '.csv')

  return file, time.perf_counter() - start_t


In [None]:
# Collect every entry of the input directory; each one becomes a pool task.
files = os.listdir('/storage/ETH_addresses/')
start_t = time.perf_counter()

# Fan the files out over the worker processes and report each one as soon as
# it finishes (completion order, not listing order).
with Pool() as pool:
  for filename, duration in pool.imap_unordered(etl, files):
    print(f"{filename} completed in {duration:.2f}s")

# Wall-clock time for the whole batch.
total_duration = time.perf_counter() - start_t
print(f"files took {total_duration:.2f}s total")

In [None]:
# sns.boxplot(x=df.chi)
# plt.show()
# sns.boxplot(x=df.ks)
# plt.show()