# Hamming weight and Pearson Correlation

This notebook demonstrates how to guess a floating-point number using the Hamming weight model.


We generate a random secret floating-point number, then use the Hamming weight model to guess it.

We develop a divide-and-conquer method to quickly arrive at a close estimate of the number with a defined precision.

## Guessing...

In [58]:
import struct
import numpy as np
import pandas as pd
import random
from pprint import pprint

from guess_range import *

In [59]:
def generate_known_inputs(exps=(-1, 3), nsize=3000):
    """Draw random known inputs, one row of samples per decimal magnitude.

    For every integer exponent ``e`` from ``exps[0]`` to ``exps[1]``
    (inclusive) a row of values is drawn uniformly between ``-10**e`` and
    ``10**e``. The ``nsize`` samples are shared evenly between the rows; any
    remainder is dropped by the integer truncation.

    Parameters
    ----------
    exps : pair of int
        Smallest and largest exponent of ten to sample around.
    nsize : int
        Total number of samples requested across all rows.

    Returns
    -------
    pandas.DataFrame
        One row per exponent (the exponent is the row label), one column
        per sample.
    """
    first_exp = exps[0]
    stop_exp = exps[1] + 1
    per_row = int(nsize / (stop_exp - first_exp))

    rows = []
    for exponent in range(first_exp, stop_exp):
        bound = 10.0 ** exponent
        rows.append(np.random.uniform(-bound, bound, per_row))

    return pd.DataFrame(data=rows, index=range(first_exp, stop_exp))

In [60]:
def guess_number_range_stat(secret_number, guess_range, prescision, known_input_size=3000):
    """Guess ranges for ``secret_number`` and keep only the strongest, distinct candidates.

    Fresh known inputs are generated for exponents -1..3 and passed, together
    with the secret, the search range and the precision, to
    ``guess_number_range_multiple_inputs``. Its result table is then filtered:

    1. rows whose correlation is not above 95% of the best correlation are dropped;
    2. the remaining rows are ordered by descending correlation;
    3. rows whose range midpoint, rounded to ``int(-log10(prescision))``
       decimals, repeats the midpoint of an earlier row are dropped.

    Parameters
    ----------
    secret_number : the value to recover (forwarded unchanged to the helper).
    guess_range : search range forwarded unchanged to the helper.
    prescision : precision forwarded to the helper; also fixes the rounding
        used to detect duplicate candidates.
    known_input_size : total number of random known inputs to generate.

    Returns
    -------
    The filtered result table (presumably a pandas DataFrame with the
    LOW_VALUE / HIGH_VALUE / CORRELATION columns -- confirm against
    ``guess_range.guess_number_range_multiple_inputs``).
    """
    inputs = generate_known_inputs(exps=(-1, 3), nsize=known_input_size)
    candidates = guess_number_range_multiple_inputs(secret_number, guess_range, prescision, inputs)

    # discard low correlation results: keep what is within 5% of the best score
    scores = candidates[CORRELATION]
    threshold = scores.max() * 0.95
    ranked = candidates[scores > threshold].sort_values(CORRELATION, ascending=False)

    # two ranges count as the same guess when their midpoints agree at the
    # requested precision; the better-correlated (earlier) row is the one kept
    n_decimals = int(-np.log10(prescision))
    midpoints = (ranked[LOW_VALUE] + ranked[HIGH_VALUE]) / 2.0
    is_repeat = midpoints.round(decimals=n_decimals).duplicated(keep='first')
    return ranked[~is_repeat]

In [61]:
# shared parameters for every experiment below
# (low, high) interval handed to the guessing routines
guess_range = (-5.0, 5.0)
# precision handed to the guessing routines
prescision = 1e-6
# total number of random known inputs generated per guess
known_input_size = 3000

## Test single point

In [62]:
# draw the secret uniformly from the guess range scaled down by a factor of ten
secret_number = random.uniform(guess_range[0] / 10, guess_range[1] / 10) 

In [63]:
# run the range-guessing procedure once against the single secret
single_results = guess_number_range_stat(secret_number, guess_range, prescision, known_input_size)
# print the true secret, then display the surviving candidate ranges for comparison
print(secret_number)
single_results

0.22602603309712088


Unnamed: 0,low value,high value,correlation
1,0.226026,0.226026,0.997718
-1,3.616416,3.616417,0.981344


In [64]:
# the midpoint of each candidate range is taken as the guessed value
guess_values = single_results[LOW_VALUE].add(single_results[HIGH_VALUE]).div(2.0)
# relative error of every candidate against the secret, in percent
error_rates = guess_values.sub(secret_number).div(secret_number).abs().mul(100.0)
pprint(error_rates)
# report the candidate with the smallest relative error
guessed_number = guess_values.loc[error_rates.idxmin()]
pprint('the secret number = %f' % secret_number)
pprint('the guessed number = %f' % guessed_number)
# this is the correlation of the lowest-error candidate, looked up by its row label
pprint('best correlation = %f' % single_results.loc[error_rates.idxmin(), CORRELATION])
pprint('error rate = %0.6f%s' % (error_rates.min(), '%'))

 1       0.000068
-1    1500.000099
dtype: float64
'the secret number = 0.226026'
'the guessed number = 0.226026'
'best correlation = 0.997718'
'error rate = 0.000068%'


## Test multiple points

In [8]:
n_secret_numbers = 100
# each pair is (scale applied to guess_range, number of quarters of the
# secrets drawn at that scale): half over the full range, a quarter over 30%
# of it and a quarter over 10% of it
secret_numbers = np.concatenate([
    np.random.uniform(guess_range[0] * scale, guess_range[1] * scale,
                      int(n_secret_numbers * quarters / 4))
    for scale, quarters in ((1, 2), (3e-1, 1), (1e-1, 1))
])
# mix the three groups so the processing order carries no information
np.random.shuffle(secret_numbers)
print('secret_numbers.shape = %s' % str(secret_numbers.shape))

secret_numbers.shape = (100,)


In [9]:
# name of the column that records which secret a result row belongs to
SECRET_VALUE = 'secret value'

# Collect one result frame per secret and concatenate once at the end:
# growing a DataFrame with pd.concat inside the loop copies all previous rows
# on every iteration and starts from an empty frame.
result_frames = []
for secret in secret_numbers:
    # use the configured number of known inputs, as the single-point test does
    results = guess_number_range_stat(secret, guess_range, prescision, known_input_size)
    # assign() returns a new frame, so the filtered frame returned by the
    # helper is never written to in place
    results = results.assign(**{SECRET_VALUE: secret})
    pprint('=' * 20)
    pprint(results)
    result_frames.append(results)

# an empty list of secrets yields an empty frame instead of a concat error
multiple_results = (
    pd.concat(result_frames, axis=0, ignore_index=True) if result_frames else pd.DataFrame()
)

known_input_size = 500


KeyboardInterrupt: 

### Save data

In [None]:
from datetime import datetime
# timestamp the file name so successive runs never overwrite each other
multiple_results_fname = 'multiple_results-%s.csv' % datetime.now().strftime("%Y%m%d_%H%M%S")
# written to the current working directory; the row index is saved as the first column
multiple_results.to_csv(multiple_results_fname)
pprint('multiple_results_fname = %s' % multiple_results_fname)

### Load data

In [None]:
# uncomment and edit the timestamp to reload the results of an earlier run
# multiple_results_fname = 'multiple_results-%s.csv' % '20200522_071337'
# the file was written with its row index as the first column; read it back as
# the index so that no stray "Unnamed: 0" data column appears
multiple_results = pd.read_csv(multiple_results_fname, index_col=0)

### Hit targets and missed targets

In [None]:
#
# compute the number of secret values which are present in guessed ranges
#
# row-wise test, vectorised over the whole table: does the candidate range
# [low, high] contain the secret it was computed for?
secret_in_range = (
    (multiple_results[LOW_VALUE] <= multiple_results[SECRET_VALUE])
    & (multiple_results[SECRET_VALUE] <= multiple_results[HIGH_VALUE])
)
# a secret counts as identified when at least one of its candidate ranges contains it
n_identified_ranges = secret_in_range.groupby(multiple_results[SECRET_VALUE]).any()
counts = n_identified_ranges.value_counts()
# always report both figures; a category with no members is reported as 0
pprint('The number of identified values: %d/%d' % (counts.get(True, 0), counts.sum()))
pprint('The number of missed values: %d/%d' % (counts.get(False, 0), counts.sum()))

### Graphs of the first choices

In [None]:
# "first choice" = the first stored row of every secret value
# (presumably the best-correlated candidate, since each per-secret table was
# sorted by descending correlation before being appended -- confirm)
first_choice_results = multiple_results.groupby(SECRET_VALUE).first()
# midpoint of the chosen range, indexed and ordered by secret value
guess_values = first_choice_results[LOW_VALUE].add(first_choice_results[HIGH_VALUE]).div(2.0).sort_index()

ax = guess_values.plot(figsize=(12, 6), marker='.', label='guessed values')
# identity line: a perfect guess would lie exactly on it
ax.plot(guess_values.index, guess_values.index, marker='.', linewidth=1, label='secret_numbers')
ax.set(xlabel='secret values', ylabel='guessed values', title='the first choices')
ax.legend()
ax.grid(True)

In [None]:
# "first choice" = the first stored row of every secret value
first_choice_results = multiple_results.groupby(SECRET_VALUE).first()
# midpoint of the chosen range, indexed and ordered by secret value
guess_values = first_choice_results[LOW_VALUE].add(first_choice_results[HIGH_VALUE]).div(2.0).sort_index()
# relative error |guess - secret| / |secret|; the index holds the secret values
error_rate = (guess_values - guess_values.index).div(guess_values.index).abs()

ax = error_rate.plot(figsize=(12, 6), label='error rate', marker='.', linewidth=1)
ax.set(xlabel='secret values', ylabel='error rates', title='the first choices')
ax.legend()
ax.grid(True)

### Graphs of the second choice

In [None]:
# Prints (product of the number of candidate rows per secret value) - 1.
# NOTE(review): a product of group sizes grows exponentially with the number of
# secrets and can silently overflow the integer dtype; if the intent is to
# count the candidates beyond each first choice, a sum of (size - 1) may have
# been meant -- confirm the intended quantity.
pprint('The total number of the second choices is: %d' % (multiple_results.groupby(SECRET_VALUE).size().prod()-1))

In [None]:
# "second choice" = any candidate row whose range [low, high] contains its secret
second_choice_results = multiple_results.loc[
    (multiple_results[LOW_VALUE] <= multiple_results[SECRET_VALUE])
    & (multiple_results[SECRET_VALUE] <= multiple_results[HIGH_VALUE])
].set_index(SECRET_VALUE)

# midpoint of each matching range, indexed and ordered by secret value
guess_values = second_choice_results[LOW_VALUE].add(second_choice_results[HIGH_VALUE]).div(2.0).sort_index()

ax = guess_values.plot(figsize=(12, 6), marker='.', label='guessed values')
# identity line: a perfect guess would lie exactly on it
ax.plot(guess_values.index, guess_values.index, marker='.', linewidth=1, label='secret_numbers')
ax.set(xlabel='secret values', ylabel='guessed values', title='the second choices')
ax.legend()
ax.grid(True)

In [None]:
# "second choice" = any candidate row whose range [low, high] contains its secret
second_choice_results = multiple_results.loc[
    (multiple_results[LOW_VALUE] <= multiple_results[SECRET_VALUE])
    & (multiple_results[SECRET_VALUE] <= multiple_results[HIGH_VALUE])
].set_index(SECRET_VALUE)

# midpoint of each matching range, indexed and ordered by secret value
guess_values = second_choice_results[LOW_VALUE].add(second_choice_results[HIGH_VALUE]).div(2.0).sort_index()

# relative error |guess - secret| / |secret|; the index holds the secret values
error_rate = (guess_values - guess_values.index).div(guess_values.index).abs()

ax = error_rate.plot(figsize=(12, 6), label='error rate', marker='.', linewidth=1)
ax.set(xlabel='secret values', ylabel='error rates', title='the second choices')
ax.legend()
ax.grid(True)

# Batina method

In [None]:
def batina_guess_number(secret_number, guess_range, prescision, known_inputs):
    """Exhaustively search ``guess_range`` for the value that best explains the secret.

    Every candidate ``g`` on the grid ``np.arange(low, high, prescision)`` is
    scored by the Pearson correlation between ``hamming_weight(x * g)`` and
    ``hamming_weight(x * secret_number)`` over all known inputs ``x``; the
    candidate with the highest correlation is returned.

    Parameters
    ----------
    secret_number : float
        The value to recover; it is only used to produce the "observed"
        Hamming weights.
    guess_range : pair of float
        ``(low, high)`` bounds of the candidate grid (``high`` excluded, as
        with ``np.arange``).
    prescision : float
        Step of the candidate grid.
    known_inputs : numpy.ndarray
        Known multiplicands (assumed 1-D -- TODO confirm against callers).

    Returns
    -------
    float
        The candidate value with the largest Pearson correlation.

    Notes
    -----
    The table of Hamming weights has ``len(known_inputs)`` rows and one column
    per candidate, so memory grows with ``(high - low) / prescision``.
    """
    low, high = guess_range
    guess_val = np.arange(low, high, prescision)
    # hamming_weight comes from the guess_range module; vectorize applies it element-wise
    hw_of = np.vectorize(hamming_weight)

    # rows: known inputs, columns: candidate values
    hw = pd.DataFrame(columns=guess_val,
                      data=hw_of(known_inputs.reshape(-1, 1) * guess_val))
    # Hamming weights obtained with the real secret, aligned on the same rows
    actual = pd.Series(np.ravel(hw_of(known_inputs * secret_number)), index=hw.index)

    # Correlate every candidate column with the observation only. Building the
    # full candidate-by-candidate correlation matrix to read a single column
    # of it costs time and memory quadratic in the number of candidates.
    return hw.corrwith(actual, method='pearson').idxmax()

In [None]:
# one slot per secret number, filled in below with the Batina guess;
# float64 keeps the guesses at the same precision as the secrets they are
# later compared with
batinta_results = pd.Series(index=secret_numbers, name='guessed_numbers', dtype=np.float64)

# the same random known inputs are reused for every secret
known_inputs = np.random.uniform(-1e1, 1e1, known_input_size)

# we have to define a low precision, otherwise, it takes too long time to run
# NOTE: this overwrites the notebook-wide `prescision`; re-run the parameter
# cell before re-running the range-guessing cells above
prescision = 1e-3

for idx in batinta_results.index:
    # explicit label-based access: the index holds the secret values themselves
    batinta_results.loc[idx] = batina_guess_number(idx, guess_range, prescision, known_inputs)
    print('secret_value = %f, guessed_value = %f' % (idx, batinta_results.loc[idx]))
# order by secret value for plotting (reassigned rather than sorted in place)
batinta_results = batinta_results.sort_index()

### Save data

In [None]:
from datetime import datetime
# timestamp the file name so successive runs never overwrite each other
batinta_results_fname = 'batinta_results-%s.csv' % datetime.now().strftime("%Y%m%d_%H%M%S")
# written to the current working directory; the secret values (the index) are saved as the first column
batinta_results.to_csv(batinta_results_fname)
pprint('batinta_results_fname = %s' % batinta_results_fname)

### Load data

In [None]:
# uncomment and edit the timestamp to reload the results of an earlier run
# batinta_results_fname = 'batinta_results-%s.csv' % '20200522_071337'
# read_csv returns a one-column DataFrame; squeeze it back into a Series so
# that the reloaded object behaves like the one built in memory (the error
# rate cell subtracts the index from it, which fails on a DataFrame)
batinta_results = pd.read_csv(batinta_results_fname, index_col=0).squeeze('columns')
# order by secret value for plotting (reassigned rather than sorted in place)
batinta_results = batinta_results.sort_index()
batinta_results

### Graphs

In [None]:
# guessed value against secret value for the Batina method
ax = batinta_results.plot(figsize=(12, 6), marker='.', label='batina')
#results.plot(ax=ax, marker='.', label='bxlab')
# identity line: a perfect guess would lie exactly on it
ax.plot(batinta_results.index, batinta_results.index, marker='.', linewidth=1, label='secret_numbers')
ax.set(xlabel='secret values', ylabel='guessed values')
ax.legend()
ax.grid(True)

In [None]:
# relative error |guess - secret| / |secret|; the index holds the secret values
batina_error_rate = (batinta_results - batinta_results.index).div(batinta_results.index).abs()
ax = batina_error_rate.plot(figsize=(12, 6), label='batina error rate')
#ax = error_rate.plot(ax=ax, label='error rate')
ax.set(xlabel='secret values', ylabel='error rates')
ax.legend()
ax.grid(True)