<a href="https://colab.research.google.com/github/maciejskorski/ml_examples/blob/master/RenyiEntropyEstimation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Renyi Entropy Estimation

This project implements the sample-optimal adaptive Renyi entropy estimator.

# Dataset

In [36]:
import pandas as pd

# Fetch the raw birthday-count table with curl (IPython shell escape) and
# save it next to the notebook as birthdays.csv.
!curl -o birthdays.csv https://www.panix.com/~murphy/bdata.txt # download data
df = pd.read_csv('birthdays.csv',sep=' ') # read the space-separated file as a dataframe
df = df[:-1] # drop the last row (a control row, not data)
# Prefix every date string with the year 2000 before parsing.
# NOTE(review): assumes the file stores dates without a year — TODO confirm.
df['date'] = pd.to_datetime('2000'+df['date']) # convert dates
df.head()

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  4049  100  4049    0     0  19099      0 --:--:-- --:--:-- --:--:-- 19009


Unnamed: 0,date,count
0,2000-01-01,1482
1,2000-01-02,1213
2,2000-01-03,1220
3,2000-01-04,1319
4,2000-01-05,1262


In [120]:
import pandas as pd
# Assemble the raw-file URL of the births CSV from its repository coordinates.
host = 'raw.github.com'
user = 'fivethirtyeight'
repo = 'data'
branch = 'master'
file = 'births/US_births_2000-2014_SSA.csv'
url = f'https://{host}/{user}/{repo}/{branch}/{file}'
df = pd.read_csv(url,sep=',',header=0)
# pd.to_datetime can assemble timestamps directly from year/month/day columns,
# so rename `date_of_month` to the `day` key it expects instead of casting each
# row to strings and joining them with '-'.
df['date'] = pd.to_datetime(
    df[['year','month','date_of_month']].rename(columns={'date_of_month':'day'})
)
# Keep only the two columns the analysis below reads.
df = df[['date','births']]
df.head()

Unnamed: 0,date,births
0,2000-01-01,9083
1,2000-01-02,8006
2,2000-01-03,11363
3,2000-01-04,13032
4,2000-01-05,12558


# Birthday Inference

In [123]:
import numpy as np
from scipy.special import binom

# Total births for each day-of-week number derived from the date column.
by_weekday = df.groupby(df['date'].dt.day_of_week)
birthdays = by_weekday['births'].sum()

# Empirical collision probability: pairs that share a weekday divided by
# all pairs drawn from the pooled births.
same_day_pairs = binom(birthdays,2).sum()
all_pairs = binom(birthdays.sum(),2)
p_col = same_day_pairs/all_pairs

# Report the estimate next to the uniform baseline 1/7.
print('Birthday on same day probability is 	{:.4f}'.format(p_col))
print('Collision probability of a distribution uniform over 7 elements is 	{:.4f}'.format(1/7))

Birthday on same day probability is 	0.1479
Collision probability of a distribution uniform over 7 elements is 	0.1429


In [89]:
# Number of categories (seven weekdays); 1/m is the collision probability of
# the uniform distribution over m elements.
m = 7

# n enters the confidence bounds below; it is the pooled birth count scaled
# down by 100. (A previous `n = df['births'].sum()` was overwritten right away
# and has been dropped as a dead assignment.)
# NOTE(review): the factor 100 is not explained anywhere in the notebook —
# confirm the intended effective sample size.
n = birthdays.sum()/100

# Excess of the observed collision probability over the uniform baseline 1/m.
gamma = p_col-1/m


In [95]:
# Fraction of all births that falls into each group of `birthdays`.
birthdays.div(birthdays.sum())

day_of_week
1    0.149806
2    0.165225
3    0.162560
4    0.161536
5    0.158396
6    0.107812
7    0.094664
Name: births, dtype: float64

In [90]:
# gamma minus the margin sqrt(4*gamma/(m*n) + 4*gamma^1.5/n).
margin = np.sqrt(4*gamma/(m*n)+4*gamma**1.5/n)
gamma - margin

0.004994285543601433

In [93]:
def confidence_ub(gamma,m=12,n=n):
  """Return gamma plus the margin sqrt(4*gamma/(m*n) + 4*gamma**1.5/n).

  Args:
    gamma: value the margin is added to (the notebook passes an excess
      collision probability — presumably non-negative; confirm).
    m: divisor in the first margin term; presumably the number of
      categories — confirm. Defaults to 12.
    n: divisor in both margin terms; presumably the sample size — confirm.
      The default is the module-level `n` captured when this `def` runs,
      so later reassignments of `n` are not picked up.

  Returns:
    gamma increased by the margin.
  """
  margin = np.sqrt(4*gamma/(m*n)+4*gamma**1.5/n)
  return gamma+margin


def confidence_lb(gamma,m=12,n=n):
  """Return gamma minus the margin sqrt(4*gamma/(m*n) + 4*gamma**1.5/n).

  Args:
    gamma: value the margin is subtracted from (the notebook passes an
      excess collision probability — presumably non-negative; confirm).
    m: divisor in the first margin term; presumably the number of
      categories — confirm. Defaults to 12.
    n: divisor in both margin terms; presumably the sample size — confirm.
      The default is the module-level `n` captured when this `def` runs,
      so later reassignments of `n` are not picked up.

  Returns:
    gamma decreased by the margin.
  """
  margin = np.sqrt(4*gamma/(m*n)+4*gamma**1.5/n)
  return gamma-margin


# Evaluate the upper bound at gamma = 0.1423 with the default m and n.
# NOTE(review): the default m is 12, whereas the weekday analysis sets m = 7 —
# confirm which value is intended here.
confidence_ub(gamma=0.1423)

0.14294927053337955

In [75]:
# Display the current value of p_col.
# NOTE(review): the saved output of this cell (0.0835...) does not match the
# 0.1479 printed by the weekday cell, so it likely comes from an earlier run
# on different data — re-run the notebook top to bottom to refresh it.
p_col

0.08350754159367922