In [47]:
import math
import scipy.stats
import numpy

In [48]:
def getData(direction):
    sample_distribution_file = open(direction, "r")
    data = [float(x.replace(",", ".")) for x in sample_distribution_file.read().splitlines()]
    return data

In [49]:
path = "../continuous/data/data_dagum_4P.txt"
data = getData(path)

In [50]:
def num_bins_doane(data):
    N = len(data)
    skewness = scipy.stats.skew(data)
    sigma_g1 = math.sqrt((6 * (N - 2)) / ((N + 1) * (N + 3)))
    num_bins = 1 + math.log(N, 2) + math.log(1 + abs(skewness) / sigma_g1, 2)
    return math.ceil(num_bins)

def num_bins_freedman_diaconis(data):
    iqr = numpy.percentile(data, 75) - numpy.percentile(data, 25)
    bin_width = 2 * iqr / (len(data) ** (1/3))
    num_bins = (max(data) - min(data)) / bin_width
    return math.ceil(num_bins)


def num_bins_scott(data):
    sigma = numpy.std(data)
    bin_width = 3.5 * sigma / (len(data) ** (1 / 3))
    num_bins = (max(data) - min(data)) / bin_width
    return math.ceil(num_bins)


def num_bins_stone(data):
    kde = scipy.stats.gaussian_kde(data)
    min_error = float("inf")
    best_bins = 10
    for num_bins in range(10, 200):
        bin_edges = numpy.linspace(min(data), max(data), num_bins + 1)
        bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
        hist, _ = numpy.histogram(data, bins=bin_edges)
        hist_density = hist / (len(data) * (bin_edges[1] - bin_edges[0]))
        kde_values = kde(bin_centers)
        error = numpy.sum((hist_density - kde_values) ** 2)
        if error < min_error:
            min_error = error
            best_bins = num_bins
    return best_bins


def num_bins_rice(data):
    return math.ceil(2 * (len(data) ** (1 / 3)))


def num_bins_sturges(data):
    return math.ceil(numpy.log2(len(data)) + 1)


def num_bins_sqrt(data):
    return math.ceil(numpy.sqrt(len(data)))


In [51]:
## max("fd", "sturges")
len(numpy.histogram_bin_edges(data, bins="auto"))

67

##### **Freedman Diaconis Estimator**
$$
h = 2 \frac{IQR}{n^{1/3}}
$$

In [52]:
len(numpy.histogram_bin_edges(data, bins="fd")) - 1, num_bins_freedman_diaconis(data)

(66, 66)

##### **Doane**
$$
n_h = 1 + \log_{2}(n) +\log_{2}\left(1 + \frac{|g_1|}{\sigma_{g_1}}\right)\\
g_1 = E\left[\left(\frac{x - \mu}{\sigma}\right)^3\right]\\
\sigma_{g_1} = \sqrt{\frac{6(n - 2)}{(n + 1)(n + 3)}}
$$

In [53]:
len(numpy.histogram_bin_edges(data, bins="doane")) - 1, num_bins_doane(data)

(18, 18)

##### **Scott**
$$
h = \sigma \sqrt[3]{\frac{24 \sqrt{\pi}}{n}}
$$

In [54]:
len(numpy.histogram_bin_edges(data, bins="scott")) - 1, num_bins_scott(data)

(39, 38)

##### **Stone**
$$

$$

In [55]:
len(numpy.histogram_bin_edges(data, bins="stone")) - 1, num_bins_stone(data)

(40, 14)

##### **Rice**
$$
n_h = 2n^{1/3}
$$

In [56]:
len(numpy.histogram_bin_edges(data, bins="rice")) - 1, num_bins_rice(data)

(26, 26)

##### **Sturges**
$$
n_h = \log _{2}(n) + 1
$$

In [57]:
len(numpy.histogram_bin_edges(data, bins="sturges")) - 1, num_bins_sturges(data)

(13, 13)

##### **Sqrt**
$$
n_h = \sqrt n
$$

In [58]:
len(numpy.histogram_bin_edges(data, bins="sqrt")) - 1, num_bins_sqrt(data)

(47, 47)