-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathfit_distribution.py
66 lines (52 loc) · 1.9 KB
/
fit_distribution.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
"""
Most of this code is from
https://github.com/merely-useful/py-rse/blob/book/zipf/bin/plotcounts.py originally
released under a CC-BY license by the Research Software Engineering for Python
authors.
"""
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.optimize import minimize_scalar
def compute_summary(word_counts):
    """
    Summarise a word count distribution.

    Arguments:
        word_counts: mapping whose values are the per-word counts
            (only ``.values()`` is read).

    Returns:
        A dict with the keys "total_words" (sum of all counts),
        "distinct_words" (number of entries), and "alpha" and "C"
        (the two values returned by ``estimate_zipf`` for the counts).
    """
    counts = np.array(list(word_counts.values()))
    alpha, C = estimate_zipf(counts)

    summary = {}
    summary["total_words"] = counts.sum()
    summary["distinct_words"] = len(counts)
    summary["alpha"] = alpha
    summary["C"] = C
    return summary
def estimate_zipf(word_counts):
    """
    Fit a Zipf distribution to a set of word counts.

    Arguments:
        word_counts: distribution of word counts, as a numpy array

    Returns:
        A tuple ``(alpha, C)``:
            alpha: the exponent of the word count distribution, derived
                from the maximum likelihood estimate of beta as
                ``1 / (beta - 1)``.
            C: the sum of ``rank ** -alpha`` over ranks
                ``1..len(word_counts)``. Dividing ``rank ** -alpha`` by C
                yields probabilities that sum to 1.

    Raises:
        AssertionError: if word_counts is not a numpy array.

    References:
        Moreno-Sanchez et al (2016) define alpha (Eq. 1),
        beta (Eq. 2) and the maximum likelihood estimation (mle)
        of beta (Eq. 6).
        Moreno-Sanchez I, Font-Clos F, Corral A (2016)
        Large-Scale Analysis of Zipf's Law in English Texts.
        PLoS ONE 11(1): e0147073.
        https://doi.org/10.1371/journal.pone.0147073
    """
    # isinstance (rather than an exact type comparison) also accepts
    # ndarray subclasses. The check stays an assert so callers that
    # expect AssertionError keep working.
    assert isinstance(
        word_counts, np.ndarray
    ), "Input must be a numerical (numpy) array of word counts"

    # Minimise the negative log-likelihood over beta. The bracket starts
    # just above 1 because alpha = 1 / (beta - 1) is undefined at beta == 1.
    mle = minimize_scalar(
        _nlog_likelihood,
        bracket=(1 + 1e-10, 4),
        args=(word_counts,),
        method="brent",
    )
    beta = mle.x
    alpha = 1 / (beta - 1)

    # Normalisation constant of the Zipf distribution over the observed
    # number of ranks (a generalised harmonic number).
    # https://en.wikipedia.org/wiki/Zipf%27s_law
    ranks = np.arange(1, len(word_counts) + 1)
    C = (ranks ** (-alpha)).sum()
    return alpha, C
def _nlog_likelihood(beta, counts):
    """
    Negative log-likelihood of the counts for a given beta.

    Each count n contributes
    ``log((1 / n) ** (beta - 1) - (1 / (n + 1)) ** (beta - 1))``;
    the contributions are summed and the sign is flipped so that the
    value can be passed to a minimiser.
    """
    exponent = beta - 1
    lower_term = (1 / counts) ** exponent
    upper_term = (1 / (counts + 1)) ** exponent
    return -np.sum(np.log(lower_term - upper_term))