In [1]:
using Optim

In [2]:
"""
    harmonic(n::Integer)

Return the `n`-th harmonic number, evaluated numerically as the integral of
`(1 - x^n) / (1 - x)` over the interval from `zero(n)` to `one(n)` with `quadgk`.

Only the first element of the `quadgk` result is returned; anything else
`quadgk` reports is discarded.
"""
function harmonic(n::Integer)
    integrand(x) = (1 - x^n) / (1 - x)
    return first(quadgk(integrand, zero(n), one(n)))
end

harmonic (generic function with 1 method)

In [174]:
"""
    harmonic(n::Integer, m; T::DataType=BigFloat)

Return the generalized harmonic number of order `m`: the sum of `1 / k^m`
for `k = 1, ..., n`.

When `m` equals one the call is forwarded to the single-argument `harmonic(n)`
and `T` is not applied to that result. Otherwise `m` is converted to `T` and
the sum is accumulated starting from `zero` of the converted exponent, so `T`
selects the working type of the computation.
"""
function harmonic(n::Integer, m; T::DataType=BigFloat)
    m == one(m) && return harmonic(n)

    exponent = T(m)
    total = zero(exponent)
    # TODO: Good to do this in Parallel for n>10^6
    for k in 1:n
        total += inv(k^exponent)
    end
    return total
end

harmonic (generic function with 2 methods)

In [None]:
# Install the ArbFloats package through the package manager.
# NOTE(review): the cell header carries no execution count, so this was
# presumably never run in the recorded session -- confirm before relying on it.
Pkg.add("ArbFloats")

In [197]:
# Probe whether exponentiation works between two Unum22 values.
# The recorded output for this cell is a LoadError: `^` is not defined for them.
Unum22(1)^Unum22(2)

LoadError: LoadError: ^ not defined for Unums.Unum{2,2,UInt16}
while loading In[197], in expression starting on line 1

In [191]:
# Generalized harmonic number with n = 3_000_000 and exponent 25//10.
# The figures below are results noted for different numeric types
# (presumably obtained by varying the `T` keyword -- TODO confirm).
harmonic(3_000_000,25//10)
#1.3414620161056519 #Float16
#1.3414620161056519 #Float32
#1.341487257103954  #Float64
#1.341487257122617  #BigFloat

LoadError: LoadError: ^ not defined for Unums.Ubound{2,2,UInt16}
while loading In[191], in expression starting on line 1

In [166]:
# Convert an exact Rational to Float64; the recorded output is 1.341487257103954,
# the same figure noted for Float64 in the harmonic(3_000_000, 25//10) cell.
Float64(1510380377803909//1125899906842624
)

1.341487257103954

In [5]:
using Distributions
import Distributions: DiscreteUnivariateDistribution, pdf, cdf, logpdf, logcdf, suffstats, fit_mle
import Base: rand, minimum, maximum, mean
import StatsBase.mode
using StatsBase

In [6]:
"""
    Zepf{N<:Integer, T1<:Real, T2<:Real} <: DiscreteUnivariateDistribution

Discrete distribution over integer ranks with a power-law exponent.

# Fields
- `h`: normalising constant; the two-argument constructor sets it to
  `harmonic(n, s)`.
- `n`: number of categories (the largest rank).
- `s`: exponent of the power law.
"""
immutable Zepf{N<:Integer, T1<:Real,T2<:Real} <: DiscreteUnivariateDistribution
    h::T1
    n::N
    s::T2
end

"""
    Zepf(n, s)

Build a `Zepf` from the number of categories `n` and the exponent `s`,
computing the normalising constant `harmonic(n, s)` once up front.

Raises an error (via `error`) when `n` is not greater than 0 or when `s` is
not greater than 1.
"""
function Zepf{N,T2}(n::N, s::T2)
    # Validate both arguments before the (potentially expensive) harmonic sum.
    n > zero(n) || error("n must be greater than 0 (n=$n)")
    s > one(s) || error("s must be greater than 1 (s=$s)")
    Zepf(harmonic(n, s), n, s)
end

Zepf{N<:Integer,T1<:Real,T2<:Real}

In [7]:
"""
    rand(d::Zepf)

Draw a single rank from `d`.

A value from `Base.rand()` is reduced by the probability mass of rank 1, 2, ...
in turn; the first rank at which the remainder is no longer positive is the
sample.
"""
function rand{N,T1}(d::Zepf{N,T1})
    remaining = Base.rand()
    rank = one(N)
    while true
        remaining -= pdf(d, rank)
        remaining > 0 || break
        rank += one(rank)
    end
    # Handles floats not quite lining up: if the walk runs past the last
    # category, allocate the excess probability to the first (and thus
    # largest) category.
    return rank > d.n ? one(N) : rank
end

"""
    pdf(d::Zepf, k::Int)

Probability mass of rank `k` under `d`, computed as `1 / (d.h * k^d.s)`.
No check is made that `k` lies between 1 and `d.n`.
"""
function pdf(d::Zepf, k::Int)
    weight = k^d.s
    return inv(d.h * weight)
end
"""
    logpdf(d::Zepf, k::Int)

Natural log of the probability mass of rank `k`, computed directly as
`-log(d.h) - d.s * log(k)` rather than by taking the log of `pdf`.
"""
function logpdf(d::Zepf, k::Int)
    lognorm = log(d.h)
    return -lognorm - d.s * log(k)
end
"""
    cdf(d::Zepf, k::Int)

Cumulative probability of ranks up to `k`: the partial sum `harmonic(k, d.s)`
divided by the normalising constant `d.h`.
"""
function cdf(d::Zepf, k::Int)
    partial = harmonic(k, d.s)
    return partial / d.h
end
"""
    logcdf(d::Zepf, k::Int)

Natural log of the cumulative probability of ranks up to `k`, computed as
`log(harmonic(k, d.s)) - log(d.h)`.
"""
function logcdf(d::Zepf, k::Int)
    logpartial = log(harmonic(k, d.s))
    return logpartial - log(d.h)
end

#quantile is annoying to implement, but there is probably a really smart way to do it, since we know the distribution
# Smallest rank in the support. Always returned as an `Int`, not as `N`.
function minimum{N}(::Zepf{N})
    return one(Int)
end
# Largest rank in the support: the number of categories, converted to `Int`.
function maximum{N}(d::Zepf{N})
    return Int(d.n)
end

# Expected rank: the harmonic sum taken with the exponent lowered by one,
# divided by the normalising constant `d.h`.
function mean{N,T1,T2}(d::Zepf{N,T1,T2})
    lowered = d.s - one(T2)
    return harmonic(d.n, lowered) / d.h
end
# Most probable rank: always the first one, returned in the rank type `N`.
function mode{N}(d::Zepf{N})
    return one(N)
end

mode (generic function with 55 methods)

In [8]:
# Cumulative probability at rank 1 for a Zepf with n = 1000 and s = 1.999,
# built from `big` literals so the arithmetic runs in arbitrary precision.
cdf(Zepf(big"1000",big"1.999"), 1)


6.079525638595357693974084878656164556746957827721866348098622511730935776786549e-01

In [9]:
"""
    ZepfStats{N<:Integer} <: SufficientStats

Statistics used to fit a `Zepf`.

# Fields
- `freqs`: counts of how many times each element occurs, sorted by rank.
- `n`: number of categories; the one-argument constructor sets it to
  `length(freqs)`.
"""
immutable ZepfStats{N<:Integer} <: SufficientStats
    freqs::Vector{N} #Counts of how many times each element occurs, sorted by rank
    n::N
end

# Convenience constructor: take the number of categories from the vector itself.
# `length` returns an `Int`, so it is converted to `N` so that the constructor
# also works when the counts are stored in another integer type.
ZepfStats{N}(freqs::Vector{N}) = ZepfStats(freqs, convert(N, length(freqs)))


"""
    suffstats(::Type{ZepfStats}, x::AbstractArray{T}) where T is an Integer

Tabulate the integer category ids in `x` with `counts` and wrap the resulting
frequency vector in a `ZepfStats`.

A warning is emitted when the frequencies are not in non-increasing order,
since the fit expects lower category ids to be the more frequent ones.
"""
function suffstats{T<:Integer}(::Type{ZepfStats}, x::AbstractArray{T})
    freqs = counts(x)
    # Check the tabulated frequencies themselves (not the `counts` function).
    issorted(freqs, rev=true) || warn("Catagory IDs are not proportional to frequency. To fit Zeph well, one expects the category freqency to be monotonically decreasing with catagory id.")
    ZepfStats(freqs)
end

suffstats (generic function with 34 methods)

In [151]:
"""
    fit_mle(ss::ZepfStats)

Estimate the exponent `s` from the frequency table in `ss`.

The objective is the sum of squared differences between the observed
log-proportions and the model log-probabilities `-s * log(k) - log(h)`, where
`h = harmonic(ss.n, s)`. It is minimised with BFGS starting from `s = 1.0`.

Returns the result object produced by `optimize`, not a `Zepf`.
"""
function fit_mle(ss::ZepfStats)
    #Switch to http://stats.stackexchange.com/questions/6780/how-to-calculate-zipfs-law-coefficient-from-a-set-of-top-frequencies

    logprobs = log.(ss.freqs ./ sum(ss.freqs))
    logranks = log.(1:ss.n)

    function objective(x)
        s = x[1]
        lognorm = log(harmonic(ss.n, s))
        return sumabs2(logprobs .+ s * logranks .+ lognorm)
    end

    return optimize(objective, [1.0], BFGS())
end

fit_mle (generic function with 54 methods)

In [152]:
# Fit a four-category frequency table in which the first category dwarfs the rest.
result=fit_mle( ZepfStats([80000000000000,200,50,1]))

Results of Optimization Algorithm
 * Algorithm: BFGS
 * Starting Point: [1.0]
 * Minimizer: [25.980415085036025]
 * Minimum: 9.202524e+01
 * Iterations: 3
 * Convergence: true
   * |x - x'| < 1.0e-32: false
   * |f(x) - f(x')| / |f(x)| < 1.0e-32: false
   * |g(x)| < 1.0e-08: true
   * Reached Maximum Number of Iterations: false
 * Objective Function Calls: 13
 * Gradient Calls: 13

In [117]:
# Fit a ten-category frequency table with steadily decreasing counts.
result=fit_mle( ZepfStats([26486, 12053, 5052, 3033, 2536, 2391, 1444, 1220, 1152, 1039]))

Results of Optimization Algorithm
 * Algorithm: BFGS
 * Starting Point: [1.0]
 * Minimizer: [1.4639261451126586]
 * Minimum: 1.346248e-01
 * Iterations: 5
 * Convergence: true
   * |x - x'| < 1.0e-32: false
   * |f(x) - f(x')| / |f(x)| < 1.0e-32: false
   * |g(x)| < 1.0e-08: true
   * Reached Maximum Number of Iterations: false
 * Objective Function Calls: 22
 * Gradient Calls: 22

1-element Array{Float64,1}:
 -1.20458

In [12]:
using Optim
# Toy objective for trying out the optimiser: the quadratic v^2 - v - 2
# evaluated at the first element of `x`.
function foo(x)
    v = x[1]
    return v^2 - v - 2
end
# Minimise `foo` with BFGS, starting from x = 1.0.
result = optimize(foo, [1.0], BFGS())

Results of Optimization Algorithm
 * Algorithm: BFGS
 * Starting Point: [1.0]
 * Minimizer: [0.4999999999973902]
 * Minimum: -2.250000e+00
 * Iterations: 1
 * Convergence: true
   * |x - x'| < 1.0e-32: false
   * |f(x) - f(x')| / |f(x)| < 1.0e-32: false
   * |g(x)| < 1.0e-08: true
   * Reached Maximum Number of Iterations: false
 * Objective Function Calls: 4
 * Gradient Calls: 4