In [31]:
using Pkg
Pkg.instantiate()
using HTTP, JSON, JLD, Roots, PrettyTables, DotEnv, Dates
DEBUG_LEVEL = 1
using Optim, Random
Random.seed!(0)
using BlackBoxOptim, Distributions, ForwardDiff, Integrals, Roots, StatsPlots, DelimitedFiles
cfg = DotEnv.config("../.env")
files_path = cfg["files_path"]

"/home/peters/code/mapinator/estimation/current_estimates_and_files/"

# Version 2 Changes

The second round is dropped in this version.  The unmatched are assumed to be all those in ocean and crow - reasoning is that they were matched with institutions that didn't participate in the international job market (because the never posted a job on econjobmarket and never graduated a student who registered with econjobmarket). 

Change in the way $\alpha$  is computed.  The assumption that failed trades are recorded in ocean and crow means that the share 
$\alpha$  of the market can be estimated by using the recorded number of outcomes for a tier in the adjusted adjacency matrix divided by the estimated level of demand. More accurately the proportion of the market that could be filled by type $i$  applicants is
$$
\alpha_i = \frac{m_i}{n}
$$

Change in the way $\rho$ is computed.  Sometimes it seems using more variables just seems to give the algorithm a wierd direction to go in.  To avoid that, the $\rho_i$ are now given by the proportion of all hires that were made by each tier.  Not perfect.

The solution vector now has 27 elements 4 value ratios, 1 demand estimate, 11 estimated means and 1 estimated variances..  The parameters estimated for the means and variances are the mean and variance of a standard normal distribution.  This distribution is truncated so that all the values in its support lie in $[0,1]$.  This means that the mean and variance for each hiring tier nneed to be recomputed to get the mean and variance of the trucated distribution itself.

Adjacency table is taken from the table created by scaling up each tier is the actual adjacency matrix according to the coverage rates for each tier.(mike_adjust_adjaceny.ipynb).  The adjusted adjacency matrix used here is created by shifting the 10th row to the bottom of the matrix to move the ocs outcomes to the bottom.

In [2]:
# change associated with ocs as failed to trade group
raw_placement_rates = load(files_path*"adjusted_placement_rates.jld")["adjusted_placement_rates"]
unmatched_row_index = 10
adjusted_placement_rates = 
vcat(raw_placement_rates[1:unmatched_row_index-1,:],
raw_placement_rates[unmatched_row_index+1:size(raw_placement_rates)[1],:])
# row names for reporting results

#adjusted_placement_rates = load(files_path*"adjusted_placement_rates.jld")["adjusted_placement_rates"]
# number of academic types
NUMBER_OF_TYPES = size(adjusted_placement_rates, 2)
# add unmatched to make and additional row
adjusted_placement_rates = vcat(adjusted_placement_rates, 
    raw_placement_rates[unmatched_row_index:unmatched_row_index, :])
# number of rows in the adj matrix
numtotal = size(adjusted_placement_rates, 1)

12

In [34]:
# this hack gets beliefs rho by setting it equal to the proportion of all placements
# used in estimation
# notice it uses only the first K-1 of the rows of the adjusted adjacency matrix.
function get_rho(adjusted_placement_rates)
    a = sum(adjusted_placement_rates, dims = 2)
    b = zeros(size(a)[1] - 1)
    for i in 1:size(a)[1]-1
        b[i] = a[i]
    end
    #experiment with rho
    B = sum(b)
    return b = b./B
end

get_rho (generic function with 1 method)

In [4]:
#setup stuff
k = NUMBER_OF_TYPES
# number of recruiter types, note this is one less that the number of rows in the adjacency matrix
K = numtotal-1
## sample size for computed expectations
M = sum(adjusted_placement_rates)
# need the column sums to calculate \alpha
workers = sum(adjusted_placement_rates, dims= 1)
#maximum likelihood  true for ML false for chi-squared
L = true
#chi-squared
#L = false

# upper bound on the value ratios, which should all be less than 1
# if any ratio turns out to be 1.0 or close to it at optimality, this could indicate that a lower tier has a higher value than a higher one
# these bounds aren't actually used since the optimizer wouldn't accept abstract argument.  
# They are copied into hard coded text instead.
search_range = Tuple[]
for i in 1:k-1
    push!(search_range, (0.0, 1.0))
end

# upper bound on variables proportionate to alpha
push!(search_range, (Float64(sum(workers)), Inf))

# upper bound on the mu parameter of truncated normal, which is strictly within [0, 1] as the mean is greater than mu in truncated normal

for i in k+1:k+K
    push!(search_range , (.1,1.0))
end

# upper bound on the sigma parameter of truncated normal
for i in k+K+1:k+2K
    push!(search_range , (.1,5.0))
end

# these are parameters that generate beliefs, not the beliefs themselves
#for i in k+2K+1:k+3K
#   push!(search_range , (.1,1.))
#end


In [5]:
# this is needed to produce a hard coded array of ranges and is used in the optimizer below
println(search_range)
size(search_range)

Tuple[(0.0, 1.0), (0.0, 1.0), (0.0, 1.0), (0.0, 1.0), (27489.0, Inf), (0.1, 1.0), (0.1, 1.0), (0.1, 1.0), (0.1, 1.0), (0.1, 1.0), (0.1, 1.0), (0.1, 1.0), (0.1, 1.0), (0.1, 1.0), (0.1, 1.0), (0.1, 1.0), (0.1, 5.0), (0.1, 5.0), (0.1, 5.0), (0.1, 5.0), (0.1, 5.0), (0.1, 5.0), (0.1, 5.0), (0.1, 5.0), (0.1, 5.0), (0.1, 5.0), (0.1, 5.0)]


(27,)

In [6]:
# these are the core theoretical functions
function F(x, ρ, μ, σ, K)
    return sum([ρ[i] * cdf(truncated(Normal(μ[i], σ[i]), 0, 1), x) for i in 1:K])
end

function f(x, ρ, μ, σ, K)
    return sum([ρ[i] * pdf(truncated(Normal(μ[i], σ[i]), 0, 1), x) for i in 1:K])
end

function G(Fim1, Fx, αsum)
    return (Fim1 - Fx) / αsum
end

function κ(i, t, v_rel)
    return sum([-log(v_rel[j]) for j in t:i-1])
end

function fi(x, μ, σ)
    return pdf(truncated(Normal(μ, σ), 0, 1), x)
end

fi (generic function with 1 method)

In [7]:
function q(i, t, Fx_vec, x_vec, ρ, μ, σ, α, v_rel, k, K)
    # Fx_vec = [F(x0)=1, F(x1), F(x2), F(x3), ..., F(xk-1)]
    # x_vec = [x0 = 1, x1, x2, x3, ..., xk = 0]
    # x_vec[s] = x_{s-1}, so the limits of integration are and must be offset by 1 below
    # TODO: can some integrals be cached as a speed-up? can some integrals be computed in parallel?
    return sum([(α[t]/sum(α[1:s])) * 
            solve(IntegralProblem{false}((x, p) -> exp(-(G(Fx_vec[s], F(x, ρ, μ, σ, K), sum(α[1:s])) + κ(s, t, v_rel))) *
                    fi(x, μ[i], σ[i]), x_vec[s+1], x_vec[s]), HCubatureJL())[1] for s in t:k])
end

function Fx(t, α, v_rel)
        return 1 - sum([-log(v_rel[j])*sum(α[1:j]) for j in 1:t])
end

function Q2(ratio, β)
    return β * (1 - exp(-ratio)) / ratio
end

function pi(t, α)
    return α[t] / sum(α[1:t])
end

pi (generic function with 1 method)

In [40]:
"""
Likelihood
    p_vec -> k values,k alpha shares, K means, K variances, K beliefs
    example -  for 5 academic tiers and six hiring tiers p_vec has 43 elements
    - it assumes the last row of the adjacency matrix is for unmatched so the placement matrix in the example has dimension 12x5 not 11x5
    placements -> the adjacency matrix with the unmatched row moved to the bottom
    k -> number of academic tiers
    K -> rows in the adjacency matrix - last row is treaed as unmatched
    L -> true for likelihood, false for chi-squared maximization
"""
function estimate_likelihood(p_vec, placements, k, K, M, L, verify)
    #values
    v_rel = p_vec[1:k-1]
    #println(v_rel)
    #shares
    α = []
    for i in 1:k
        push!(α, workers[i]/p_vec[k])
    end
    #means
    μ = p_vec[k+1:k+K]
    #variances
    #println(μ)
    σ = p_vec[k+K+1:k+2K]
    #println(σ)
    #setting rho exogenously
    ρ = get_rho(placements)
    Fx_vec = ones(k)
    x_vec = ones(k+1)
    x_vec[k+1] = 0.0
    for t in 1:k-1
        Fx_vec_candidate = Fx(t, α, v_rel)
        #println(v_rel[t], " ", -log(v_rel[t])*sum(α[1:t]))
        if Fx_vec_candidate <= 0.0
            Fx_vec[t+1:k] .= 0.0
            x_vec[t+1:k] .= 0.0
            break
        end
        Fx_vec[t+1] = Fx_vec_candidate
        if  Fx_vec[t+1] > 0 && Fx_vec[t+1] < 1
            x_vec[t+1] = find_zero(x -> F(x, ρ, μ, σ, K) - Fx_vec[t+1], 0.5)
        else
            x_vec[t+1] = 1
        end
    end    
    objective = 0.0
    o = 0.0
    likelihood = 0.0
    # a sanity checker
    normalizer = zeros(K)
    # recode
    # 1-normalizer is now the probabiity a placement should end up in ocs  
    q_it = zeros(K, k)
    for i in 1:K, t in 1:k
        prob = q(i, t, Fx_vec, x_vec, ρ, μ, σ, α, v_rel, k, K)
        q_it[i, t] = prob
        normalizer[i] +=  prob
    end
    #normalizer is the probability each recruiter type is placed, it should be less than 1
    #check by rerunning this function with the optimal solution and set verify = to true
    if verify
        println(normalizer)
    end
    for i in 1:K, t in 1:k 
        expectation = M * ρ[i] * q_it[i, t] + .5
        #println(expectation, " ", placements[i,t], " ",expectation-placements[i,t])
        if L
            # likelihood
            objective += (placements[i, t] + 1) * log(ρ[i] * q_it[i, t]/normalizer[i])
            objective -= log(factorial(big(placements[i, t])))
        else
            objective += (placements[i, t] - expectation) ^ 2 / expectation
        end
    end
    ## this next bit is for the market failures, it is just commented out
    ## so only placements are used to compute likelihood
    if L
        return Float64(-objective)
    else 
        return Float64(objective)
    end
end


estimate_likelihood

In [44]:
# a sample solution for testing purposes
v = [.68, .75, .75, .75] 
num = sum(workers)
mu = [.88,.5, .4, .2, .05, .3, .8, .1, .3, .1, .1]
sigma = [.3,.4, .5,.6,.7,.8,1,1.1,1.3,.1,.9]
trial = vcat(v,num,mu,sigma)
soln = estimate_likelihood(s, adjusted_placement_rates, k, K, M, true, true)

[0.502585890445317, 0.4261705082459622, 0.389590180530203, 0.25620504373282965, 0.17826277002229263, 0.3985442806815523, 0.40174460118160094, 0.2791644267605973, 0.21199141790006967, 0.37720291427281694, 0.2797195824694928]


202023.6208742913

In [45]:

# all lower bounds are zero as these should be positive parameters
# can swap estimate_likelihood for chi_square - change the last argument to 
# true for maximum likelihood, false for chi-sqared estimation
println(k, " ",K," ",M)
sol_res = bboptimize(p -> estimate_likelihood(p, adjusted_placement_rates, k, K, M, true, false),
    SearchRange = [(0.0, 1.0), (0.0, 1.0), (0.0, 1.0), (0.0, 1.0), (27489.0, 30000), 
        (0.1, 1.0), (0.1, 1.0), (0.1, 1.0), (0.1, 1.0), (0.1, 1.0), (0.1, 1.0), (0.1, 1.0), 
        (0.1, 1.0), (0.1, 1.0), (0.1, 1.0), (0.1, 1.0), (0.1, 5.0), (0.1, 5.0), (0.1, 5.0), 
        (0.1, 5.0), (0.1, 5.0), (0.1, 5.0), (0.1, 5.0), (0.1, 5.0), (0.1, 5.0), (0.1, 5.0),
        (0.1, 5.0)], 
    MaxFuncEvals = 100000, TraceInterval = 5)
sol = best_candidate(sol_res)

5 11 27489
Starting optimization with optimizer DiffEvoOpt{FitPopulation{Float64}, RadiusLimitedSelector, BlackBoxOptim.AdaptiveDiffEvoRandBin{3}, RandomBound{ContinuousRectSearchSpace}}
0.00 secs, 0 evals, 0 steps
5.00 secs, 1053 evals, 979 steps, improv/step: 0.526 (last = 0.5260), fitness=204166.096224573
10.01 secs, 2065 evals, 1991 steps, improv/step: 0.408 (last = 0.2935), fitness=203885.976180250
15.01 secs, 3053 evals, 2979 steps, improv/step: 0.360 (last = 0.2642), fitness=203488.770660444
20.01 secs, 4028 evals, 3954 steps, improv/step: 0.324 (last = 0.2144), fitness=203216.500468186
25.01 secs, 4992 evals, 4918 steps, improv/step: 0.296 (last = 0.1815), fitness=203097.525303934
30.02 secs, 5962 evals, 5888 steps, improv/step: 0.278 (last = 0.1856), fitness=202990.911688207
35.03 secs, 6908 evals, 6834 steps, improv/step: 0.264 (last = 0.1755), fitness=202972.685628048
40.03 secs, 7875 evals, 7801 steps, improv/step: 0.248 (last = 0.1324), fitness=202715.799654822
45.03 secs,

27-element Vector{Float64}:
     0.6058087089729725
     0.7610693549628851
     0.717100191084767
     0.7095512542139258
 29999.99999993654
     0.9999999996066106
     0.6281495027408868
     0.5517704157777589
     0.35946838157843275
     0.10000000002760376
     0.5753919816272435
     0.7030706437832919
     0.2651779400444991
     ⋮
     0.3125260801170404
     0.35662476274389476
     0.19963216973144735
     0.1504438275326322
     0.10000000000204336
     0.10000000000882125
     0.21787500525208914
     0.5051533993365251
     0.20040867659872774
     0.10000000032310519
     0.301447993865664
     0.15827163057786978

In [68]:
"""
    sol_res -> created by by bboptimize
    placements ->   the adjusted placements matrix including unmatched (i.e, K rows) 
        dimensions are computed from matrix sizescomputer from the size of this matrix
    workers ->  the actual number of placements by workers in each tier. 
    This number divided by estimated demand is \alpha in the theory
"""
function print_solution(sol_res, placements, workers) 
    k = size(placements)[2]
    K = size(placements)[1]-1
    M = sum(placements)
    values_table = zeros(2,k)
   
    sol = best_candidate(sol_res)
    println("fitness = ", best_fitness(sol_res))
    println()
    println("Estimated Demand: ", sol[k])
    println()
    println("Values Table:")
    head1 = []
    row1 = ["Values", "Alpha"]
    push!(head1, "  Tier 1: ")
    start = 1
    values_table[1,1] = start
    values_table[2,1] = workers[1]/sol[k]
    for i in 1:k-1
        start = start*sol[i]
        push!(head1, "  Tier "* string(1+i)*":")
        values_table[1, i+1] = start
        values_table[2, i+1] = workers[i+1]/sol[k]
    end
    pretty_table(values_table, header = head1, row_labels=row1, backend = Val(:text))
    
    ρ = get_rho(placements)
    hiring_table = zeros(K,5)
    head2 = ["Mean", "Variance", "Mean (Parameter)", "Variance(Parameter)", "Beliefs"]
    row_names = load(files_path*"row_names.jld")["names"]
    row2 = []
    for i in 1:size(row_names)[1]
        if i == 10
            continue
        end
        push!(row2, row_names[i])
    end

    for i in 1:K
        hiring_table[i,1] = mean(truncated(Normal(sol[k+i],sol[k+K+i]), 0, 1))
        hiring_table[i,2] = var(truncated(Normal(sol[k+i],sol[k+K+i]), 0, 1))
        hiring_table[i,3] = sol[k+i]
        hiring_table[i,4] = sol[k+K+i]
        hiring_table[i,5] = ρ[i]
    end
    pretty_table(hiring_table, header = head2, row_labels=row2, backend = Val(:text))
   return values_table, hiring_table, head1, row1, head2, row2
end

print_solution

In [69]:
values_table, hiring_table, head1, row1, head2, row2 = print_solution(sol_res,adjusted_placement_rates, workers);

fitness = 202262.7882068646

Estimated Demand: 29999.99999993654

Values Table:
┌────────┬────────────┬───────────┬───────────┬───────────┬───────────┐
│[1m        [0m│[1m   Tier 1:  [0m│[1m   Tier 2: [0m│[1m   Tier 3: [0m│[1m   Tier 4: [0m│[1m   Tier 5: [0m│
├────────┼────────────┼───────────┼───────────┼───────────┼───────────┤
│[1m Values [0m│        1.0 │  0.605809 │  0.461062 │  0.330628 │  0.234597 │
│[1m  Alpha [0m│   0.227233 │    0.2585 │    0.2342 │    0.1296 │ 0.0667667 │
└────────┴────────────┴───────────┴───────────┴───────────┴───────────┘
┌───────────────────────────────────┬──────────┬────────────┬──────────────────┬─────────────────────┬───────────┐
│[1m                                   [0m│[1m     Mean [0m│[1m   Variance [0m│[1m Mean (Parameter) [0m│[1m Variance(Parameter) [0m│[1m   Beliefs [0m│
├───────────────────────────────────┼──────────┼────────────┼──────────────────┼─────────────────────┼───────────┤
│[1m                 TYPE 1 (2

In [70]:
all_data = []
push!(all_data, values_table);
push!(all_data, hiring_table);

## Saving Data
For use in other worksheets, the estimation results are save.  They come as three files.

1. current_estimates.jld - this is a file that contains a vector of dimension $k+2K$.  The first $k-1$ entries are relative values, 
$\frac{v_2}{v_1}$  The actual values are derived from these ratios by assigning $v_1$ the value 1.  The $k^th$ value is estimated demand.  The next $K$ values are parameters that are used to derive the distributions of offer values for each tier.  This first $K$ represent the means of normal distributions which are then truncated so that the distribution fits on $[0,1]$.  The last $K$ elements are the variances of these distributions.

2. all_data.jld  All the actual values as show in the two tables above.  The values table (as used above) is the matrix all_data[1] while the hiring table (as constructed in the print funtion) is in all_data[2]


In [75]:
#save the bboptimize solution vector
s = best_candidate(sol_res);
save(files_path*"current_estimates.jld", "s", s)
save(files_path*"all_data.jld", "all_data", all_data)

In [72]:
# separately save the values and hiring tables since they are derived from the solution
open(files_path*"current_estimates_values.tex", "w") do f
pretty_table(
    f,
    all_data[1],
    header = head1,
    row_labels = row1,
    backend = Val(:latex)
    )
end

open(files_path*"current_estimates_hiring.tex", "w") do f
pretty_table(
    f,
    all_data[2],
    header = head2,
    row_labels = row2,
    backend = Val(:latex)
    )
end

In [17]:
# https://github.com/JuliaPlots/StatsPlots.jl/blob/master/README.md
# https://docs.juliaplots.org/latest/tutorial/

select_type = 1
cdfs = plot(truncated(Normal(sol[2k-1+select_type], sol[2k-1+select_type+K]), 0, 1), func = cdf, title = "CDFs of Types", label = "Type 1")
for select_type in 2:NUMBER_OF_TYPES # academic types
    plot!(cdfs, truncated(Normal(sol[2k-1+select_type], sol[2k-1+select_type+K]), 0, 1), func = cdf, label = string("Type ", select_type))
end

for select_type in k+1:K # sinks
    plot!(cdfs, truncated(Normal(sol[2k-1+select_type], sol[2k-1+select_type+K]), 0, 1), func = cdf, label = string("Sink ", select_type - k))
end
xlabel!(cdfs, "offer value")
ylabel!(cdfs, "F(offer value)")
savefig(cdfs, "cdfs.png")
cdfs

LoadError: BoundsError: attempt to access 27-element Vector{Float64} at index [28]

In [19]:
select_type = 1
pdfs = plot(truncated(Normal(sol[2k-1+select_type], sol[2k-1+select_type+K]), 0, 1), func = pdf, title = "PDFs of Types", label = "Type 1")
for select_type in 2:k # academic types
    plot!(pdfs, truncated(Normal(sol[2k-1+select_type], sol[2k-1+select_type+K]), 0, 1), func = pdf, 
        label = string("Type ", select_type))
end

for select_type in k+1:K # sinks
    plot!(pdfs, truncated(Normal(sol[2k-1+select_type], sol[2k-1+select_type+K]), 0, 1), func = pdf, label = string("Sink ", select_type - k))
end
xlabel!(pdfs, "offer value")
ylabel!(pdfs, "f(offer value)")
savefig(pdfs, "pdfs.png")

LoadError: BoundsError: attempt to access 27-element Vector{Float64} at index [28]