# Worksheet 1: Assemble all the data

This worksheet assembles all the data used in the classification by calling the public API at https://support.econjobmarket.org.

By default the data is saved and can be reloaded from disk for later estimation.  This ensures that a fixed data set is used during all estimation.  The data should be reloaded from the api when a new estimation round is being done.

Yet to be done: generate labels that identify the data set that was used for each step of the estimation.

This is the first in a series of files that can be run to get both the classification and the value estimates.

In [1]:
using Dates,JLD, DotEnv


In [2]:
# only need to run this to refresh data
# all the estimation stuff relies on files that are saved here
using HTTP, JSON, PrettyTables, DotEnv
# Verbosity switch: the later cells print their diagnostic summaries only when this is > 0.
DEBUG_LEVEL = 1
"""
    matches(keywords, phrase)

Return `true` when at least one element of `keywords` occurs in `phrase` (tested with
`occursin`) and `false` otherwise, including when `keywords` is empty.
"""
function matches(keywords, phrase)
    return any(needle -> occursin(needle, phrase), keywords)
end
# Read the settings from the file at the relative path "../.env".
cfg = DotEnv.config("../.env")
# Location for the files written by this worksheet. It is used as a plain string prefix
# (`files_path*"name"`), so it presumably has to end with a path separator — TODO confirm.
files_path = cfg["files_path"]
## set the algorithm run id to try to ensure that the saved estimation files are all consistent
# NOTE(review): `algorithm_run_id` is not referenced anywhere else in the cells shown here.
algorithm_run_id = 5

5

In [3]:
# Download the full placement list from the EconJobMarket "mapinator" endpoint and decode
# the JSON body into Julia containers.
#
# The value of the `try` expression is assigned to `placements` directly. Assigning to the
# global from inside the `try` body only reaches the global under the interactive
# (notebook / REPL) soft-scope rule; run as a script the assignment would create a local
# and `placements` would stay `nothing`.
#
# NOTE(review): `timeout = 120` is passed to `HTTP.get` unchanged — confirm that the
# installed HTTP.jl version honours this keyword.
placements = try
    response = HTTP.get("https://support.econjobmarket.org/api/mapinator", timeout = 120)
    JSON.parse(String(response.body))
catch e
    # Stop here: every later cell depends on `placements`.
    error("Failed to retrieve data from the API: $e")
end

29959-element Vector{Any}:
 Dict{String, Any}("disappeared" => 0, "to_shortname" => "Bates White", "created_at" => "2024-06-06 11:33:28", "to_name" => "Bates White", "to_department" => "All departments", "to_latitude" => 38.9030567, "name" => "Econometrics", "category_id" => 2, "from_shortname" => "Economics, UCLA", "to_oid" => 3070…)
 Dict{String, Any}("disappeared" => 1, "to_shortname" => "Ocean and Crow Studios Inc", "created_at" => "2024-06-27 10:14:19", "to_name" => "Ocean and Crow Studios Inc", "to_department" => "All departments", "to_latitude" => 49.2714425, "name" => "Health; Education; Welfare", "category_id" => 20, "from_shortname" => "Economics, UCLA", "to_oid" => 893…)
 Dict{String, Any}("disappeared" => 0, "to_shortname" => "App Econ, U Autònoma Barcelona", "created_at" => "2024-07-04 19:30:12", "to_name" => "Universitat Autònoma de Barcelona", "to_department" => "Department of Applied Economics", "to_latitude" => 41.50174815758906, "name" => "Labor; Demographic Economics

In [4]:
##  this version incorporates all the ocean and crow outcomes
# Index the raw placement records by applicant id ("aid"):
#   applicant_outcomes  maps each aid to the list of that applicant's outcomes
#   applicant_ids       is the set of distinct aids
applicant_outcomes = Dict{Any, Vector}()
applicant_ids = Set{Any}()

for outcome in placements
    push!(applicant_ids, outcome["aid"])
    # The callable form of `get!` builds the empty list only when the aid is new.
    push!(get!(Vector{Any}, applicant_outcomes, outcome["aid"]), outcome)
end

# Number of outcomes whose destination organisation id ("to_oid") is 893, reported below
# as Ocean and Crow. Computed with `count` rather than a counter incremented inside the
# top-level loop, which only works under the notebook/REPL soft-scope rule.
num_outcomes_selected = count(outcome -> outcome["to_oid"] == 893, placements)

if DEBUG_LEVEL > 0
    println("  ", length(placements), " total placement outcomes")
    println("  ", num_outcomes_selected, " outcomes at Ocean and Crow")
    println()
    println("  ", length(applicant_ids), " total applicants with placements")
end

  29959 total placement outcomes
  4548 outcomes at Ocean and Crow

  21249 total applicants with placements


In [5]:
# take out placements where ocean and crow is the graduating institution
# take out some other bad stuff
#
# Reduce every applicant's list of outcomes to one outcome, stored in
# `finalized_applicant_outcomes` under the applicant id:
#   1. outcomes whose "from_institution_id" equals "754" are skipped and counted;
#   2. among the rest, the outcome with the smallest "startdate" is kept;
#   3. when two outcomes share a "startdate", the "position_name" rules below decide.
# NOTE(review): `postdoc_counter` is initialised here but never updated in this cell.
postdoc_counter = 0
fake_aid_counter = 0
finalized_applicant_outcomes = Dict{Any, Any}()

for applicant_id in keys(applicant_outcomes)
    for outcome in applicant_outcomes[applicant_id]
        # if you wish to display postdocs in the sinks, remove the if statement condition 
        #   and set Post-Doc to have higher priority than Assistant Professor below
        # alternatively, to only include postdocs in the sinks that did not receive professorships,
        #   do not alter the below code, and instead conduct a second pass 
        #   to fill in postdoc outcomes for individuals with no professorships
        # NOTE(review): the condition below tests the graduating institution only; no
        #   postdoc-specific condition is visible here, so the note above may be out of date.
        # The id is compared as the string "754" — presumably the API returns this field
        #   as text; verify against the payload.
        if outcome["from_institution_id"] == "754"
            # counts skipped outcomes (one applicant may contribute several)
            fake_aid_counter += 1
        else
            if !haskey(finalized_applicant_outcomes, applicant_id)
                # just add the outcome if the applicant doesn't have any yet
                finalized_applicant_outcomes[applicant_id] = outcome
            else
                # otherwise, the applicant does have at least one other outcome
                # "startdate" values are compared with `<` as returned by the API —
                #   assumes a year-first text format so that text order is date order; TODO confirm
                if outcome["startdate"] < finalized_applicant_outcomes[applicant_id]["startdate"]
                    # take the earliest outcome of the two and ignore the other
                    finalized_applicant_outcomes[applicant_id] = outcome
                elseif outcome["startdate"] == finalized_applicant_outcomes[applicant_id]["startdate"]
                    # sometimes we may have multiple outcomes that started on the same date - follow priority listing
                    # NOTE(review): an "Assistant Professor" or "Post-Doc" outcome always replaces
                    #   the stored one, so on a tie a Post-Doc can displace an Assistant Professor
                    #   and the result depends on the order of the outcomes — confirm this is intended.
                    if outcome["position_name"] in ["Assistant Professor", "Post-Doc"]
                        finalized_applicant_outcomes[applicant_id] = outcome
                    # a "Consultant" replaces anything except a stored "Assistant Professor"
                    elseif outcome["position_name"] in ["Consultant"] && !(finalized_applicant_outcomes[applicant_id]["position_name"] in ["Assistant Professor"])
                        finalized_applicant_outcomes[applicant_id] = outcome
                    # "Other Academic" / "Other Non-Academic" replace anything except a stored
                    #   "Assistant Professor" or "Consultant"; any other position name never
                    #   replaces the stored outcome on a tie
                    elseif outcome["position_name"] in ["Other Academic", "Other Non-Academic"] && !(finalized_applicant_outcomes[applicant_id]["position_name"] in ["Assistant Professor", "Consultant"])
                        finalized_applicant_outcomes[applicant_id] = outcome
                    end
                end
            end
        
        end
    end
end

if DEBUG_LEVEL > 0
    # NOTE(review): the first figure is the number of applicants (keys of
    #   `applicant_outcomes`) although it is labelled "outcomes", and `fake_aid_counter`
    #   counts skipped outcomes although it is labelled "applicants".
    println(length(applicant_outcomes), " outcomes in all")
    println("  -", fake_aid_counter, " applicants whose graduating institution can't be detected")
    println("  ", length(finalized_applicant_outcomes), " total applicants ported to finalized collection")
end

21249 outcomes in all
  -24 applicants whose graduating institution can't be detected
  21226 total applicants ported to finalized collection


In [6]:
#finalized_applicant_outcomes

In [7]:
# Group the selected outcomes by start year (year => Dict of applicant id => outcome),
# first discarding every applicant whose "startdate" contains one of the strings in
# `remove_years`.
sorted_by_year = Dict{Any, Dict}()

# Year strings to exclude. The list is currently empty, so nothing is removed; copy
# entries from the commented list below to drop those years.
remove_years = []
# "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010", 
#"2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019",
# "2021", "2022", "2023", "2024", "2025", "2026"

# Count, then drop, the excluded applicants. `count` and `filter!` replace a counter that
# was incremented inside a top-level loop, which only works under the notebook/REPL
# soft-scope rule and fails when the code is run as a script.
removed_year_placed = count(
    outcome -> matches(remove_years, outcome["startdate"]),
    values(finalized_applicant_outcomes),
)
filter!(entry -> !matches(remove_years, last(entry)["startdate"]), finalized_applicant_outcomes)

# The year is the text before the first "-" of "startdate", parsed as an integer.
for (applicant_id, outcome) in finalized_applicant_outcomes
    push!(get!(sorted_by_year, parse(Int, split(outcome["startdate"], "-")[1]), Dict()), applicant_id => outcome)
end

if DEBUG_LEVEL > 0
    # The years named in the first message are fixed text; they do not follow `remove_years`.
    println("  -", removed_year_placed, " applicants removed due to placement in 2022/2023/2024")
    println("  ", length(finalized_applicant_outcomes), " applicants remaining after year corrections")
    println()
end

#for key in sort(collect(keys(sorted_by_year)))
#    println("Year ", key, " has ", length(sorted_by_year[key]), " placement outcomes")
#end

  -0 applicants removed due to placement in 2022/2023/2024
  21226 applicants remaining after year corrections



In [8]:
# Keep only the applicants whose selected outcome has one of the position names in
# `valid_labels`; everyone else is removed from `finalized_applicant_outcomes`.
# "Lecturer" is currently part of the set; remove it here to exclude lecturers.
valid_labels = Set(["Assistant Professor", "Consultant", "Other Academic", "Other Non-Academic", "Lecturer", "Post-Doc"])

# Position names that are about to be removed (collected for the report below).
removed_labels = Set()
for outcome in values(finalized_applicant_outcomes)
    if !(outcome["position_name"] in valid_labels)
        push!(removed_labels, outcome["position_name"])
    end
end

# Count, then drop, the applicants with a non-valid position. `count` and `filter!`
# replace a counter incremented inside a top-level loop, which only works under the
# notebook/REPL soft-scope rule.
irrelevant_counter = count(
    outcome -> !(outcome["position_name"] in valid_labels),
    values(finalized_applicant_outcomes),
)
filter!(entry -> last(entry)["position_name"] in valid_labels, finalized_applicant_outcomes)

# Tally of the remaining applicants per position name. Built unconditionally: it used to be
# created only inside the debug branch while being printed outside of it, so running with
# DEBUG_LEVEL = 0 raised an UndefVarError.
maintained_labels = Dict{Any, Int}()
for outcome in values(finalized_applicant_outcomes)
    position_name = outcome["position_name"]
    maintained_labels[position_name] = get(maintained_labels, position_name, 0) + 1
end

if DEBUG_LEVEL > 0
    println("  -", irrelevant_counter, " irrelevant applicants removed from the following classes of positions:")
    println(removed_labels)
    println("  ", length(finalized_applicant_outcomes), " applicants remaining after irrelevant-position applicants removed:")
    # The tally completes the line above, so it is printed as part of the debug report.
    println(maintained_labels, " ", sum(values(maintained_labels)), " total")
end

  -1079 irrelevant applicants removed from the following classes of positions:
Set(Any["Tenured Professor", "Untenured Professor", "Associate Professor", "Assistant, Associate or Full Professor", "Professor Any Level", "Temporary Lecturer", "Full Professor", "Visiting Professor/Lecturer/Instructor", "Assistant or Associate Professor"])
  20147 applicants remaining after irrelevant-position applicants removed:
Dict{Any, Int64}("Other Non-Academic" => 3926, "Consultant" => 653, "Other Academic" => 1496, "Post-Doc" => 3722, "Assistant Professor" => 9378, "Lecturer" => 972) 20147 total


In [9]:
# Rebuild the per-year grouping (year => Dict of applicant id => outcome) from the current
# contents of `finalized_applicant_outcomes`, first discarding every applicant whose
# "startdate" contains one of the strings in `remove_years`.
sorted_by_year = Dict{Any, Dict}()

# Year strings to exclude. The list is currently empty, so nothing is removed; copy
# entries from the commented list below to drop those years.
remove_years = []
# "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019",
# "2021", "2022", "2023", "2024", "2025", "2026"

# Count, then drop, the excluded applicants. `count` and `filter!` replace a counter that
# was incremented inside a top-level loop, which only works under the notebook/REPL
# soft-scope rule and fails when the code is run as a script.
removed_year_placed = count(
    outcome -> matches(remove_years, outcome["startdate"]),
    values(finalized_applicant_outcomes),
)
filter!(entry -> !matches(remove_years, last(entry)["startdate"]), finalized_applicant_outcomes)

# The year is the text before the first "-" of "startdate", parsed as an integer.
for (applicant_id, outcome) in finalized_applicant_outcomes
    push!(get!(sorted_by_year, parse(Int, split(outcome["startdate"], "-")[1]), Dict()), applicant_id => outcome)
end

if DEBUG_LEVEL > 0
    # The years named in the first message are fixed text; they do not follow `remove_years`.
    println("  -", removed_year_placed, " applicants removed due to placement in 2022/2023/2024")
    println("  ", length(finalized_applicant_outcomes), " applicants remaining after year corrections")
    println()
end

#for key in sort(collect(keys(sorted_by_year)))
#    println("Year ", key, " has ", length(sorted_by_year[key]), " placement outcomes")
#end

  -0 applicants removed due to placement in 2022/2023/2024
  20147 applicants remaining after year corrections



In [10]:
# Tabulate how many selected placements fall in each start year, show the table as LaTeX
# in the notebook and save the same table as "placements_by_year.tex" under `files_path`.

# Start years in ascending order.
year_labels = Any[year_key for year_key in sort(collect(keys(sorted_by_year)))]
# Start year => number of applicants grouped under that year.
placements_by_year = Dict{Any, Any}(
    year_key => length(sorted_by_year[year_key]) for year_key in year_labels
)
#println(year_labels)
pretty_table(placements_by_year; sortkeys = true, backend = Val(:latex))
open(files_path*"placements_by_year.tex", "w") do tex_file
    pretty_table(tex_file, placements_by_year; sortkeys = true, backend = Val(:latex))
end

\begin{tabular}{rr}
  \hline
  \textbf{Keys} & \textbf{Values} \\
  \texttt{Any} & \texttt{Any} \\\hline
  2001 & 942 \\
  2003 & 36 \\
  2004 & 30 \\
  2005 & 275 \\
  2006 & 33 \\
  2007 & 227 \\
  2008 & 168 \\
  2009 & 361 \\
  2010 & 377 \\
  2011 & 453 \\
  2012 & 517 \\
  2013 & 559 \\
  2014 & 639 \\
  2015 & 1092 \\
  2016 & 2106 \\
  2017 & 1362 \\
  2018 & 1419 \\
  2019 & 1870 \\
  2020 & 1879 \\
  2021 & 1827 \\
  2022 & 1716 \\
  2023 & 1716 \\
  2024 & 530 \\
  2025 & 13 \\\hline
\end{tabular}


In [11]:
# Load the definitions from the project file "functions/type_allocation_flexible.jl".
include("functions/type_allocation_flexible.jl")
# Start years that have at least one selected placement.
included_years = keys(sorted_by_year)
# auto-detect year interval; change this to select the years of data to include in the estimation
# (runs from the smallest to the largest year present in `sorted_by_year`)
YEAR_INTERVAL = (:)(extrema(included_years)...)

2001:2025

In [12]:
# Copy of `sorted_by_year` whose keys are the years written as strings; the per-year
# collections themselves are shared, not copied.
to_from_by_year_api = Dict{Any, Any}(
    string(placement_year) => outcomes_for_year
    for (placement_year, outcomes_for_year) in sorted_by_year
)
# alternatively, load an existing file of placement data
# to_from_by_year_api = SBM_flexible.fetch_data("to_from_by_year.json")

In [13]:
#mkpath(".estimates")
# Serialise the per-year placement data to JSON and write it to "to_from_by_year.json"
# under `files_path`, replacing any existing file of that name.
write(files_path*"to_from_by_year.json", JSON.json(to_from_by_year_api));