In [1]:
"""
SBM API Data Filter (Julia Version)
Adapted from James Yuming Yu (5 June 2023)

Silas Kwok, 31 July 2023
"""

"SBM API Data Filter (Julia Version)\nAdapted from James Yuming Yu (5 June 2023)\n\nSilas Kwok, 31 July 2023\n"

In [2]:
#Pkg.add("ConfigEnv")
#Pkg.add("DotEnv")
#Pkg.add("HTTP")
#Pkg.add("URIs")
#Pkg.add("JSON")
#Pkg.add("Dates")
#Pkg.add("DataStructures")

using Pkg, DotEnv, HTTP, URIs, JSON, Dates, DataStructures

In [3]:
"""
    matches(keywords, phrase)

Return `true` when at least one element of `keywords` occurs in `phrase`
(as decided by `occursin`), and `false` otherwise, including when `keywords` is empty.
"""
function matches(keywords, phrase)
    return any(needle -> occursin(needle, phrase), keywords)
end

matches (generic function with 1 method)

In [4]:
# Download the placement outcomes from the mapinator API and parse the JSON body.
# Both names are kept as globals, exactly as before, so later cells can still see them.
mapinator_data = HTTP.get("https://support.econjobmarket.org/api/mapinator", timeout = 120)
placements = JSON.parse(String(mapinator_data.body))

# Preview the fields of the first placement record only.
# Using `first` instead of a counter over the whole collection avoids mutating a global
# inside a top-level `for` loop, which only works under the notebook/REPL soft scope and
# fails when the same code is run as a script file.
if !isempty(placements)
    println("........")
    for (key, value) in first(placements)
        println(key, " => ", value)
    end
    println("........")
end

........
to_shortname => Economics, Tsinghua University
created_at => 2019-07-02 15:40:46
to_name => Tsinghua University
to_department => Department of Economics, School of Economics and Management
to_latitude => 39.999567
name => Development; Growth
category_id => 1
from_shortname => Economics, Univ of Michigan
to_oid => 623
description => Academic organization (economics department)
startdate => 2010-01-01
recruiter_type => 1
from_oid => 186
position_name => Associate Professor
rank => 14
created_by => 10
aid => 2121
to_institution_id => 535
postype => 2
latitude => 42.2738566
to_longitude => 116.331991
from_institution_id => 4173
to_rank => 86
longitude => -83.7375824
from_department => Department of Economics
from_institution_name => University of Michigan
........


In [5]:
"""
    process_data(DEBUG_LEVEL)

Download the placement outcomes from the mapinator API, reduce them to a single
first-placement outcome per applicant, group those outcomes by start year and write the
grouping to `to_from_by_year_api.json` in the current working directory.

Processing steps:

1. Fetch and parse the placements; drop outcomes whose `to_oid` is 893 (Ocean and Crow).
2. Per applicant, ignore `"Post-Doc"` outcomes and keep the outcome with the earliest
   `startdate`; ties on `startdate` are resolved by the priority
   Assistant Professor > Consultant > Other Academic / Other Non-Academic.
3. Drop applicants whose retained outcome is not one of those four position names.
4. Drop applicants whose `startdate` mentions one of the excluded years, and group the
   remainder by the year at the start of `startdate`.
5. Serialize the grouping as JSON.

# Arguments
- `DEBUG_LEVEL`: when greater than 0, the counters for steps 1 to 4 are printed. The
  per-year summary and the final consistency check are always printed.

# Returns
The value returned by `write`, i.e. the number of bytes written to the JSON file.

# Throws
An `ErrorException` when the download or the JSON parsing fails.
"""
function process_data(DEBUG_LEVEL)
    # STEP 1a: retrieve the placement outcomes
    # NOTE: the request is issued with `timeout = 120`; adjust it if the download needs longer.
    # The result of the `try` expression is bound in function scope: a variable that is only
    # assigned inside the `try` body is local to that body and would not be visible below.
    placements = try
        mapinator_data = HTTP.get("https://support.econjobmarket.org/api/mapinator", timeout = 120)
        JSON.parse(String(mapinator_data.body))
    catch e
        error("Failed to retrieve data from the API: $e")
    end

    # STEP 1b: group placements by applicant ID and eliminate "oid 893" positions (Ocean and Crow)
    # TODO: are the json fields strictly typed? is there a way to easily compensate if the variable types change?

    applicant_outcomes = Dict{Any, Vector}()
    applicant_ids = Set{Any}()
    num_outcomes_selected = 0

    for outcome in placements
        push!(applicant_ids, outcome["aid"])
        if outcome["to_oid"] != 893
            # the closure form only allocates a new vector when the applicant is first seen
            push!(get!(() -> Any[], applicant_outcomes, outcome["aid"]), outcome)
            num_outcomes_selected += 1
        end
    end

    if DEBUG_LEVEL > 0
        println("  ", length(placements), " total placement outcomes")
        println("- ", length(placements) - num_outcomes_selected, " outcomes at Ocean and Crow")
        println("  ", num_outcomes_selected, " outcomes not at Ocean and Crow")
        println()
        println("  ", length(applicant_ids), " total applicants with placements")
        println("- ", length(applicant_ids) - length(applicant_outcomes), " total applicants with exclusively outcomes at Ocean and Crow")
        println("  ", length(applicant_outcomes), " applicants with outcomes not at Ocean and Crow")
        println()
    end

    # --------------------------------------------------------------------------------------------------------------------------------------------

    # STEP 2a: determine the first placement outcome of each individual that occurred after the individual graduated
    # we need to know what the first outcome is BEFORE we filter on types of outcomes, as otherwise we will get incorrectly-identified "first-time positions"

    # STEP 2b: remove postdoc outcomes so applicants with postdoc positions aren't automatically removed from the data
    # postdocs are concurrent so the placements are redundant on top of e.g. concurrently-awarded assistant professor positions

    postdoc_counter = 0
    finalized_applicant_outcomes = Dict{Any, Any}()

    for (applicant_id, outcomes) in applicant_outcomes
        for outcome in outcomes
            if outcome["position_name"] == "Post-Doc"
                postdoc_counter += 1
                continue
            end

            current = get(finalized_applicant_outcomes, applicant_id, nothing)
            if current === nothing
                # just add the outcome if the applicant doesn't have any yet
                finalized_applicant_outcomes[applicant_id] = outcome
            elseif outcome["startdate"] < current["startdate"]
                # take the earliest outcome of the two and ignore the other
                finalized_applicant_outcomes[applicant_id] = outcome
            elseif outcome["startdate"] == current["startdate"]
                # sometimes we may have multiple outcomes that started on the same date - follow priority listing
                new_name = outcome["position_name"]
                current_name = current["position_name"]
                replaces_current =
                    new_name == "Assistant Professor" ||
                    (new_name == "Consultant" && current_name != "Assistant Professor") ||
                    (new_name in ("Other Academic", "Other Non-Academic") &&
                     !(current_name in ("Assistant Professor", "Consultant")))
                if replaces_current
                    finalized_applicant_outcomes[applicant_id] = outcome
                end
            end
        end
    end

    if DEBUG_LEVEL > 0
        println("- ", length(applicant_outcomes) - length(finalized_applicant_outcomes), " total applicants removed due to only being postdocs (",
            postdoc_counter, " total postdoc placements detected)")
        println("  ", length(finalized_applicant_outcomes), " total applicants ported to finalized collection")
        println()
    end

    # --------------------------------------------------------------------------------------------------------------------------------------------

    # STEP 3: eliminate everything except:
    # - Assistant Professor
    # - Consultant
    # - Other Academic
    # - Other Non-Academic

    valid_labels = Set(["Assistant Professor", "Consultant", "Other Academic", "Other Non-Academic"])

    irrelevant_counter = 0
    removed_labels = Set{Any}()

    # iterate over a snapshot of the keys because entries are deleted inside the loop
    for applicant_id in collect(keys(finalized_applicant_outcomes))
        outcome = finalized_applicant_outcomes[applicant_id]
        if !(outcome["position_name"] in valid_labels)
            push!(removed_labels, outcome["position_name"])
            delete!(finalized_applicant_outcomes, applicant_id)
            irrelevant_counter += 1
        end
    end

    if DEBUG_LEVEL > 0
        println("- ", irrelevant_counter, " irrelevant applicants removed from the following classes of positions:")
        println(removed_labels)
        println("  ", length(finalized_applicant_outcomes), " applicants remaining after irrelevant-position applicants removed:")
        maintained_labels = Dict{Any, Int}()
        for outcome in values(finalized_applicant_outcomes)
            position_name = outcome["position_name"]
            maintained_labels[position_name] = get(maintained_labels, position_name, 0) + 1
        end
        # printed inside the debug branch: `maintained_labels` only exists here
        println(maintained_labels, " ", sum(values(maintained_labels)), " total")
        println()
    end

    # --------------------------------------------------------------------------------------------------------------------------------------------

    # STEP 4: filter-by-year

    # start years whose placements are excluded from the output
    excluded_years = ["2022", "2023", "2024", "2025", "2026"]

    sorted_by_year = Dict{Any, Dict}()
    removed_year_placed = 0

    # iterate over a snapshot of the keys because entries are deleted inside the loop
    for applicant_id in collect(keys(finalized_applicant_outcomes))
        outcome = finalized_applicant_outcomes[applicant_id]
        if matches(excluded_years, outcome["startdate"])
            removed_year_placed += 1
            delete!(finalized_applicant_outcomes, applicant_id)
        else
            # the year is the text before the first "-" of the start date
            year = parse(Int, split(outcome["startdate"], "-")[1])
            get!(() -> Dict{Any, Any}(), sorted_by_year, year)[applicant_id] = outcome
        end
    end

    if DEBUG_LEVEL > 0
        println("- ", removed_year_placed, " applicants removed due to placement in ", join(excluded_years, "/"))
        println("  ", length(finalized_applicant_outcomes), " applicants remaining after year corrections")
        println()
    end

    result = Dict{Any, Any}()

    for key in sort(collect(keys(sorted_by_year)))
        println("Year ", key, " has ", length(sorted_by_year[key]), " placement outcomes")
        result[key] = sorted_by_year[key]
    end

    println()

    # --------------------------------------------------------------------------------------------------------------------------------------------

    # STEP 5: save to disk

    # `init = 0` keeps the sum defined when no outcomes survived the filters
    total_check = sum(length, values(sorted_by_year); init = 0)
    remaining = length(finalized_applicant_outcomes)
    status = remaining == total_check ? "SUCCESS" : "FAIL"
    println("Total $total_check applicants in JSON file (compare to $remaining applicants in finalized_applicant_outcomes: $status)")

    json_str = JSON.json(result, 4)
    return open("to_from_by_year_api.json", "w") do f
        write(f, json_str)
    end

    # --------------------------------------------------------------------------------------------------------------------------------------------
end

process_data (generic function with 1 method)

In [6]:
"""
    main(args)

Run `process_data` with a debug level taken from the command-line arguments `args`.

The debug level defaults to 1. It is overridden only when the first element of `args`
parses as an integer equal to 0 or 1; anything else (no arguments, non-numeric text,
out-of-range numbers) leaves the default in place instead of throwing.

- DEBUG LEVEL 0: no debug
- DEBUG LEVEL 1: basic counter printouts

Returns whatever `process_data` returns.
"""
function main(args)
    DEBUG_LEVEL = 1

    # Julia's ARGS does not contain the script name, so the first user-supplied argument
    # is args[1]. `tryparse` yields `nothing` for non-numeric input rather than throwing.
    if !isempty(args)
        requested = tryparse(Int, args[1])
        if requested !== nothing && requested in 0:1
            DEBUG_LEVEL = requested
        end
    end

    return process_data(DEBUG_LEVEL)
end

main (generic function with 1 method)

In [7]:
# Entry point: run the filter, passing the command-line arguments held in ARGS.
main(ARGS)


  25521 total placement outcomes
- 4331 outcomes at Ocean and Crow
  21190 outcomes not at Ocean and Crow

  18377 total applicants with placements
- 3007 total applicants with exclusively outcomes at Ocean and Crow
  15370 applicants with outcomes not at Ocean and Crow

- 1737 total applicants removed due to only being postdocs (3335 total postdoc placements detected)
  13633 total applicants ported to finalized collection

- 1767 irrelevant applicants removed from the following classes of positions:
Set(Any["Tenured Professor", "Untenured Professor", "Associate Professor", "Assistant, Associate or Full Professor", "Full Professor", "Professor Any Level", "Temporary Lecturer", "Visiting Professor/Lecturer/Instructor", "Assistant or Associate Professor", "Lecturer"])
  11866 applicants remaining after irrelevant-position applicants removed:
Dict{Any, Int64}("Other Non-Academic" => 2449, "Consultant" => 524, "Other Academic" => 1242, "Assistant Professor" => 7651) 11866 total

- 1271 ap

11998999