In [1]:
# Set up the environment: activate the project located next to this file and
# install the dependencies it records, so the packages loaded below resolve.
using Pkg
Pkg.activate(@__DIR__)
Pkg.instantiate()
# CSV/DataFrames handle the tabular I/O, LightXML parses the XML metadata, and
# PyCall gives access to the Python packages imported further down.
using CSV, LightXML, DataFrames, PyCall

[32m[1m  Activating[22m[39m project at `~/Code/ucas/论文/雪冰科学数据中心`


In [2]:
# Read the input table and keep a handle on its `metadata` column; each entry is
# parsed as an XML document further down.
df = CSV.read(
    joinpath(@__DIR__, "data", "snow_ice_data_mo.csv"),
    DataFrame,
)
metas = df.metadata;

In [3]:
@pyimport deep_translator

"""
    cn(text)

Return a Simplified Chinese rendering of `text`.

Three names have a fixed, hand-picked translation (`"International"`,
`"European Union"` and `"National Cryosphere Desert Data Center"`). Any other
non-blank text is sent to Google Translate through the Python `deep_translator`
package, with the source language auto-detected and `zh-CN` as the target.

Empty or whitespace-only `text` is returned unchanged without contacting the
translator: there is nothing to translate, and `deep_translator` is reported to
raise on empty payloads in some releases.
"""
function cn(text)
    if text == "International"
        return "国际"
    elseif text == "European Union"
        return "欧盟"
    elseif text == "National Cryosphere Desert Data Center"
        return "国家冰川冻土沙漠科学数据中心"
    elseif isempty(strip(text))
        # Joining an empty list of XML values produces "", so this is reachable.
        return text
    end

    translator = deep_translator.GoogleTranslator(source="auto", target="zh-CN")
    return translator.translate(text)
end

cn (generic function with 1 method)

In [4]:
@pyimport pycountry

"""
    country(alpha_3)

Return the English name that belongs to the three-letter code `alpha_3`.

The codes `"AAA"` and `"EEC"` are answered directly with `"International"` and
`"European Union"`. Every other code is looked up in the Python `pycountry`
package by its `alpha_3` field.

# Throws
- `ArgumentError` if `pycountry` has no entry for `alpha_3`.
"""
function country(alpha_3)
    alpha_3 == "AAA" && return "International"
    alpha_3 == "EEC" && return "European Union"

    entry = pycountry.countries.get(alpha_3=alpha_3)
    # An unknown code comes back as Python `None`, i.e. `nothing` here; fail
    # with the offending code instead of an opaque field-access error.
    if entry === nothing
        throw(ArgumentError("unknown alpha-3 country code: $(repr(alpha_3))"))
    end
    return entry.name
end

country (generic function with 1 method)

In [5]:
# Text content of every child element of `parent` whose tag name is `tag`.
function child_texts(parent, tag)
    return String[content(el) for el in get_elements_by_tagname(parent, tag)]
end

# Join `items` with "; " and translate the result with `cn`. An empty result is
# returned as-is so that no translation request is made for it.
function joined_cn(items)
    joined = join(items, "; ")
    return isempty(joined) ? joined : cn(joined)
end

"""
    repository_info(meta)

Parse one repository metadata record (`meta`, an XML string) and return a
`NamedTuple` describing it.

The record is expected to contain a `repository` element directly under the
document root. Extracted fields: identifier, translated name, URL, translated
country of the first listed institution, translated institution list (each as
`name(responsibility types)`), policy URLs, entry and last-update dates,
translated content types, PID systems and data-access values, and API types.

Each `*Count` field counts every matching element, duplicates included, while
the accompanying joined string lists unique values only (the institution list is
the exception: it is not de-duplicated).

The translated repository name is printed as a progress marker.
"""
function repository_info(meta)
    xml = parse_string(meta)
    try
        # Name, URL, institutions, country or international organisation,
        # data-centre overview, distinctive resources.
        repo = find_element(root(xml), "repository")
        org_identifier = content(find_element(repo, "re3data.orgIdentifier"))
        name_cn = cn(content(find_element(repo, "repositoryName")))
        repository_url = content(find_element(repo, "repositoryURL"))

        # Printed before the remaining (slow) translation calls so that a
        # failure can be attributed to the record being processed.
        println(name_cn)

        # Institutions and their countries.
        institution_labels = String[]
        institution_countries = String[]
        for inst in get_elements_by_tagname(repo, "institution")
            inst_name = content(find_element(inst, "institutionName"))
            inst_country = country(content(find_element(inst, "institutionCountry")))
            roles = join(child_texts(inst, "responsibilityType"), "; ")
            push!(institution_labels, string(inst_name, "(", roles, ")"))
            push!(institution_countries, inst_country)
        end

        # Only the first institution's country is reported. A record without
        # any institution yields "" instead of a BoundsError.
        country_cn = if isempty(institution_countries)
            ""
        else
            cn(first(institution_countries))
        end
        institutions_cn = joined_cn(institution_labels)

        # Policies.
        policy_urls = String[
            content(find_element(policy, "policyURL")) for
            policy in get_elements_by_tagname(repo, "policy")
        ]

        # Data content.
        entry_date = content(find_element(repo, "entryDate"))
        last_update = content(find_element(repo, "lastUpdate"))
        content_types = child_texts(repo, "contentType")
        content_types_cn = joined_cn(unique(content_types))

        # Data-sharing technology.
        pid_systems = child_texts(repo, "pidSystem")
        pid_systems_cn = joined_cn(unique(pid_systems))
        api_types = [
            attribute(api, "apiType") for api in get_elements_by_tagname(repo, "api")
        ]

        # Data-access permissions.
        access_values = child_texts(repo, "dataAccess")
        access_values_cn = joined_cn(unique(access_values))

        return (
            orgIdentifier=org_identifier,
            name=name_cn,
            repositoryURL=repository_url,
            country=country_cn,
            institutions=institutions_cn,
            institutionCount=length(institution_labels),
            policies=join(unique(policy_urls), "; "),
            policyCount=length(policy_urls),
            entryDate=entry_date,
            lastUpdate=last_update,
            contentTypes=content_types_cn,
            contentTypeCount=length(content_types),
            pidSystems=pid_systems_cn,
            pidSystemCount=length(pid_systems),
            apis=join(unique(api_types), "; "),
            apiCount=length(api_types),
            dataAccesses=access_values_cn,
            dataAccessCount=length(access_values),
        )
    finally
        # LightXML documents must be released explicitly with `free`. Everything
        # returned above is already a Julia string, so this is safe here.
        free(xml)
    end
end

# One row per metadata record, in input order.
infos = [repository_info(meta) for meta in metas]

# Write the collected rows to a CSV file.
CSV.write(joinpath(@__DIR__, "data", "snow_ice_policy.csv"), infos);

国家冰雪数据中心


剑桥世界冰川学数据中心


国家冰川冻土沙漠科学数据中心


国家极地研究所科学数据库


综合气候数据中心


NOAA 国家环境信息中心 - 古气候数据


美国南极计划数据中心
