In [None]:
from autumn.core.inputs.database import get_input_db
import pycountry
import os
import pandas as pd
from autumn.settings.folders import DATA_PATH
from datetime import datetime

In [None]:

# Handle to the input database; every data-source query in this notebook goes through it.
input_db = get_input_db()

In [None]:
# Hand-maintained overrides: country names as spelled in the source data,
# mapped directly to their ISO 3166-1 alpha-3 code.
manual_map = {
    "Democratic Republic of the Congo": "COD",
    "Laos": "LAO",
    "U.S. Virgin Islands": "VIR",
    "Bonaire": "BES",  # ISO entry is "Bonaire, Sint Eustatius and Saba"; "BOS" is not an assigned code
    "Curacao": "CUW",
    "Iran": "IRN",
    "Reunion": "REU",
    "Sint Maarten": "SXM",  # Dutch part; "MAF" is Saint Martin (French part)
    "Taiwan": "TWN",
    "USA": "USA",
}

def get_iso3(country):
    """Return the ISO 3166-1 alpha-3 code for a country name.

    Resolution order:
      1. the hand-maintained ``manual_map`` overrides;
      2. a pycountry fuzzy search, accepted only when it yields exactly one match;
      3. an exact pycountry name lookup. The name is printed first so that
         names needing this fallback are visible in the cell output.

    Args:
        country: Country name as it appears in the source data.

    Returns:
        The three-letter ISO code.

    Raises:
        LookupError: If the name cannot be resolved to a single country.
    """
    if country in manual_map:
        return manual_map[country]

    try:
        country_objects = pycountry.countries.search_fuzzy(country)
    except LookupError:
        # search_fuzzy raises when nothing matches; fall through so the
        # failure is reported with a descriptive message below.
        country_objects = []

    if len(country_objects) == 1:
        return country_objects[0].alpha_3

    print(country)
    exact_match = pycountry.countries.get(name=country)
    if exact_match is None:
        # Previously this surfaced as an AttributeError on None.
        raise LookupError(
            f"Could not resolve '{country}' to a single ISO3 code "
            f"({len(country_objects)} fuzzy matches, no exact name match). "
            "Add it to manual_map or to the excluded countries."
        )
    return exact_match.alpha_3

### Excluded countries

In [None]:
# Names dropped from the GISAID country list before the ISO3 lookup.
excluded_countries = [
    "Canary Islands",
    "Crimea",
]

# Population size by age (n=201)

In [None]:
# Population rows for 2020 whose region is unset (presumably country-level rows; verify against the table).
population_conditions = {"year": 2020, "region": None}
pop = input_db.query(table_name='population', conditions=population_conditions)

# Distinct ISO3 codes that have population data
pop_iso3s = pop["iso3"].unique()
print(len(pop_iso3s))

## GISAID (n=204)

In [None]:

gisaid_table = input_db.query(table_name='gisaid')

# Distinct country names reported in GISAID, minus the explicitly excluded ones
gisaid_countries = [
    name for name in gisaid_table.Country.unique() if name not in excluded_countries
]

# Map every name to ISO3; dict.fromkeys drops repeated codes while keeping first-seen order
gisaid_iso3s = list(dict.fromkeys(map(get_iso3, gisaid_countries)))


In [None]:
# Number of distinct ISO3 codes obtained from GISAID (displayed as the cell output)
len(gisaid_iso3s)

## UNESCO (n=210)

In [None]:
# List the tables available in the input database (exploratory; the result is not stored)
input_db.table_names()

In [None]:
# Distinct country identifiers present in the school-closure table
school_closure_table = input_db.query(table_name='school_closure')
unesco_iso3s = list(school_closure_table["country_id"].unique())


In [None]:
# Number of distinct country identifiers in the school-closure data (displayed as the cell output)
len(unesco_iso3s)

## SeroTracker
Both national and sub-national estimates are used; national estimates are flagged so that they can also be analysed on their own.

In [None]:
# All sero-survey records, unfiltered (database updated 27 Mar 2023)
sero_data = input_db.query(table_name='sero-survey', conditions={})

In [None]:
# Parse the sampling end date so it can be compared against a cut-off date.
# Bracket assignment is required here: pandas does not create a new column from
# attribute-style assignment (`frame.new_name = ...` only sets a Python attribute).
# The new column is still readable as `sero_data.end_date`, and it is carried
# into any frame derived from sero_data.
sero_data["end_date"] = pd.to_datetime(sero_data["sampling_end_date"], format="%Y/%m/%d")

In [None]:
# Eligibility criteria, each expressed as a boolean mask over sero_data
unity_filter = sero_data["is_unity_aligned"].eq("Unity-Aligned")
size_filter = sero_data["denominator_value"].ge(599)
date_filter = sero_data.end_date.le(datetime(2021, 5, 1))

# Keep only the primary estimate of each study (always available)
subgroup_filter = sero_data["subgroup_var"].eq("Primary Estimate")

# Mask selecting national-level estimates
national_filter = sero_data["estimate_grade"].eq("National")

In [None]:
# Studies excluded by hand ("red flags"), keyed by study name with the reason for exclusion
red_flag_reasons = {
    "210109_Australia_AustralianNationalUniversity": "Prevalence of asymptomatic",
    "220131_Australia_UniversityOfSydney_Antenatal": "only in pregnant women",
    "210121_BarrioMugica_MinistryOfHealthOfTheCityOfBuenosAires": "focusing on slum population",
    "211211_Vietnam_UniversityofSydney_TienSubCommune": "quarantine workers",
    "211211_Vietnam_UniversityofSydney_BacMaSubCommune": "quarantine workers",
}
red_flags = list(red_flag_reasons)

# True for every record whose study is NOT red-flagged
redflag_filter = ~sero_data["study_name"].isin(red_flags)

In [None]:
# Criteria shared by both selections
eligible_filter = unity_filter & size_filter & date_filter & subgroup_filter & redflag_filter

# All eligible estimates, and the subset that is national-level
filtered_sero_data = sero_data[eligible_filter]
filtered_sero_data_national = sero_data[eligible_filter & national_filter]

In [None]:
# Distinct ISO3 codes with at least one eligible estimate (any level / national only)
iso3_column = 'alpha_3_code'
SeroTracker_iso3s = list(filtered_sero_data[iso3_column].unique())
SeroTracker_iso3s_national = list(filtered_sero_data_national[iso3_column].unique())

# Intersection of data sources

In [None]:
def _in_all_sources(iso3, sero_iso3s):
    """Tell whether a code is present in the population, UNESCO and given SeroTracker lists."""
    return iso3 in pop_iso3s and iso3 in unesco_iso3s and iso3 in sero_iso3s


# GISAID order is preserved; a country is kept only if every other source also covers it
included_iso3s = [code for code in gisaid_iso3s if _in_all_sources(code, SeroTracker_iso3s)]
included_iso3s_national = [
    code for code in gisaid_iso3s if _in_all_sources(code, SeroTracker_iso3s_national)
]
print(len(included_iso3s))
print(len(included_iso3s_national))

### Apply preferential filters to select one sero estimate per country and export the results as csv files 

In [None]:
# For each level, keep exactly one sero estimate per country and write the
# selection to serodata_<level>.csv.
#   - "national": countries with an eligible national estimate, chosen among national estimates.
#   - "subnational": the remaining included countries, chosen among all their eligible estimates.
RISK_OF_BIAS_LABELS = ["['Low']", "['Moderate']", "['High']"]
RISK_OF_BIAS_SCORES = [2, 1, 0]  # higher score = lower risk of bias

for level in ["national", "subnational"]:
    if level == "national":
        iso3_list = included_iso3s_national
        full_data = filtered_sero_data_national
    else:
        iso3_list = [iso3 for iso3 in included_iso3s if iso3 not in included_iso3s_national]
        full_data = filtered_sero_data

    # Collect the per-country selections and concatenate once at the end,
    # rather than growing a DataFrame inside the loop.
    selected_rows = []

    for iso3 in iso3_list:
        country_data = full_data[full_data['alpha_3_code'] == iso3]

        # we prefer to use data with the lowest risk of bias possible
        # (the labels are replaced by scores frame-wide, so the exported rows carry the scores)
        country_data = country_data.replace(RISK_OF_BIAS_LABELS, RISK_OF_BIAS_SCORES)
        best_score = country_data['overall_risk_of_bias'].max()
        country_data = country_data[country_data['overall_risk_of_bias'] == best_score]

        # we prioritise the largest sample size
        largest_sample = country_data['denominator_value'].max()
        country_data = country_data[country_data['denominator_value'] == largest_sample]

        # Ties (or an empty selection) cannot be resolved automatically
        assert len(country_data) == 1, (
            f"Expected exactly one estimate for {iso3} ({level}), found {len(country_data)}"
        )

        selected_rows.append(country_data)

    if selected_rows:
        output_data = pd.concat(selected_rows)
    else:
        # No country at this level: still write a header-only file
        output_data = pd.DataFrame(columns=full_data.columns)

    output_data.to_csv(f"serodata_{level}.csv")

## Make a map

### All countries, including sub-national sero-surveys

In [None]:
import plotly.express as px

print(f"N countries included: {len(included_iso3s)}")

# One row per included country, located by ISO3 code; the constant value
# simply gives every included country the same colour on the map.
df = pd.DataFrame({'country': included_iso3s, 'value': [1.] * len(included_iso3s)})
fig = px.choropleth(df, locations="country", color="value")
fig.show()

### Only including national sero-surveys

In [None]:
import plotly.express as px

print(f"N countries included: {len(included_iso3s_national)}")

# One row per country with a national sero-survey, located by ISO3 code; the
# constant value simply gives every included country the same colour on the map.
df = pd.DataFrame({'country': included_iso3s_national, 'value': [1.] * len(included_iso3s_national)})
fig = px.choropleth(df, locations="country", color="value")
fig.show()

### Dump list of included countries

In [None]:
def _iso3_to_name(iso3):
    """Look up the pycountry name registered for an ISO3 code."""
    return pycountry.countries.get(alpha_3=iso3).name


# Country names, in the same order as the corresponding ISO3 lists
included_countries = list(map(_iso3_to_name, included_iso3s))
included_countries_national = list(map(_iso3_to_name, included_iso3s_national))

In [None]:
# ISO3 -> country name, for all included countries and for the national-only subset
included_dict = {
    "all": dict(zip(included_iso3s, included_countries)),
    "national": dict(zip(included_iso3s_national, included_countries_national)),
}

In [None]:
import yaml

# Write the ISO3 -> name mappings to YAML. allow_unicode keeps non-ASCII
# country names readable instead of emitting escape sequences, and the file is
# opened explicitly as UTF-8 so the result does not depend on the platform default.
with open('included_countries.yml', 'w', encoding='utf-8') as outfile:
    yaml.dump(included_dict, outfile, default_flow_style=False, allow_unicode=True)