# Validate continents against grapher

Make sure continents in our `countries_regions.csv` file are consistent with continent definitions in grapher.

In [None]:
from init import *

## Load all data

In [None]:
# Continents from the `country_name_tool_countrydata` table, which is used as an input
# for the `countries-regions` pipeline.
# The query joins every country row to `country_name_tool_continent` on the continent
# id, so `cf` ends up with the columns `owid_name`, `iso_alpha3` and `continent_name`.
# NOTE(review): `pd` and `engine` are not defined in this cell; presumably they come
# from `from init import *` above — confirm.
q = """
select
    cd.owid_name,
    cd.iso_alpha3,
    ct.continent_name
from country_name_tool_countrydata as cd
join country_name_tool_continent as ct on cd.continent = ct.id
"""
cf = pd.read_sql(q, engine)

In [None]:
# Continents from grapher.
# Id of the grapher variable whose data values are read as the continent of each
# entity. Named here so the filter below is not a bare magic number.
CONTINENT_VARIABLE_ID = 123

# One row per data value of that variable: the value itself (`continent`) plus the
# code and name of the entity it belongs to. The rendered SQL is identical to the
# previous hard-coded query.
q = f"""
select
  dv.value as continent,
  e.code as country_code,
  e.name as country_name
from data_values as dv
join entities as e on dv.entityId = e.id
where dv.variableId = {CONTINENT_VARIABLE_ID}
"""
gf = pd.read_sql(q, engine)

In [None]:
# continents from countries_regions.csv
import pandas as pd
from owid import catalog

from etl.paths import DATA_DIR

# Open the dataset stored under `DATA_DIR / "reference"` and take its
# `countries_regions` entry. The comparison against grapher at the end of this
# notebook reads the `name` and `members` fields of it.
reference_dataset = catalog.Dataset(DATA_DIR / "reference")
countries_regions = reference_dataset["countries_regions"]

## Difference between `country_name_tool_countrydata` and grapher

In [None]:
# Attach grapher's continent to every `country_name_tool_countrydata` row, matching
# the row's `iso_alpha3` against grapher's `country_code`.
cf["owid_continent"] = cf.iso_alpha3.map(gf.set_index("country_code").continent)

# Keep rows where the two continents differ, then drop rows that got no grapher
# continent from the mapping above (those cannot be compared).
diffs = cf[cf.continent_name != cf.owid_continent].dropna(subset=["owid_continent"])

# there should be no differences!
if not diffs.empty:
    print(diffs)
    # Same exception type as before, but with a message so that a failing run
    # explains itself.
    raise Exception(
        f"{len(diffs)} row(s) of `country_name_tool_countrydata` have a continent "
        "that differs from grapher (see the rows printed above)"
    )

## Difference between `countries_regions` and grapher

Some of these differences are due to countries that are not assigned to any continent in `countries_regions.csv`. Latest output:

```
Africa
countries_regions.csv - grapher: set()
grapher - countries_regions.csv: {'OWID_SML', 'OWID_ERE'}

Antarctica
countries_regions.csv - grapher: {'SGS'}
grapher - countries_regions.csv: {'ATA'}

Asia
countries_regions.csv - grapher: set()
grapher - countries_regions.csv: {'OWID_CYN', 'OWID_SOS', 'OWID_NAG', 'OWID_AKD', 'OWID_ABK', 'OWID_KRU'}

Europe
countries_regions.csv - grapher: {'OWID_CYN'}
grapher - countries_regions.csv: {'OWID_YGS', 'OWID_USS', 'OWID_SRM', 'OWID_TRS', 'OWID_SEK'}

North America
countries_regions.csv - grapher: set()
grapher - countries_regions.csv: set()

Oceania
countries_regions.csv - grapher: {'OWID_MNS', 'OWID_PYA'}
grapher - countries_regions.csv: set()

South America
countries_regions.csv - grapher: {'OWID_NLC'}
grapher - countries_regions.csv: set()
```

In [None]:
import json

# For every continent present in grapher, compare its country codes with the members
# recorded for the row of the same name in `countries_regions`, in both directions.
for continent, df in gf.groupby("continent"):
    # `members` is JSON text; decode the value from the first row matching the name.
    matching_rows = countries_regions[countries_regions.name == continent]
    cr_countries = json.loads(matching_rows.iloc[0].members)

    print(continent)

    # Build each set once and reuse it for both differences.
    csv_codes = set(cr_countries)
    grapher_codes = set(df.country_code)
    print("countries_regions.csv - grapher:", csv_codes - grapher_codes)
    print("grapher - countries_regions.csv:", grapher_codes - csv_codes)
    print()