## Add entity codes and legacy ids from mysql entities table

In [1]:
from init import *

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the country table from the intermediate feather file.
countries = pd.read_feather("intermediate/02-countries-with-wikidata-ids.feather")

### Retrieve mysql Entity entries that are relevant and merge them

For merging legacy data it would be useful to have the entity ids in the country table. The entity table also includes a code that might be what we want to use for the primary key (ISO alpha-3 by default, extended with OWID-specific codes such as OWID_WRL for "World").

We use two mysql queries to fetch the relevant information. The first, straightforward one just gets all entities that have a `code` assigned. This happens to be practically all countries, territories and some historical countries that we have in the country table, plus world.

The second query is a safety net to catch oversights in the former approach (missing OWID codes for legitimate territories). We use the Gapminder population dataset (Id 72) since it is used ubiquitously and references most countries and territories.

In [3]:
# Entities that have a `code` assigned (result of the first mysql query described above).
entities_with_code = pd.read_feather("intermediate/entities-with-code.feather")

In [4]:
# (rows, columns) of the code-bearing entity extract.
entities_with_code.shape

In [5]:
# Entities referenced by the population dataset (result of the second query described above).
entities_from_population = pd.read_feather("intermediate/entities-from-population.feather")

In [6]:
# (rows, columns) of the population-derived entity extract.
entities_from_population.shape

In [7]:
# Outer-join the two entity extracts. No key is given, so pandas joins on the
# columns the two frames have in common.
entities_merged = entities_with_code.merge(entities_from_population, how="outer")

In [8]:
# (rows, columns) after combining both entity extracts.
entities_merged.shape

In [9]:
# Entities that came in without a code.
entities_merged[entities_merged["code"].isna()]

Unnamed: 0,id,code,name
289,273,,Africa
290,275,,Asia
291,276,,Europe
292,294,,North America
293,277,,Oceania
294,60094,,Saint Barthlemy
295,295,,South America


In [10]:
# How is Saint Barthélemy spelled in the countries table?
countries[countries["owid_name"].str.contains("Barth")]

Unnamed: 0,id,owid_name,iso_alpha2,iso_alpha3,imf_code,cow_letter,cow_code,unctad_code,marc_code,ncd_code,kansas_code,penn_code,continent,wikidata_uri,wikidata_label
206,795,Saint Barthlemy,BL,BLM,,,,,,,,,1.0,http://www.wikidata.org/entity/Q25362,Saint Barthélemy


In [11]:
# ...and which spellings exist on the entity side?
entities_merged[entities_merged["name"].str.contains("Barth")]

Unnamed: 0,id,code,name
206,297,BLM,Saint Barthélemy
294,60094,,Saint Barthlemy


⚡ Saint Barthlemy is here twice - apparently the population dataset references a version with a spelling error - let's drop this row

In [12]:
# Index labels of every row whose name contains "Barth" but that has no code,
# i.e. the misspelled duplicate pulled in via the population dataset.
# The complete (possibly empty) index is kept instead of `.index[0]`, so this
# cell does not raise an IndexError when it is re-run after the row has already
# been dropped; `DataFrame.drop` accepts the index as-is.
# regex=False: "Barth" is meant as a literal substring, not a pattern.
st_barthlemy_index_delete = entities_merged[
    entities_merged["name"].str.contains("Barth", regex=False)
    & entities_merged["code"].isna()
].index

In [13]:
# Show what was selected for deletion before actually dropping it.
st_barthlemy_index_delete

In [14]:
# Remove the misspelled duplicate row from the merged entities (modifies the frame in place).
entities_merged.drop(index=st_barthlemy_index_delete, inplace=True)

In [15]:
# Check: only the correctly coded entry should remain.
entities_merged[entities_merged["name"].str.contains("Barth")]

Unnamed: 0,id,code,name
206,297,BLM,Saint Barthélemy


⚡ Figure out if any other entities need to be fixed up so that we can merge

In [16]:
# Distinct codes on the entity side.
# NOTE(review): rows with a missing code contribute a null entry to this set -
# confirm that this is intended when reading the difference below.
entities_codes = set(entities_merged["code"])

In [17]:
# Distinct ISO alpha-3 codes on the countries side.
countries_codes = set(countries["iso_alpha3"])

In [18]:
# Codes present on entities but absent from the countries' iso_alpha3 column.
entities_codes - countries_codes

In [19]:
# Number of entity codes that have no iso_alpha3 counterpart.
len(entities_codes - countries_codes)

💡 We need to match these 39 rows by name; afterwards we can merge by this merge key (iso_alpha3 for those that have it already, these OWID codes for the rest).

❔ Could we match on names only between countries and entities?

In [20]:
# Narrow the countries table to the two columns needed for the trial merge.
countries_merge_test = countries.loc[:, ["owid_name", "iso_alpha3"]]

In [21]:
# (rows, columns) of the reduced countries frame.
countries_merge_test.shape

In [22]:
# Trial outer join by name: country `owid_name` against entity `name`.
test_merge = pd.merge(
    countries_merge_test,
    entities_merged,
    how="outer",
    left_on="owid_name",
    right_on="name",
)

In [23]:
# (rows, columns) of the trial merge.
test_merge.shape

In [24]:
# Peek at the first rows of the name-based trial merge.
test_merge.head()

Unnamed: 0,owid_name,iso_alpha3,id,code,name
0,Afghanistan,AFG,15.0,AFG,Afghanistan
1,Aland Islands,ALA,,,
2,Albania,ALB,16.0,ALB,Albania
3,Algeria,DZA,17.0,DZA,Algeria
4,American Samoa,ASM,246.0,ASM,American Samoa


In [25]:
# Rows of the trial merge that ended up without an entity code.
test_merge[test_merge["code"].isna()]

Unnamed: 0,owid_name,iso_alpha3,id,code,name
1,Aland Islands,ALA,,,
128,Korea,,,,
157,Micronesia (region),,,,
201,Rest of the World,,,,
206,Saint Barthlemy,BLM,,,
294,,,273.0,,Africa
295,,,275.0,,Asia
296,,,276.0,,Europe
297,,,294.0,,North America
298,,,277.0,,Oceania


In [26]:
# Rows of the trial merge whose owid_name is missing.
test_merge[test_merge["owid_name"].isna()]

Unnamed: 0,owid_name,iso_alpha3,id,code,name
280,,,386.0,OWID_ABK,Abkhazia
281,,,387.0,OWID_AKD,Akrotiri and Dhekelia
282,,,388.0,OWID_ERE,Eritrea and Ethiopia
283,,,389.0,OWID_NAG,Nagorno-Karabakh
284,,,297.0,BLM,Saint Barthélemy
285,,,268.0,OWID_SRM,Serbia and Montenegro
286,,,392.0,OWID_SEK,Serbia excluding Kosovo
287,,,393.0,OWID_SML,Somaliland
288,,,394.0,OWID_SOS,South Ossetia
289,,,395.0,OWID_TRS,Transnistria


💡 The only entries that should match by name but don't are Aland Islands and Saint Barthélemy. Let's fix these and then do the merge on the full tables by name

In [27]:
# Entity-side name currently stored for code ALA.
entities_merged.loc[entities_merged["code"].eq("ALA"), "name"]

In [28]:
# Entity-side row currently stored for code BLM.
entities_merged.loc[entities_merged["code"].eq("BLM")]

Unnamed: 0,id,code,name
206,297,BLM,Saint Barthélemy


In [29]:
# Country-side spelling for ALA.
countries.loc[countries["iso_alpha3"].eq("ALA"), "owid_name"]

In [30]:
# Country-side spelling for BLM.
countries.loc[countries["iso_alpha3"].eq("BLM"), "owid_name"]

In [31]:
# Overwrite the entity name for ALA with the countries table's spelling so the
# name-based merge can match. `.item()` insists on exactly one matching country row.
entities_merged.loc[entities_merged["code"].eq("ALA"), "name"] = (
    countries.loc[countries["iso_alpha3"].eq("ALA"), "owid_name"].item()
)

In [32]:
# Overwrite the entity name for BLM with the countries table's spelling so the
# name-based merge can match. `.item()` insists on exactly one matching country row.
entities_merged.loc[entities_merged["code"].eq("BLM"), "name"] = (
    countries.loc[countries["iso_alpha3"].eq("BLM"), "owid_name"].item()
)

In [33]:
# Check the ALA entity row after the rename.
entities_merged.loc[entities_merged["code"].eq("ALA")]

Unnamed: 0,id,code,name
288,296,ALA,Aland Islands


In [34]:
# Check the BLM entity row after the rename.
entities_merged.loc[entities_merged["code"].eq("BLM")]

Unnamed: 0,id,code,name
206,297,BLM,Saint Barthlemy


⚡ Let's do the merge on the full tables now. We do an outer merge to already pull in the continents. We'll set them up properly in a later step but we want to grab the correct entity ids already now

In [35]:
# Outer merge on the name so that unmatched rows from both sides are kept
# (e.g. the continents, which exist only on the entity side).
# validate="one_to_one" makes pandas raise a MergeError if a name occurs more
# than once on either side, instead of silently duplicating rows in the result.
countries_with_entities = countries.merge(
    right=entities_merged,
    how="outer",
    left_on="owid_name",
    right_on="name",
    validate="one_to_one",
)

In [36]:
# (rows, columns) of the full merge.
countries_with_entities.shape

In [37]:
# Rows without an entity code after the full merge.
countries_with_entities[countries_with_entities["code"].isna()]

Unnamed: 0,id_x,owid_name,iso_alpha2,iso_alpha3,imf_code,cow_letter,cow_code,unctad_code,marc_code,ncd_code,kansas_code,penn_code,continent,wikidata_uri,wikidata_label,id_y,code,name
128,832.0,Korea,,,,KOR,730.0,,,,,,2.0,,,,,
157,632.0,Micronesia (region),,,,,,,,,,,,,,,,
201,560.0,Rest of the World,,,,,,,,,,,,,,,,
292,,,,,,,,,,,,,,,,273.0,,Africa
293,,,,,,,,,,,,,,,,275.0,,Asia
294,,,,,,,,,,,,,,,,276.0,,Europe
295,,,,,,,,,,,,,,,,294.0,,North America
296,,,,,,,,,,,,,,,,277.0,,Oceania
297,,,,,,,,,,,,,,,,295.0,,South America


In [38]:
# Rows where the entity-side name is missing.
countries_with_entities[countries_with_entities["name"].isna()]

Unnamed: 0,id_x,owid_name,iso_alpha2,iso_alpha3,imf_code,cow_letter,cow_code,unctad_code,marc_code,ncd_code,kansas_code,penn_code,continent,wikidata_uri,wikidata_label,id_y,code,name
128,832.0,Korea,,,,KOR,730.0,,,,,,2.0,,,,,
157,632.0,Micronesia (region),,,,,,,,,,,,,,,,
201,560.0,Rest of the World,,,,,,,,,,,,,,,,


In [39]:
# Rows where owid_name is missing.
countries_with_entities[countries_with_entities["owid_name"].isna()]

Unnamed: 0,id_x,owid_name,iso_alpha2,iso_alpha3,imf_code,cow_letter,cow_code,unctad_code,marc_code,ncd_code,kansas_code,penn_code,continent,wikidata_uri,wikidata_label,id_y,code,name
280,,,,,,,,,,,,,,,,386.0,OWID_ABK,Abkhazia
281,,,,,,,,,,,,,,,,387.0,OWID_AKD,Akrotiri and Dhekelia
282,,,,,,,,,,,,,,,,388.0,OWID_ERE,Eritrea and Ethiopia
283,,,,,,,,,,,,,,,,389.0,OWID_NAG,Nagorno-Karabakh
284,,,,,,,,,,,,,,,,268.0,OWID_SRM,Serbia and Montenegro
285,,,,,,,,,,,,,,,,392.0,OWID_SEK,Serbia excluding Kosovo
286,,,,,,,,,,,,,,,,393.0,OWID_SML,Somaliland
287,,,,,,,,,,,,,,,,394.0,OWID_SOS,South Ossetia
288,,,,,,,,,,,,,,,,395.0,OWID_TRS,Transnistria
289,,,,,,,,,,,,,,,,270.0,OWID_USS,USSR


💡 There are some historical entities that we didn't have in the original mysql countries table but for which owid_codes were assigned. These do not have owid_names yet. The same is true for the continents. Backport the name to the owid_name field now

In [40]:
# Wherever owid_name is missing, take the value of the entity `name` column
# from the same row; rows that already have an owid_name are left untouched.
countries_with_entities["owid_name"] = countries_with_entities["owid_name"].fillna(
    countries_with_entities["name"]
)

In [41]:
# Check: list any row that is still missing an owid_name.
countries_with_entities[countries_with_entities["owid_name"].isna()]

Unnamed: 0,id_x,owid_name,iso_alpha2,iso_alpha3,imf_code,cow_letter,cow_code,unctad_code,marc_code,ncd_code,kansas_code,penn_code,continent,wikidata_uri,wikidata_label,id_y,code,name


In [42]:
# Rows that still lack an entity code.
countries_with_entities[countries_with_entities["code"].isna()]

Unnamed: 0,id_x,owid_name,iso_alpha2,iso_alpha3,imf_code,cow_letter,cow_code,unctad_code,marc_code,ncd_code,kansas_code,penn_code,continent,wikidata_uri,wikidata_label,id_y,code,name
128,832.0,Korea,,,,KOR,730.0,,,,,,2.0,,,,,
157,632.0,Micronesia (region),,,,,,,,,,,,,,,,
201,560.0,Rest of the World,,,,,,,,,,,,,,,,
292,,Africa,,,,,,,,,,,,,,273.0,,Africa
293,,Asia,,,,,,,,,,,,,,275.0,,Asia
294,,Europe,,,,,,,,,,,,,,276.0,,Europe
295,,North America,,,,,,,,,,,,,,294.0,,North America
296,,Oceania,,,,,,,,,,,,,,277.0,,Oceania
297,,South America,,,,,,,,,,,,,,295.0,,South America


## Clean merged dataframe and save the new version with entity ids

In [43]:
# Drop the entity-side `name` column (modifies the frame in place).
# Run this exactly once: the following cell renames `owid_name` to `name`, so a
# second run would remove that renamed column instead.
countries_with_entities.drop(columns=["name"], inplace=True)

In [44]:
# Replace the merge-suffixed id columns with descriptive names and make
# `owid_name` the canonical `name` column (modifies the frame in place).
countries_with_entities.rename(
    columns=dict(
        id_x="legacy_country_id",
        owid_name="name",
        id_y="legacy_entity_id",
    ),
    inplace=True,
)

In [45]:
# (rows, columns) after dropping and renaming columns.
countries_with_entities.shape

In [46]:
# Preview the frame with the renamed columns.
countries_with_entities.head()

Unnamed: 0,legacy_country_id,name,iso_alpha2,iso_alpha3,imf_code,cow_letter,cow_code,unctad_code,marc_code,ncd_code,kansas_code,penn_code,continent,wikidata_uri,wikidata_label,legacy_entity_id,code
0,562.0,Afghanistan,AF,AFG,512.0,AFG,700.0,AFG,AF,AFGN,AFG,AFG,2.0,http://www.wikidata.org/entity/Q889,Afghanistan,15.0,AFG
1,791.0,Aland Islands,AX,ALA,,,,,,,,,4.0,http://www.wikidata.org/entity/Q5689,Åland,296.0,ALA
2,565.0,Albania,AL,ALB,914.0,ALB,339.0,ALB,AA,ALBN,ALB,ALB,4.0,http://www.wikidata.org/entity/Q222,Albania,16.0,ALB
3,619.0,Algeria,DZ,DZA,612.0,ALG,615.0,ALG,AE,ALGR,DZA,DZA,3.0,http://www.wikidata.org/entity/Q262,Algeria,17.0,DZA
4,571.0,American Samoa,AS,ASM,859.0,,,,AS,,ASM,ASM,6.0,http://www.wikidata.org/entity/Q16641,American Samoa,246.0,ASM


In [47]:
# List the current column names.
countries_with_entities.columns

In [48]:
# Reorder the columns: `code` and `name` lead, the two legacy ids go last.
countries_with_entities = countries_with_entities.loc[
    :,
    [
        # identity
        "code",
        "name",
        # external code systems
        "iso_alpha2",
        "iso_alpha3",
        "imf_code",
        "cow_letter",
        "cow_code",
        "unctad_code",
        "marc_code",
        "ncd_code",
        "kansas_code",
        "penn_code",
        "continent",
        # wikidata
        "wikidata_uri",
        "wikidata_label",
        # ids from the legacy tables
        "legacy_entity_id",
        "legacy_country_id",
    ],
]

In [49]:
# Preview the final column order.
countries_with_entities.head()

Unnamed: 0,code,name,iso_alpha2,iso_alpha3,imf_code,cow_letter,cow_code,unctad_code,marc_code,ncd_code,kansas_code,penn_code,continent,wikidata_uri,wikidata_label,legacy_entity_id,legacy_country_id
0,AFG,Afghanistan,AF,AFG,512.0,AFG,700.0,AFG,AF,AFGN,AFG,AFG,2.0,http://www.wikidata.org/entity/Q889,Afghanistan,15.0,562.0
1,ALA,Aland Islands,AX,ALA,,,,,,,,,4.0,http://www.wikidata.org/entity/Q5689,Åland,296.0,791.0
2,ALB,Albania,AL,ALB,914.0,ALB,339.0,ALB,AA,ALBN,ALB,ALB,4.0,http://www.wikidata.org/entity/Q222,Albania,16.0,565.0
3,DZA,Algeria,DZ,DZA,612.0,ALG,615.0,ALG,AE,ALGR,DZA,DZA,3.0,http://www.wikidata.org/entity/Q262,Algeria,17.0,619.0
4,ASM,American Samoa,AS,ASM,859.0,,,,AS,,ASM,ASM,6.0,http://www.wikidata.org/entity/Q16641,American Samoa,246.0,571.0


In [50]:
# Persist the result as a feather file.
# NOTE(review): the file name spells "entitiyids"; it is left unchanged here because
# other steps may read this exact path - confirm before renaming.
countries_with_entities.to_feather("intermediate/03-countries-with-entitiyids.feather")