## Add wikidata identifiers for all rows with ISO alpha 3 codes

In [1]:
# pandas is used from this first cell onwards, so it is imported here to keep
# "Restart Kernel -> Run All" working on a fresh kernel.
import pandas as pd

# Load the OWID country table; this is the frame the Wikidata ids are joined
# onto further down (via its iso_alpha3 column).
df = pd.read_feather("entities/01-countries-from-mysql.feather")

### Get all Wikidata entity ids for countries with an ISO alpha 3 code + English common name

We do this for two reasons. First, we want to store the Wikidata ids anyhow, and second, we will compare the English common names for the matches we get. We expect all our ISO alpha 3 codes to be the official ones, so if we either do not get a reply for an entity or the country name is very different, then something odd may be happening.

In [2]:
from SPARQLWrapper import SPARQLWrapper, JSON

In [3]:
# Single client object reused by both SPARQL queries in this notebook.
WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
sparql = SPARQLWrapper(WIKIDATA_SPARQL_ENDPOINT)

In [4]:
# Cap DataFrame displays at 30 rows, and show exactly 30 when a frame is truncated.
for display_option in ("display.max_rows", "display.min_rows"):
    pd.set_option(display_option, 30)

⚡ For the query below: P31 is "instance of", Q3624078 is "sovereign state". P297 is ISO alpha 2, P298 is ISO alpha 3

In [5]:
# Sovereign states (P31 = Q3624078) that carry both an ISO alpha 2 (P297) and an
# ISO alpha 3 (P298) code. The two FILTER NOT EXISTS clauses drop entities that
# are also tagged as a former country or as an ancient civilisation.
sovereign_states_query = """
SELECT DISTINCT ?country ?countryLabel ?isoalpha2 ?isoalpha2Label ?isoalpha3 ?isoalpha3Label
WHERE
{
  ?country wdt:P31 wd:Q3624078 .
  ?country wdt:P297 ?isoalpha2 .
  ?country wdt:P298 ?isoalpha3 .
  #not a former country
  FILTER NOT EXISTS {?country wdt:P31 wd:Q3024240}
  #and no an ancient civilisation (needed to exclude ancient Egypt)
  FILTER NOT EXISTS {?country wdt:P31 wd:Q28171280} .

  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
ORDER BY ?countryLabel
"""

# Ask for JSON so the bindings can be flattened into a DataFrame afterwards.
sparql.setReturnFormat(JSON)
sparql.setQuery(sovereign_states_query)
results = sparql.query().convert()

In [6]:
# Flatten the result bindings into a frame with dotted column names
# ("country.value", ...) and keep only the value column of each variable.
bindings = results["results"]["bindings"]
value_columns = [
    "country.value",
    "countryLabel.value",
    "isoalpha2.value",
    "isoalpha3.value",
]
sparql_df = pd.json_normalize(bindings)
wikidata_countries = sparql_df[value_columns]

In [7]:
# Swap the SPARQL-style column names for the names used in the rest of the
# notebook; iso_alpha2 / iso_alpha3 match the OWID frame's column names.
sparql_to_owid_columns = {
    "country.value": "wikidata_uri",
    "countryLabel.value": "wikidata_label",
    "isoalpha2.value": "iso_alpha2",
    "isoalpha3.value": "iso_alpha3",
}
wikidata_countries = wikidata_countries.rename(columns=sparql_to_owid_columns)

In [8]:
# Visual check of the renamed sovereign-state table returned by Wikidata.
wikidata_countries

Unnamed: 0,wikidata_uri,wikidata_label,iso_alpha2,iso_alpha3
0,http://www.wikidata.org/entity/Q889,Afghanistan,AF,AFG
1,http://www.wikidata.org/entity/Q222,Albania,AL,ALB
2,http://www.wikidata.org/entity/Q262,Algeria,DZ,DZA
3,http://www.wikidata.org/entity/Q228,Andorra,AD,AND
4,http://www.wikidata.org/entity/Q916,Angola,AO,AGO
5,http://www.wikidata.org/entity/Q781,Antigua and Barbuda,AG,ATG
6,http://www.wikidata.org/entity/Q414,Argentina,AR,ARG
7,http://www.wikidata.org/entity/Q399,Armenia,AM,ARM
8,http://www.wikidata.org/entity/Q408,Australia,AU,AUS
9,http://www.wikidata.org/entity/Q40,Austria,AT,AUT


#### Create two sets of ISO Alpha 3 codes

In [9]:
# Distinct ISO alpha 3 codes returned by the sovereign-state query, and how many there are.
wikidata_alpha3 = set(wikidata_countries.iso_alpha3)
len(wikidata_alpha3)

195

In [10]:
# Distinct ISO alpha 3 values in the OWID table. A missing code is kept as a
# set member here; later cells filter it out explicitly.
owid_alpha3 = set(df["iso_alpha3"])
len(owid_alpha3)

251

❔ Any wikidata alpha 3 codes that do not occur in owid?

In [11]:
# Number of Wikidata codes that OWID does not know about.
len(wikidata_alpha3 - owid_alpha3)

0

❔ Which owid alpha 3 codes are not in the official wikidata countries?

In [12]:
# OWID codes (including any missing value) absent from the sovereign-state result.
owid_alpha3_not_in_wikidata = owid_alpha3 - wikidata_alpha3

In [13]:
# OWID rows whose code was not returned by the sovereign-state query; rows
# without any ISO alpha 3 code are excluded explicitly.
df[df["iso_alpha3"].notnull() & df["iso_alpha3"].isin(owid_alpha3_not_in_wikidata)]

Unnamed: 0,id,owid_name,iso_alpha2,iso_alpha3,imf_code,cow_letter,cow_code,unctad_code,marc_code,ncd_code,kansas_code,penn_code,continent
1,791,Aland Islands,AX,ALA,,,,,,,,,4.0
4,571,American Samoa,AS,ASM,859.0,,,,AS,,ASM,ASM,6.0
7,564,Anguilla,AI,AIA,312.0,,,ANL,AM,,AIA,AIA,1.0
8,792,Antarctica,AQ,ATA,,,,,,,,,7.0
12,561,Aruba,AW,ABW,314.0,,,ARU,AW,,ABW,ABW,1.0
27,587,Bermuda,BM,BMU,319.0,,,BER,BM,,BMU,BMU,1.0
30,794,Bonaire Sint Eustatius and Saba,BQ,BES,,,,,,,,,1.0
33,796,Bouvet Island,BV,BVT,,,,,,,,,7.0
35,801,British Indian Ocean Territory,IO,IOT,,,,,,,,,2.0
36,781,British Virgin Islands,VG,VGB,,,,BVI,VB,,VGB,VGB,1.0


### Let's get data on the remaining entities with ISO alpha 3 codes

In [14]:
# Build the space-separated list of double-quoted codes that is interpolated
# into the VALUES clause of the next SPARQL query.
# - pd.notna() drops the missing code whether it is stored as None or NaN
#   (the previous `!= None` test would have let NaN through as "nan").
# - sorted() makes the string, and therefore the query text, identical on every
#   run; plain set iteration order is not stable between kernel restarts.
remaining_codes = " ".join(
    f'"{code}"'
    for code in sorted(c for c in owid_alpha3_not_in_wikidata if pd.notna(c))
)
remaining_codes

'"NFK" "ASM" "GUM" "SXM" "IOT" "VIR" "CCK" "PRI" "TKL" "NIU" "BMU" "BVT" "WLF" "ABW" "CXR" "GIB" "REU" "SGS" "JEY" "MAF" "HMD" "PCN" "IMN" "FLK" "CYM" "TCA" "PSE" "UMI" "SJM" "COK" "ATF" "MNP" "GUF" "SHN" "BLM" "HKG" "BES" "PYF" "GLP" "MYT" "NCL" "GRL" "MAC" "GGY" "MTQ" "MSR" "AIA" "VGB" "ANT" "CUW" "SPM" "ALA" "FRO" "ESH" "ATA"'

In [15]:
# Look up every remaining OWID code directly by its ISO alpha 3 value (P298).
# The VALUES list is bound to ?isoalpha3 itself, so the code that is returned is
# always the code that was asked for. Binding it to a separate variable would
# let an entity with several P298 values produce extra, mismatching rows.
# No instance-of / former-country filter is applied in this query: any entity
# carrying one of the listed codes (plus an alpha 2 code, P297) is returned.
sparql.setQuery(f"""
SELECT DISTINCT ?country ?countryLabel ?isoalpha2 ?isoalpha2Label ?isoalpha3 ?isoalpha3Label
WHERE
{{
  ?country wdt:P298 ?isoalpha3 .
  ?country wdt:P297 ?isoalpha2 .
  VALUES ?isoalpha3 {{ {remaining_codes} }} .

  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
}}
ORDER BY ?countryLabel
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

In [16]:
# Same flattening as for the sovereign-state result: one dotted column per
# binding field, of which only the ".value" columns are kept.
remaining_bindings = results["results"]["bindings"]
remaining_value_columns = [
    "country.value",
    "countryLabel.value",
    "isoalpha2.value",
    "isoalpha3.value",
]
sparql_df = pd.json_normalize(remaining_bindings)
wikidata_remaining = sparql_df[remaining_value_columns]

In [17]:
# Give the second result the same column names as wikidata_countries so the two
# frames can be combined.
remaining_column_names = {
    "country.value": "wikidata_uri",
    "countryLabel.value": "wikidata_label",
    "isoalpha2.value": "iso_alpha2",
    "isoalpha3.value": "iso_alpha3",
}
wikidata_remaining = wikidata_remaining.rename(columns=remaining_column_names)

In [18]:
# Visual check of the entities Wikidata returned for the remaining OWID codes.
wikidata_remaining

Unnamed: 0,wikidata_uri,wikidata_label,iso_alpha2,iso_alpha3
0,http://www.wikidata.org/entity/Q16641,American Samoa,AS,ASM
1,http://www.wikidata.org/entity/Q25228,Anguilla,AI,AIA
2,http://www.wikidata.org/entity/Q21590062,Antarctic Treaty area,AQ,ATA
3,http://www.wikidata.org/entity/Q21203,Aruba,AW,ABW
4,http://www.wikidata.org/entity/Q23635,Bermuda,BM,BMU
5,http://www.wikidata.org/entity/Q23408,Bouvet Island,BV,BVT
6,http://www.wikidata.org/entity/Q43448,British Indian Ocean Territory,IO,IOT
7,http://www.wikidata.org/entity/Q25305,British Virgin Islands,VG,VGB
8,http://www.wikidata.org/entity/Q27561,Caribbean Netherlands,BQ,BES
9,http://www.wikidata.org/entity/Q5785,Cayman Islands,KY,CYM


#### Merge wikidata dataframes

In [19]:
# Both frames have the same four columns, so combining them is a row-wise union,
# not a key join. concat keeps the rows in query order on every pandas version,
# whereas an outer merge on all columns may re-sort them. drop_duplicates keeps
# the "one row per distinct record" result should the two queries ever overlap.
merged_wikidata = pd.concat(
    [wikidata_countries, wikidata_remaining], ignore_index=True
).drop_duplicates(ignore_index=True)

In [20]:
# Visual check of the combined Wikidata table (sovereign states + remaining entities).
merged_wikidata

Unnamed: 0,wikidata_uri,wikidata_label,iso_alpha2,iso_alpha3
0,http://www.wikidata.org/entity/Q889,Afghanistan,AF,AFG
1,http://www.wikidata.org/entity/Q222,Albania,AL,ALB
2,http://www.wikidata.org/entity/Q262,Algeria,DZ,DZA
3,http://www.wikidata.org/entity/Q228,Andorra,AD,AND
4,http://www.wikidata.org/entity/Q916,Angola,AO,AGO
5,http://www.wikidata.org/entity/Q781,Antigua and Barbuda,AG,ATG
6,http://www.wikidata.org/entity/Q414,Argentina,AR,ARG
7,http://www.wikidata.org/entity/Q399,Armenia,AM,ARM
8,http://www.wikidata.org/entity/Q408,Australia,AU,AUS
9,http://www.wikidata.org/entity/Q40,Austria,AT,AUT


### Merge owid and wikidata dataframes

Since owid has more entities than wikidata, we do a left join; the wikidata columns will be None for the owid rows that have no match.

In [21]:
# Left join keeps every OWID row. Both frames carry an iso_alpha2 column, so
# pandas suffixes them as iso_alpha2_x (OWID) and iso_alpha2_y (Wikidata).
# NOTE(review): a code matching more than one Wikidata entity would duplicate
# the OWID row — consider validate="many_to_one" once that is confirmed safe.
merged_df = pd.merge(df, merged_wikidata, on="iso_alpha3", how="left")

In [22]:
# Peek at the first rows of the joined frame, including the suffixed alpha 2 columns.
merged_df.head()

Unnamed: 0,id,owid_name,iso_alpha2_x,iso_alpha3,imf_code,cow_letter,cow_code,unctad_code,marc_code,ncd_code,kansas_code,penn_code,continent,wikidata_uri,wikidata_label,iso_alpha2_y
0,562,Afghanistan,AF,AFG,512.0,AFG,700.0,AFG,AF,AFGN,AFG,AFG,2.0,http://www.wikidata.org/entity/Q889,Afghanistan,AF
1,791,Aland Islands,AX,ALA,,,,,,,,,4.0,http://www.wikidata.org/entity/Q5689,Åland,AX
2,565,Albania,AL,ALB,914.0,ALB,339.0,ALB,AA,ALBN,ALB,ALB,4.0,http://www.wikidata.org/entity/Q222,Albania,AL
3,619,Algeria,DZ,DZA,612.0,ALG,615.0,ALG,AE,ALGR,DZA,DZA,3.0,http://www.wikidata.org/entity/Q262,Algeria,DZ
4,571,American Samoa,AS,ASM,859.0,,,,AS,,ASM,ASM,6.0,http://www.wikidata.org/entity/Q16641,American Samoa,AS


❔ Which entries have different values for owid_name and wikidata_label?

In [23]:
# Rows that did get a Wikidata match but whose OWID name differs from the Wikidata label.
has_wikidata_match = merged_df["wikidata_uri"].notnull()
names_differ = merged_df["owid_name"].ne(merged_df["wikidata_label"])
merged_df[has_wikidata_match & names_differ]

Unnamed: 0,id,owid_name,iso_alpha2_x,iso_alpha3,imf_code,cow_letter,cow_code,unctad_code,marc_code,ncd_code,kansas_code,penn_code,continent,wikidata_uri,wikidata_label,iso_alpha2_y
1,791,Aland Islands,AX,ALA,,,,,,,,,4.0,http://www.wikidata.org/entity/Q5689,Åland,AX
8,792,Antarctica,AQ,ATA,,,,,,,,,7.0,http://www.wikidata.org/entity/Q21590062,Antarctic Treaty area,AQ
18,583,Bahamas,BS,BHS,313.0,BHM,31.0,BHA,BF,,BHS,BHS,1.0,http://www.wikidata.org/entity/Q778,The Bahamas,BS
30,794,Bonaire Sint Eustatius and Saba,BQ,BES,,,,,,,,,1.0,http://www.wikidata.org/entity/Q27561,Caribbean Netherlands,BQ
51,599,China,CN,CHN,924.0,CHN,710.0,CPR,CH,CHNA,CHN,CHN,2.0,http://www.wikidata.org/entity/Q148,People's Republic of China,CN
53,596,Cocos Islands,CC,CCK,,,,,XB,,,CCK,2.0,http://www.wikidata.org/entity/Q36004,Cocos (Keeling) Islands,CC
56,603,Congo,CG,COG,634.0,CON,484.0,PRC,CF,CNGO,,COG,3.0,http://www.wikidata.org/entity/Q971,Republic of the Congo,CG
59,600,Cote d'Ivoire,CI,CIV,662.0,CDI,437.0,IVC,IV,IVCT,CIV,CIV,3.0,http://www.wikidata.org/entity/Q1008,Ivory Coast,CI
62,797,Curacao,CW,CUW,,,,,,,,,1.0,http://www.wikidata.org/entity/Q25279,Curaçao,CW
64,613,Czechia,CZ,CZE,935.0,CZR,316.0,CZE,XR,,CZE,CZE,4.0,http://www.wikidata.org/entity/Q213,Czech Republic,CZ


❔ Are any iso_alpha2 codes different?

In [24]:
# Rows with a Wikidata match where the OWID alpha 2 code (_x) disagrees with
# the Wikidata one (_y).
matched_rows = merged_df["wikidata_uri"].notnull()
alpha2_differs = merged_df["iso_alpha2_x"].ne(merged_df["iso_alpha2_y"])
merged_df[matched_rows & alpha2_differs]

Unnamed: 0,id,owid_name,iso_alpha2_x,iso_alpha3,imf_code,cow_letter,cow_code,unctad_code,marc_code,ncd_code,kansas_code,penn_code,continent,wikidata_uri,wikidata_label,iso_alpha2_y


❔ Which entities are left over that do not have a wikidata id?

In [25]:
# OWID rows for which the left join found no Wikidata entity.
merged_df.loc[merged_df["wikidata_uri"].isna()]

Unnamed: 0,id,owid_name,iso_alpha2_x,iso_alpha3,imf_code,cow_letter,cow_code,unctad_code,marc_code,ncd_code,kansas_code,penn_code,continent,wikidata_uri,wikidata_label,iso_alpha2_y
15,822,Austria-Hungary,,,,AUH,300.0,,,,,,4.0,,,
17,816,Baden,,,,BAD,267.0,,,,,,4.0,,,
22,813,Bavaria,,,,BAV,245.0,,,,,,4.0,,,
45,837,Caribbean Netherlands,,,,,,,,,,,5.0,,,
49,834,Channel Islands,,,,,,,,,,,4.0,,,
65,823,Czechoslovakia,,,,CZE,315.0,,,,,,4.0,,,
71,815,East Germany,,,,GDR,265.0,,,,,,4.0,,,
105,812,Hanover,,,,HAN,240.0,,,,,,4.0,,,
107,819,Hesse Electoral,,,,HSE,273.0,,,,,,4.0,,,
108,820,Hesse Grand Ducal,,,,HSG,275.0,,,,,,4.0,,,


In [26]:
# Row / column count of the joined frame before the clean-up below.
merged_df.shape

(280, 16)

## Clean merged dataframe and save the new version with wikidata ids

In [27]:
# Drop the Wikidata copy of the alpha 2 code; the OWID column is the one kept.
# Rebinding instead of inplace=True leaves the previously displayed frame
# object untouched and follows the pandas recommendation to avoid inplace.
merged_df = merged_df.drop(columns=["iso_alpha2_y"])

In [28]:
# Restore the plain column name for the OWID alpha 2 code after the merge
# suffixed it. Rebinding (no inplace=True) keeps this cell free of hidden
# in-place mutation.
merged_df = merged_df.rename(columns={"iso_alpha2_x": "iso_alpha2"})

In [29]:
# Sanity check: same number of rows as before the clean-up, one column fewer.
merged_df.shape

(280, 15)

In [30]:
# Final look at the cleaned frame before it is written to disk.
merged_df.head()

Unnamed: 0,id,owid_name,iso_alpha2,iso_alpha3,imf_code,cow_letter,cow_code,unctad_code,marc_code,ncd_code,kansas_code,penn_code,continent,wikidata_uri,wikidata_label
0,562,Afghanistan,AF,AFG,512.0,AFG,700.0,AFG,AF,AFGN,AFG,AFG,2.0,http://www.wikidata.org/entity/Q889,Afghanistan
1,791,Aland Islands,AX,ALA,,,,,,,,,4.0,http://www.wikidata.org/entity/Q5689,Åland
2,565,Albania,AL,ALB,914.0,ALB,339.0,ALB,AA,ALBN,ALB,ALB,4.0,http://www.wikidata.org/entity/Q222,Albania
3,619,Algeria,DZ,DZA,612.0,ALG,615.0,ALG,AE,ALGR,DZA,DZA,3.0,http://www.wikidata.org/entity/Q262,Algeria
4,571,American Samoa,AS,ASM,859.0,,,,AS,,ASM,ASM,6.0,http://www.wikidata.org/entity/Q16641,American Samoa


In [31]:
# Persist the OWID table enriched with wikidata_uri / wikidata_label.
output_path = "entities/02-countries-with-wikidata-ids.feather"
merged_df.to_feather(output_path)