In [16]:
import numpy as np
import pandas as pd

## Load Datasets

In [17]:
disasters = pd.read_excel('data/raw/natural-disasters/emdat_public_2022_12_22_full.xlsx', skiprows=6, sheet_name="emdat data")
temperature_countries = pd.read_csv("data/raw/temperature/countries-list.csv", sep=";")
population_by_country = pd.read_excel('data/raw/population/gapminder-population-v7.xlsx', sheet_name="data-for-countries-etc-by-year")
population_by_region = pd.read_excel('data/raw/population/gapminder-population-v7.xlsx', sheet_name="data-for-regions-by-year")

un_country_codes = pd.read_csv("data/raw/country-codes/un-country-codes.csv", sep=";")
cia_country_codes = pd.read_csv("data/raw//country-codes/cia-country-codes.csv", sep="\t")

  warn("Workbook contains no default style, apply openpyxl's default")


Preprocess CIA dataset (since it is a bit messy)

In [18]:
cia_country_codes.head(3)

Unnamed: 0,Entity,GENC,ISO 3166,Stanag,Internet,Comment
0,Afghanistan,AFG,AF | AFG | 004,AFG,.af,-
1,Akrotiri,XQZ,- | - | -,-,-,-
2,Albania,ALB,AL | ALB | 008,ALB,.al,-


In [19]:
# split iso-codes in separate columns
cia_country_codes[["ISO-alpha2","ISO-alpha3","ISO-numeric"]] = cia_country_codes["ISO 3166"].str.split("|",2,expand=True)
cia_country_codes.drop(columns=["ISO 3166"], inplace=True)
# strip whitespaces from iso-codes
cia_country_codes[["ISO-alpha2","ISO-alpha3","ISO-numeric"]] = cia_country_codes[["ISO-alpha2","ISO-alpha3","ISO-numeric"]].apply(lambda x: x.str.strip())
# replace not existing iso-codes with NaN for more clarity
cia_country_codes["ISO-alpha2"].replace("-", np.nan, inplace=True)
cia_country_codes["ISO-alpha3"].replace("-", np.nan, inplace=True)
cia_country_codes["ISO-numeric"].replace("-", np.nan, inplace=True)
# show preprocessed cia data
cia_country_codes.head(3)

  cia_country_codes[["ISO-alpha2","ISO-alpha3","ISO-numeric"]] = cia_country_codes["ISO 3166"].str.split("|",2,expand=True)


Unnamed: 0,Entity,GENC,Stanag,Internet,Comment,ISO-alpha2,ISO-alpha3,ISO-numeric
0,Afghanistan,AFG,AFG,.af,-,AF,AFG,4.0
1,Akrotiri,XQZ,-,-,-,,,
2,Albania,ALB,ALB,.al,-,AL,ALB,8.0


## Add ISO Codes to Temperature dataset

### Remove Aggregated Countries

In [20]:
temperature_countries.shape

(237, 2)

We can see, that e.g. Denmark appears twice. This issue happens multiple times, and is due to the reason that some countries are aggregates of other countries e.g. `Denmark` consists of `Denmark (Europe)` also known as `Denmark Mainland`, and `Greenland`. The bearkley earth website has a worldmap on which the country is highlighted, this helped us to better understand what each of the conflicting countries is.

In [21]:
temperature_countries.iloc[55: 55+5]

Unnamed: 0,Country,Region
55,Cyprus,Asia
56,Czech Republic,Europe
57,Denmark,North America
58,Denmark (Europe),Europe
59,Djibouti,Africa


We decided to remove the "aggregated" country. Here is a list of the aggregate countries we removed, their individual parts still exists in the dataset:
- Denmark (Denmark Mainland, Greenland)
- France (France Mainland, French Guiana, French Polynesia, French Southern and Antarctic Lands)
- Netherlands (Netherlands Mainland, Sint Maarten, Curaçao, Aruba)
- United Kingdom (United Kingdom + Oversea territories such as Montserrat, Bermuda)

In [22]:
temperature_countries_remove = pd.DataFrame({
    "Country": ["Denmark","France", "Netherlands", "United Kingdom"],
    "Region": ["North America", np.nan, "Europe", "Europe"]
})
temperature_countries_cleaned = pd.concat([temperature_countries, temperature_countries_remove]).drop_duplicates(keep=False)
temperature_countries_cleaned.shape

(233, 2)

### Rename Countries & Match ISO Codes

In [23]:
# some countries need to be renamed so that we find the matching country-code later
new_country_names = {
    "Denmark (Europe)": "Denmark",
    "France (Europe)": "France",
    "Netherlands (Europe)": "Netherlands",
    "United Kingdom (Europe)": "United Kingdom",
    "Åland": "Åland Islands",
    "Czech Republic": "Czechia",
    "Turkey": "Türkiye",
    "Svalbard and Jan Mayen": "Svalbard and Jan Mayen Islands",
    "Cape Verde": "Cabo Verde",
    "Turks and Caicas Islands": "Turks and Caicos Islands",
    "Swaziland": "Eswatini",
    "Macedonia": "North Macedonia",
    "Côte d'Ivoire": "Côte d’Ivoire",
    "Federated States of Micronesia": "Micronesia (Federated States of)",
    "South Georgia and the South Sandwich Isla": "South Georgia and the South Sandwich Islands",
    "Bonaire, Saint Eustatius and Saba": "Bonaire, Sint Eustatius and Saba",
    "Congo (Democratic Republic of the)": "Democratic Republic of the Congo",
    "South Korea": "Korea, South",
    "North Korea": "Korea, North",
    "Palestina": "State of Palestine"
}

temperature_countries_cleaned = temperature_countries_cleaned.replace({"Country": new_country_names}, inplace=False)

# left-join cia-country-codes and un-country-codes
temperature_countries_with_iso = temperature_countries_cleaned.merge(cia_country_codes,how="left",left_on='Country', right_on='Entity')[["Country","ISO-alpha3"]]
temperature_countries_with_iso = temperature_countries_with_iso.merge(un_country_codes,how="left",left_on='Country', right_on='Country or Area')[["Country","ISO-alpha3", "ISO-alpha3 Code"]]

# fill missing cia-country codes with un-country-codes
temperature_countries_with_iso["ISO-alpha3"].fillna(temperature_countries_with_iso["ISO-alpha3 Code"], inplace=True)
temperature_countries_with_iso.drop(columns=["ISO-alpha3 Code"], inplace=True)

# show countries for which we could not find an ISO code
temperature_countries_with_iso[temperature_countries_with_iso["ISO-alpha3"].isna()]

Unnamed: 0,Country,ISO-alpha3
18,Baker Island,
113,Kingman Reef,
161,Palmyra Atoll,


These 3 countries/areas do not have any country codes in general, and are quite small, so we just ignore them later on.

In [24]:
# Todo: When combining adjust to use variable without saving temperature_countries_with_iso as csv file
processed_countries_list_path = 'data/processed/temperature/temp-countries-list.csv'
temperature_countries_with_iso.to_csv(processed_countries_list_path, index=False)

OSError: Cannot save file into a non-existent directory: 'data/processed/temperature'

## Which countries are in which datasets?

### Disaster vs. Temperature Dataset


In [26]:
berkely_iso_codes = set(temperature_countries_with_iso["ISO-alpha3"].dropna().tolist())
emdat_iso_codes = set(disasters["ISO"].unique().tolist())

emdat_and_bekely = emdat_iso_codes.intersection(berkely_iso_codes)
emdat_without_berkely = emdat_iso_codes-emdat_and_bekely
berkely_without_emdat = berkely_iso_codes-emdat_and_bekely

print(f"countries in emdat & berkely: {len(emdat_and_bekely)}")
print(f"countries in emdat but not berkely ({len(emdat_without_berkely)}):")
print(sorted(emdat_without_berkely))
print(f"countries in berkely but not emdat ({len(berkely_without_emdat)}):")
print(sorted(berkely_without_emdat))

countries in emdat & berkely: 209
countries in emdat but not berkely (22):
['ANT', 'AZO', 'BMU', 'BRN', 'COK', 'CSK', 'DDR', 'DFR', 'MDV', 'MHL', 'SCG', 'SHN', 'SPI', 'SSD', 'SUN', 'TKL', 'TUV', 'VUT', 'WLF', 'YMD', 'YMN', 'YUG']
countries in berkely but not emdat (20):
['ABW', 'ALA', 'AND', 'ATA', 'ATF', 'BES', 'CXR', 'ESH', 'FLK', 'FRO', 'GGY', 'GRL', 'HMD', 'JEY', 'LIE', 'MCO', 'SGS', 'SJM', 'SMR', 'SPM']


The Countries for which we have disaster data, but no temperature data are as follows:
- Existing Countries (usually very small countries/islands):
    - AZO Azores Islands
    - BMU Bermuda
    - BRN Brunei Darussalam
    - COK Cook Islands (the)
    - MDV Maldives
    - MHL Marshall Islands (the)
    - SHN Saint Helena, Ascension and Tristan da Cunha
    - SSD South Sudan
    - TKL Tokelau
    - TUV Tuvalu
    - VUT Vanuatu
    - WLF Wallis and Futuna
- Existing Countries (but invalid country code):
    - SPI Canary Islands
- Former Countries:
    - ANT Netherlands Antilles
    - CSK Czechoslovakia
    - DDR Germany Dem Rep
    - DFR Germany Fed Rep
    - SCG Serbia Montenegro
    - SUN Soviet Union
    - YMD Yemen P Dem Rep
    - YMN Yemen Arab Rep
    - YUG Yugoslavia

### Disaster vs Population Dataset

In [27]:
gapminder_iso_codes = set(population_by_country["geo"].str.upper().unique())

In [28]:
emdat_and_gapminder = emdat_iso_codes.intersection(gapminder_iso_codes)
emdat_without_gapminder = emdat_iso_codes-emdat_and_gapminder
gapminder_without_emdat = gapminder_iso_codes-emdat_and_gapminder

print(f"countries in emdat & gapminder: {len(emdat_and_gapminder)}")
print(f"countries in emdat but not gapminder ({len(emdat_without_gapminder)}):")
print(sorted(emdat_without_gapminder))
print(f"countries in gapminder but not emdat ({len(gapminder_without_emdat)}):")
print(sorted(gapminder_without_emdat))

countries in emdat & gapminder: 191
countries in emdat but not gapminder (40):
['AIA', 'ANT', 'ASM', 'AZO', 'BLM', 'BMU', 'COK', 'CSK', 'CUW', 'CYM', 'DDR', 'DFR', 'GLP', 'GUF', 'GUM', 'IMN', 'MAC', 'MAF', 'MNP', 'MSR', 'MTQ', 'MYT', 'NCL', 'NIU', 'PRI', 'PYF', 'REU', 'SCG', 'SHN', 'SPI', 'SUN', 'SXM', 'TCA', 'TKL', 'VGB', 'VIR', 'WLF', 'YMD', 'YMN', 'YUG']
countries in gapminder but not emdat (6):
['AND', 'HOS', 'LIE', 'MCO', 'NRU', 'SMR']


The Countries for which we have disaster data, but no population data are as follows:
- Existing Countries (independent)
    - COK Cook Islands (the)
    - NIU Niue
- Existing Countries (dependent e.g .oversea territories)
    - AIA Anguilla
    - ASM American Samoa
    - AZO Azores Islands
    - BLM Saint Barthélemy
    - BMU Bermuda
    - CUW Curaçao
    - CYM Cayman Islands (the)
    - GLP Guadeloupe
    - GUF French Guiana
    - GUM Guam
    - IMN Isle of Man
    - MAC Macao
    - MAF Saint Martin (French Part)
    - MNP Northern Mariana Islands (the)
    - MSR Montserrat
    - MTQ Martinique
    - MYT Mayotte
    - NCL New Caledonia
    - PRI Puerto Rico
    - PYF French Polynesia
    - REU Réunion
    - SHN Saint Helena, Ascension and Tristan da Cunha
    - SPI Canary Islands
    - SXM Sint Maarten (Dutch part)
    - TCA Turks and Caicos Islands (the)
    - TKL Tokelau
    - VGB Virgin Island (British)
    - VIR Virgin Island (U.S.)
    - WLF Wallis and Futuna
- Former Countries
    - ANT Netherlands Antilles
    - CSK Czechoslovakia
    - DDR Germany Dem Rep
    - DFR Germany Fed Rep
    - SCG Serbia Montenegro
    - SUN Soviet Union
    - YMD Yemen P Dem Rep
    - YMN Yemen Arab Rep
    - YUG Yugoslavia

# Which regions are in which datasets?

### Overview

In [29]:
disasters["Continent"].unique().tolist()

['Africa', 'Asia', 'Europe', 'Americas', 'Oceania']

In [30]:
population_by_region["geo"].unique().tolist()

['africa', 'asia', 'europe', 'americas']

In [31]:
temperature_countries["Region"].dropna().unique().tolist()

['Asia', 'Europe', 'Africa', 'South America', 'Oceania', 'North America']

=> maybe it's better to compute the region data by aggregating the countries, according to UN Regions

### Determine Regions manually

In [32]:
un_iso_codes = set(un_country_codes["ISO-alpha3 Code"].tolist())

emdat_without_un = emdat_iso_codes-un_iso_codes
gapminder_without_un = gapminder_iso_codes-un_iso_codes
berkely_without_un = berkely_iso_codes-un_iso_codes

print(f"countries in emdat, but not un ({len(emdat_without_un)}): \n{emdat_without_un}")
print(f"countries in gapminder, but not un ({len(gapminder_without_un)}): \n{gapminder_without_un}")
print(f"countries in berkely, but not un ({len(berkely_without_un)}): \n{berkely_without_un}")

countries in emdat, but not un (12): 
{'SCG', 'YMN', 'YMD', 'DDR', 'SPI', 'SUN', 'ANT', 'DFR', 'YUG', 'TWN', 'AZO', 'CSK'}
countries in gapminder, but not un (2): 
{'TWN', 'HOS'}
countries in berkely, but not un (1): 
{'TWN'}
