In [60]:
import pandas as pd
import numpy as np

## Load and format the temperature anomaly data

In [61]:
# Directory prefix for the raw temperature files; the per-country and global file paths are built from it.
path_raw_temp_data = 'data/raw/temperature/'
# Directory prefix for the processed temperature files: the country list is read from it
# and the resulting CSV files are written to it.
path_processed_temp_data = 'data/processed/temperature/'

In [62]:
# Sub-directory (appended to path_raw_temp_data) that holds one '<name>.txt' file per country.
path_countries_land_files = 'countries-land/'
# File name (appended to path_processed_temp_data) of the country list with the columns 'Country' and 'ISO-alpha3'.
path_countries_list_file = 'temp-countries-list.csv'

In [63]:
# Todo: stop reading temp_countries from the processed data once the jupyter notebooks are combined
# Country list with one row per country: its name ('Country') and its ISO-alpha3 code.
temp_countries: pd.DataFrame = pd.read_csv(f"{path_processed_temp_data}{path_countries_list_file}")
print(temp_countries)

            Country ISO-alpha3
0       Afghanistan        AFG
1     Åland Islands        ALA
2           Albania        ALB
3           Algeria        DZA
4    American Samoa        ASM
..              ...        ...
228  Virgin Islands        VIR
229  Western Sahara        ESH
230           Yemen        YEM
231          Zambia        ZMB
232        Zimbabwe        ZWE

[233 rows x 2 columns]


### Load temp-land-country data

In [64]:
# Column layout of the per-country frames: two identifier columns followed by the
# twelve data columns read from each raw file.
column_names = [
    "country_code", "country_name",
    "year", "month",
    "monthly_anomaly", "monthly_unc",
    "annual_anomaly", "annual_unc",
    "five_year_anomaly", "five_year_unc",
    "ten_year_anomaly", "ten_year_unc",
    "twenty_year_anomaly", "twenty_year_unc",
]

# Start from an empty frame that already carries the target columns.
temp_land_country = pd.DataFrame(columns=column_names)

# Names of all countries to load, as spelled in the country list.
country_names = temp_countries['Country']

# Todo when combining use the dictionary in the data-integration notebook
# Rename table: key = name used by the raw data (it doubles as the raw file name),
# value = name used in the country list. Keys must stay exactly as the raw files are named.
new_country_names = {
    "Denmark (Europe)": "Denmark",
    "France (Europe)": "France",
    "Netherlands (Europe)": "Netherlands",
    "United Kingdom (Europe)": "United Kingdom",
    "Åland": "Åland Islands",
    "Czech Republic": "Czechia",
    "Turkey": "Türkiye",
    "Svalbard and Jan Mayen": "Svalbard and Jan Mayen Islands",
    "Cape Verde": "Cabo Verde",
    "Turks and Caicas Islands": "Turks and Caicos Islands",
    "Swaziland": "Eswatini",
    "Macedonia": "North Macedonia",
    "Côte d'Ivoire": "Côte d’Ivoire",
    "Federated States of Micronesia": "Micronesia (Federated States of)",
    "South Georgia and the South Sandwich Isla": "South Georgia and the South Sandwich Islands",
    "Bonaire, Saint Eustatius and Saba": "Bonaire, Sint Eustatius and Saba",
    "Congo (Democratic Republic of the)": "Democratic Republic of the Congo",
    "South Korea": "Korea, South",
    "North Korea": "Korea, North",
    "Palestina": "State of Palestine"
}

def swap_keys_values(d):
    """Return a new dict that maps every value of ``d`` back to its key.

    When several keys share the same value, the key that comes last in ``d`` wins.
    """
    inverted = {}
    for key, value in d.items():
        inverted[value] = key
    return inverted

# Invert the rename table: country-list name -> name of the raw file for that country.
map_country_to_filename = swap_keys_values(new_country_names)

# Read every country file into its own frame and concatenate once at the end.
# Growing a frame with pd.concat inside the loop re-copies all previously loaded rows
# on every iteration and depends on how pandas treats the initially empty frame.
country_frames = []
for iso_name, country_name in zip(temp_countries['ISO-alpha3'], country_names):
    # Countries without an entry in the rename table use their own name as file name.
    country_file_name = map_country_to_filename.get(country_name, country_name)
    path = path_raw_temp_data + path_countries_land_files + country_file_name + '.txt'
    # Lines starting with '%' are comments; the data is whitespace separated and has no header row.
    # sep=r"\s+" is the replacement for delim_whitespace=True, which is deprecated since pandas 2.2.
    one_country = pd.read_csv(path, comment="%", header=None, sep=r"\s+")

    # Prepend the two identifier columns, then label all columns.
    one_country.insert(0, 'country_code', iso_name)
    one_country.insert(1, 'country_name', country_name)

    one_country.columns = column_names

    country_frames.append(one_country)

# Each country keeps its own 0..n-1 row index, as before.
temp_land_country = pd.concat(country_frames)

# exclude antarctica
temp_land_country = temp_land_country[temp_land_country.country_name != 'Antarctica']

temp_land_country.head()

Unnamed: 0,country_code,country_name,year,month,monthly_anomaly,monthly_unc,annual_anomaly,annual_unc,five_year_anomaly,five_year_unc,ten_year_anomaly,ten_year_unc,twenty_year_anomaly,twenty_year_unc
0,AFG,Afghanistan,1848,5,-0.297,2.037,,,,,,,,
1,AFG,Afghanistan,1848,6,-0.796,2.136,,,,,,,,
2,AFG,Afghanistan,1848,7,-0.113,1.937,-0.777,0.639,,,,,,
3,AFG,Afghanistan,1848,8,-0.462,1.937,-0.743,0.644,,,,,,
4,AFG,Afghanistan,1848,9,-1.272,1.865,-0.676,0.669,,,,,,


### Create temp-land-region data from temp-land-country data

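The region dataset is created from the temp-land-country dataset: for every region, year and month, the monthly anomalies of all countries in that region are averaged.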

In [118]:
def calc_region_anomaly(temp_land_country_regions, start_year=1900, end_year=2022):
    """Average the monthly country anomalies per region, year and month.

    Parameters
    ----------
    temp_land_country_regions : pd.DataFrame
        Country-level data with at least the columns 'region_code', 'region_name',
        'year', 'month' and 'monthly_anomaly'.
    start_year, end_year : int, optional
        First and last year (both inclusive) to generate rows for.
        The defaults reproduce the previously hard-coded range 1900-2022.

    Returns
    -------
    pd.DataFrame
        One row per region, year and month (months 1-12) with the columns
        'region_code', 'region_name', 'year', 'month' and 'monthly_anomaly'.
        Regions appear in order of first occurrence in the input. The anomaly is the
        unweighted mean over all rows of that region/year/month (missing values are
        skipped) and NaN when there is no data.

    Raises
    ------
    ValueError
        If a region name is associated with more than one region code.
    """
    key_columns = ['region_name', 'year', 'month']

    # Rows without a region name (e.g. countries that found no match in a left merge)
    # cannot be assigned to a region and are skipped instead of crashing the lookup.
    regions = (
        temp_land_country_regions[['region_code', 'region_name']]
        .drop_duplicates()
        .dropna(subset=['region_name'])
    )
    duplicated_names = regions.loc[regions['region_name'].duplicated(), 'region_name']
    if not duplicated_names.empty:
        raise ValueError(
            f"region_name maps to more than one region_code: {duplicated_names.unique().tolist()}"
        )

    # One grouped pass over the data instead of filtering the frame for every
    # region/year/month combination. Keys of the dict are (region_name, year, month).
    mean_by_key = (
        temp_land_country_regions
        .groupby(key_columns)['monthly_anomaly']
        .mean()
        .to_dict()
    )

    # Emit the full region x year x month grid so that combinations without data
    # still get a row (with a NaN anomaly).
    rows = [
        {
            'region_code': region_code,
            'region_name': region_name,
            'year': year,
            'month': month,
            'monthly_anomaly': mean_by_key.get((region_name, year, month), float('nan')),
        }
        for region_name, region_code in zip(regions['region_name'], regions['region_code'])
        for year in range(start_year, end_year + 1)
        for month in range(1, 13)
    ]
    return pd.DataFrame(
        rows,
        columns=['region_code', 'region_name', 'year', 'month', 'monthly_anomaly'],
    )

In [120]:
# Load the UN country-code table and keep only the columns needed for the region mapping
path_un_country_codes = "data/raw/country-codes/un-country-codes.csv"
un_country_codes_drop_labels = [
    'Global Code', 'Global Name',
    'Sub-region Code', 'Sub-region Name',
    'Intermediate Region Code', 'Intermediate Region Name',
    'Country or Area', 'M49 Code', 'ISO-alpha2 Code',
    'Least Developed Countries (LDC)',
    'Land Locked Developing Countries (LLDC)',
    'Small Island Developing States (SIDS)',
]
un_country_codes = pd.read_csv(path_un_country_codes, sep=";").drop(columns=un_country_codes_drop_labels)
# The three remaining columns are relabelled by position.
# NOTE(review): assumes they are region code, region name and ISO-alpha3 code, in that order - confirm against the file.
un_country_codes.columns = ['region_code', 'region_name', 'country_code']

# Attach the region of every country to the temperature rows (countries without a match keep NaN)
temp_land_country_regions = temp_land_country.merge(un_country_codes, on='country_code', how='left')
# Taiwan gets its region assigned by hand.
taiwan_rows = temp_land_country_regions['country_name'] == 'Taiwan'
temp_land_country_regions.loc[taiwan_rows, 'region_name'] = 'Asia'
temp_land_country_regions.loc[taiwan_rows, 'region_code'] = 142

# Aggregate the country anomalies to one monthly anomaly per region
temp_land_region = calc_region_anomaly(temp_land_country_regions)

# Todo: compare to the original regional data

Unnamed: 0,region_code,region_name,year,month,monthly_anomaly
0,142.0,Asia,1900,1,-1.45838
1,142.0,Asia,1900,2,-0.69962
2,142.0,Asia,1900,3,0.09764
3,142.0,Asia,1900,4,0.0544
4,142.0,Asia,1900,5,0.00462
5,142.0,Asia,1900,6,-0.21686
6,142.0,Asia,1900,7,-0.1234
7,142.0,Asia,1900,8,-0.13176
8,142.0,Asia,1900,9,-0.32722
9,142.0,Asia,1900,10,0.2368


### Load temp-land-ocean-global data

Two versions of the dataset exist; they differ in how temperature anomalies at locations with sea ice are treated:
1. Anomalies are extrapolated from land-surface air temperature anomalies.
2. Anomalies are extrapolated from sea-surface water temperature anomalies (usually collected from open water areas on the periphery of the sea ice).

We use the air temperature version, following Berkeley Earth's remark:
"We believe that the use of air temperatures above sea ice provides a more natural means of describing changes in Earth's surface temperature."

In [None]:
# File name (appended to path_raw_temp_data) of the raw global land/ocean temperature file.
path_global_land_ocean_file = 'global-land-ocean.txt'

In [None]:
# Only include the dataset where anomalies are extrapolated from land-surface air temperature anomalies.
# Lines starting with '%' are comments; the data is whitespace separated and has no header row.
# sep=r"\s+" is the replacement for delim_whitespace=True, which is deprecated since pandas 2.2.
# NOTE(review): skipfooter=2079 is a hard-coded number of trailing lines, presumably the second
# (water temperature) dataset in the same file - re-check it whenever the raw file is updated.
# skipfooter is only supported by the python parsing engine.
temp_land_ocean_global = pd.read_csv(
    path_raw_temp_data + path_global_land_ocean_file,
    comment="%",
    header=None,
    sep=r"\s+",
    engine='python',
    skipfooter=2079,
)
temp_land_ocean_global.columns = [
    "year", "month",
    "monthly_anomaly", "monthly_unc",
    "annual_anomaly", "annual_unc",
    "five_year_anomaly", "five_year_unc",
    "ten_year_anomaly", "ten_year_unc",
    "twenty_year_anomaly", "twenty_year_unc",
]

### Format the data

In [None]:
def cut_before_1900(temp_data):
    """Return the rows of ``temp_data`` whose 'year' value is 1900 or later.

    The input frame is not modified; the row index of the kept rows is preserved.
    """
    from_1900_on = temp_data['year'].ge(1900)
    return temp_data.loc[from_1900_on]

In [None]:
# Todo: add a function that keeps only the monthly anomalies

In [None]:
# Restrict the country and the global series to the years from 1900 on.
temp_land_country = cut_before_1900(temp_land_country)
# The regional frame is left as is; as computed above it only contains rows from 1900 on,
# so the cut would presumably be a no-op.
#temp_land_region = cut_before_1900(temp_land_region)
temp_land_ocean_global = cut_before_1900(temp_land_ocean_global)

### Save the data as csv files

In [None]:
# Write the processed frames to the processed-data directory, without the row index.
temp_land_country.to_csv(path_processed_temp_data+'temp-land-country.csv',index=False)
# NOTE(review): the regional frame is currently not exported - presumably until it has been
# compared with the original regional data; confirm before relying on a temp-land-region.csv file.
#temp_land_region.to_csv(path_processed_temp_data+'temp-land-region.csv',index=False)
temp_land_ocean_global.to_csv(path_processed_temp_data+'temp-land-ocean-global.csv',index=False)