In [115]:
import pandas as pd

## Load and format the temperature anomaly data

In [116]:
# Folder the raw temperature .txt files are read from.
PATH_RAW_TEMP_DATA_FOLDER = 'data/raw/temperature/'
# Folder for processed temperature data: the countries list is read from here
# and the resulting csv files are written here.
PATH_PROCESSED_TEMP_DATA_FOLDER = 'data/processed/temperature/'
# Sub-folder of the raw folder that holds one .txt file per country.
PATH_COUNTRIES_LAND_FOLDER = 'countries-land/'
# Sub-folder of the raw folder that holds one .txt file per region.
PATH_REGIONS_LAND_FOLDER = 'regions-land/'

In [117]:
# File name of the countries list; it is read from the processed temperature folder.
PATH_COUNTRIES_LIST_FILE = 'temp-countries-list.csv'
# Semicolon-separated csv that provides the UN region code, region name and ISO-alpha3 code per country.
PATH_UN_COUNTRY_CODES_FILE = "data/raw/country-codes/un-country-codes.csv"

In [118]:
# Todo: remove loading temp_countries from processed data when combining jupyter notebooks
# The countries list is used below through its 'Country' and 'ISO-alpha3' columns.
path_temp_countries_list = PATH_PROCESSED_TEMP_DATA_FOLDER + PATH_COUNTRIES_LIST_FILE
temp_countries: pd.DataFrame = pd.read_csv(path_temp_countries_list, sep=',')
print(temp_countries)

            Country ISO-alpha3
0       Afghanistan        AFG
1     Åland Islands        ALA
2           Albania        ALB
3           Algeria        DZA
4    American Samoa        ASM
..              ...        ...
228  Virgin Islands        VIR
229  Western Sahara        ESH
230           Yemen        YEM
231          Zambia        ZMB
232        Zimbabwe        ZWE

[233 rows x 2 columns]


### Load temp-land-country data

In [119]:
# Column layout of the combined country table: the two identifier columns that
# are inserted per country file, followed by the columns read from the raw file.
temp_land_country_column_names = [
    "country_code",
    "country_name",
    "year",
    "month",
    "monthly_anomaly",
    "monthly_unc",
    "annual_anomaly",
    "annual_unc",
    "five_year_anomaly",
    "five_year_unc",
    "ten_year_anomaly",
    "ten_year_unc",
    "twenty_year_anomaly",
    "twenty_year_unc",
]

# Empty table with the final column layout.
temp_land_country = pd.DataFrame(columns=temp_land_country_column_names)

# Names of all countries for which a raw temperature file is loaded.
temp_country_names = temp_countries['Country']

# Todo when combining use the dictionary in the data-integration notebook
# Maps the country name used for the raw file (key) to the name used in the
# countries list (value).
new_country_names = {
    "Denmark (Europe)": "Denmark",
    "France (Europe)": "France",
    "Netherlands (Europe)": "Netherlands",
    "United Kingdom (Europe)": "United Kingdom",
    "Åland": "Åland Islands",
    "Czech Republic": "Czechia",
    "Turkey": "Türkiye",
    "Svalbard and Jan Mayen": "Svalbard and Jan Mayen Islands",
    "Cape Verde": "Cabo Verde",
    "Turks and Caicas Islands": "Turks and Caicos Islands",
    "Swaziland": "Eswatini",
    "Macedonia": "North Macedonia",
    "Côte d'Ivoire": "Côte d’Ivoire",
    "Federated States of Micronesia": "Micronesia (Federated States of)",
    "South Georgia and the South Sandwich Isla": "South Georgia and the South Sandwich Islands",
    "Bonaire, Saint Eustatius and Saba": "Bonaire, Sint Eustatius and Saba",
    "Congo (Democratic Republic of the)": "Democratic Republic of the Congo",
    "South Korea": "Korea, South",
    "North Korea": "Korea, North",
    "Palestina": "State of Palestine"
}

def swap_keys_values(d):
    """Return a new dict that maps every value of ``d`` back to its key.

    If several keys share the same value, the key that comes last in ``d``
    wins. ``d`` itself is not modified.
    """
    return dict(zip(d.values(), d.keys()))

# Reverse lookup: country name as used in the countries list -> name used for the raw file.
map_country_to_filename = swap_keys_values(new_country_names)

# Read every country file into its own frame and concatenate once at the end.
# Growing a frame with pd.concat inside the loop copies all previous rows on
# every iteration, and starting from an empty frame relies on deprecated
# dtype handling in pd.concat.
temp_land_country_frames = []
for temp_country_name, country_iso_name in zip(temp_countries['Country'], temp_countries['ISO-alpha3']):
    # Countries without an entry in the mapping use their own name as file name.
    temp_country_file_name = map_country_to_filename.get(temp_country_name, temp_country_name)
    path_temp_country_txt_file = PATH_RAW_TEMP_DATA_FOLDER + PATH_COUNTRIES_LAND_FOLDER + temp_country_file_name + '.txt'
    # Lines starting with "%" are comments; the data columns are whitespace separated.
    # sep=r'\s+' replaces the deprecated delim_whitespace=True.
    temp_land_one_country = pd.read_csv(path_temp_country_txt_file, comment="%", header=None, sep=r'\s+')

    # Prepend the identifier columns, then name all columns at once.
    temp_land_one_country.insert(0, 'country_code', country_iso_name)
    temp_land_one_country.insert(1, 'country_name', temp_country_name)
    temp_land_one_country.columns = temp_land_country_column_names

    temp_land_country_frames.append(temp_land_one_country)

if temp_land_country_frames:
    temp_land_country = pd.concat(temp_land_country_frames)
else:
    # pd.concat raises on an empty list; keep an empty table with the expected columns instead.
    temp_land_country = pd.DataFrame(columns=temp_land_country_column_names)

temp_land_country.head()

Unnamed: 0,country_code,country_name,year,month,monthly_anomaly,monthly_unc,annual_anomaly,annual_unc,five_year_anomaly,five_year_unc,ten_year_anomaly,ten_year_unc,twenty_year_anomaly,twenty_year_unc
0,AFG,Afghanistan,1848,5,-0.297,2.037,,,,,,,,
1,AFG,Afghanistan,1848,6,-0.796,2.136,,,,,,,,
2,AFG,Afghanistan,1848,7,-0.113,1.937,-0.777,0.639,,,,,,
3,AFG,Afghanistan,1848,8,-0.462,1.937,-0.743,0.644,,,,,,
4,AFG,Afghanistan,1848,9,-1.272,1.865,-0.676,0.669,,,,,,


### Create temp-land-region data from temp-land-country data

The region datasets are created from the temp-land-country dataset.

In [120]:
def calc_region_anomaly(temp_land_country_regions):
    """Average the monthly anomalies of all rows that share a region and month.

    Rows are grouped by region_code, region_name, year and month. The
    unweighted mean of monthly_anomaly per group is returned in a column
    named temperature_anomaly, next to the four grouping columns.
    """
    group_keys = ['region_code', 'region_name', 'year', 'month']
    region_means = temp_land_country_regions.groupby(group_keys, as_index=False)['monthly_anomaly'].mean()
    return region_means.rename(columns={'monthly_anomaly': 'temperature_anomaly'})

In [121]:
# Todo: only load UN country codes once after combining notebook
# load the UN country codes and keep only the region and ISO-alpha3 columns
un_country_codes = pd.read_csv(PATH_UN_COUNTRY_CODES_FILE, sep=";")
un_country_codes = un_country_codes[['Region Code', 'Region Name', 'ISO-alpha3 Code']].rename(
    columns={
        'Region Code': 'region_code',
        'Region Name': 'region_name',
        'ISO-alpha3 Code': 'country_code',
    }
)

# Join UN country codes to temp data
# exclude temp data for antarctica when calculating regional temp data
is_not_antarctica = temp_land_country['country_name'] != 'Antarctica'
temp_land_country_no_antarctica = temp_land_country[is_not_antarctica]
temp_land_country_regions = temp_land_country_no_antarctica.merge(un_country_codes, on='country_code', how='left')

# Taiwan is assigned to Asia (region code 142) by hand
# NOTE(review): presumably Taiwan has no match in the UN country codes - verify
is_taiwan = temp_land_country_regions['country_name'] == 'Taiwan'
temp_land_country_regions.loc[is_taiwan, 'region_name'] = 'Asia'
temp_land_country_regions.loc[is_taiwan, 'region_code'] = 142

# calculate anomaly data for regions
temp_land_region = calc_region_anomaly(temp_land_country_regions)

Load Berkeley's regional data

In [122]:
# Column layout of the regional table: the region name that is inserted per
# file, followed by the columns read from the raw region file.
# A dedicated name is used so that temp_land_country_column_names (the
# 14-column country layout) is not overwritten; otherwise re-running the
# country loading cell after this one fails with a column length mismatch.
temp_berkleys_region_column_names = [
    "region_name",
    "year",
    "month",
    "monthly_anomaly",
    "monthly_unc",
    "annual_anomaly",
    "annual_unc",
    "five_year_anomaly",
    "five_year_unc",
    "ten_year_anomaly",
    "ten_year_unc",
    "twenty_year_anomaly",
    "twenty_year_unc",
]
temp_berkley_regions = ['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']

# Todo: avoid for loop, change to aggregator function
# Collect one frame per region and concatenate once, instead of growing a
# frame with pd.concat inside the loop starting from an empty frame.
temp_berkleys_region_frames = []
for temp_berkley_region in temp_berkley_regions:
    # The region name doubles as the file name.
    path = PATH_RAW_TEMP_DATA_FOLDER + PATH_REGIONS_LAND_FOLDER + temp_berkley_region + '.txt'
    # Lines starting with "%" are comments; the data columns are whitespace separated.
    # sep=r'\s+' replaces the deprecated delim_whitespace=True.
    one_region = pd.read_csv(path, comment="%", header=None, sep=r'\s+')

    one_region.insert(0, 'region_name', temp_berkley_region)
    one_region.columns = temp_berkleys_region_column_names

    temp_berkleys_region_frames.append(one_region)

temp_berkleys_land_region = pd.concat(temp_berkleys_region_frames)

### Load temp-land-ocean-global data

Two versions exist that treat temperature anomalies at locations with sea ice:
1. Anomalies are extrapolated from land-surface air temperature anomalies.
2. Anomalies are extrapolated from sea-surface water temperature anomalies (usually collected from open water areas on the periphery of the sea ice).

We choose to use the air temperature version based on Berkeley's remark:
"We believe that the use of air temperatures above sea ice provides a more natural means of describing changes in Earth's surface temperature."

In [123]:
# File name of the global land+ocean data set; it is read from the raw temperature folder.
path_global_land_ocean_file = 'global-land-ocean.txt'

In [124]:
# only include the dataset where anomalies are extrapolated from land-surface air temperature anomalies.
temp_land_ocean_global = pd.read_csv(PATH_RAW_TEMP_DATA_FOLDER + path_global_land_ocean_file, comment="%", header=None, delim_whitespace=True, engine='python', skipfooter=2079)
temp_land_ocean_global.columns = ["year", "month", "monthly_anomaly", "monthly_unc", "annual_anomaly", "annual_unc", "five_year_anomaly", "five_year_unc", "ten_year_anomaly", "ten_year_unc", "twenty_year_anomaly", "twenty_year_unc"]

### Format the data

We only keep entries from the year 1900 onwards

In [125]:
def cut_before_1900(temp_data):
    """Return the rows of ``temp_data`` whose 'year' is 1900 or later.

    The year 1900 itself is kept. The input frame is not modified and the
    original index labels of the remaining rows are preserved.
    """
    is_1900_or_later = temp_data['year'] >= 1900
    return temp_data.loc[is_1900_or_later]

We only keep monthly anomaly data and rename the column to temperature_anomaly

In [126]:
def keep_monthly_anomalies(temp_data):
    """Drop all uncertainty and multi-year columns and rename the monthly anomaly.

    Removes monthly_unc and the annual, five, ten and twenty year anomaly and
    uncertainty columns, then renames monthly_anomaly to temperature_anomaly.
    All other columns are kept. A new frame is returned; the input is not
    modified.
    """
    columns_to_drop = [
        "monthly_unc",
        "annual_anomaly",
        "annual_unc",
        "five_year_anomaly",
        "five_year_unc",
        "ten_year_anomaly",
        "ten_year_unc",
        "twenty_year_anomaly",
        "twenty_year_unc",
    ]
    monthly_only = temp_data.drop(columns=columns_to_drop)
    return monthly_only.rename(columns={'monthly_anomaly': 'temperature_anomaly'})

#### Format temp-land-country

In [127]:
# Drop all rows before 1900 from the country table.
# NOTE(review): unlike the regional and global tables, keep_monthly_anomalies is
# not applied here, so all uncertainty and multi-year columns stay in the
# country table - confirm this is intended.
temp_land_country = cut_before_1900(temp_land_country)

#### Format temp-land-region and Berkeley's regional data

In [128]:
# Our regional table already only holds the temperature_anomaly column, so it is just trimmed.
temp_land_region = cut_before_1900(temp_land_region)
# Berkeley's regional table is trimmed and then reduced to the monthly anomaly column.
temp_berkleys_land_region = keep_monthly_anomalies(cut_before_1900(temp_berkleys_land_region))

# Todo: format berkleys North & South American data to UN America data

#### Format temp-land-ocean-global

In [129]:
# Trim the global table to 1900 and later, then reduce it to the monthly anomaly column.
temp_land_ocean_global = keep_monthly_anomalies(cut_before_1900(temp_land_ocean_global))

#### Compare Berkeley's and our regional data

In [130]:
def temp_calc_region_corr(temp_single_region):
    """Print the correlation between our and Berkeley's anomalies for one region.

    Reads the module-level frames temp_land_region and
    temp_berkleys_land_region, selects the rows of ``temp_single_region`` from
    both, joins our rows onto Berkeley's rows by (year, month) and prints the
    correlation of the two temperature_anomaly columns. Nothing is returned.
    """
    index_columns = ['year', 'month']

    own_rows = temp_land_region[temp_land_region['region_name'] == temp_single_region]
    berkleys_rows = temp_berkleys_land_region[temp_berkleys_land_region['region_name'] == temp_single_region]

    # Columns that exist in both tables get the suffixes _berkleys / _self.
    joined = berkleys_rows.set_index(index_columns).join(
        own_rows.set_index(index_columns),
        on=index_columns,
        lsuffix='_berkleys',
        rsuffix='_self',
    )

    region_corr = joined['temperature_anomaly_berkleys'].corr(joined['temperature_anomaly_self'])

    print('Correlation for region ' + temp_single_region + ' ' + str(region_corr))

Correlation of temperature anomalies between our and Berkeley's regional temperature data sets

In [131]:
# Print the correlation for each compared region; America is still missing (see Todo).
for temp_region_to_compare in ['Africa', 'Asia', 'Europe', 'Oceania']:
    temp_calc_region_corr(temp_region_to_compare)

# Todo: also compare Berkleys america data

Correlation for region Africa 0.9706252186825862
Correlation for region Asia 0.6557149670319498
Correlation for region Europe 0.9850257827303103
Correlation for region Oceania 0.5959564544896369


### Save the data as csv files

In [132]:
# Write each processed table to its csv file in the processed folder, without the index column.
for temp_csv_file_name, temp_table in (
    ('temp-land-country.csv', temp_land_country),
    ('temp-land-region.csv', temp_land_region),
    ('temp-land-ocean-global.csv', temp_land_ocean_global),
):
    temp_table.to_csv(PATH_PROCESSED_TEMP_DATA_FOLDER + temp_csv_file_name, index=False)