In [15]:
import pandas as pd
import os

## Load and format the temperature anomaly data

In [16]:
# Root folders: raw temperature input files and processed output files.
PATH_RAW_TEMP_DATA_FOLDER = "data/raw/temperature/"
PATH_PROCESSED_TEMP_DATA_FOLDER = "data/processed/temperature/"

# Sub-folders that are appended to the raw folder when the per-country and
# per-region file paths are built.
PATH_COUNTRIES_LAND_FOLDER = "countries-land/"
PATH_REGIONS_LAND_FOLDER = "regions-land/"

In [17]:
# Country list; read from the processed temperature folder.
PATH_COUNTRIES_LIST_FILE = "temp-countries-list.csv"
# UN country codes; this one is a complete relative path, not just a file name.
PATH_UN_COUNTRY_CODES_FILE = "data/raw/country-codes/un-country-codes.csv"
# Global land+ocean series; read from the raw temperature folder.
PATH_GLOBAL_LAND_OCEAN_FILE = "global-land-ocean.txt"

In [18]:
# Make sure the output folder exists before anything is read from or written to it.
# Uses the shared path constant instead of repeating the literal, and exist_ok=True
# so the cell can be re-run without a separate "does it exist" check.
os.makedirs(PATH_PROCESSED_TEMP_DATA_FOLDER, exist_ok=True)

In [19]:
# Todo: remove loading temp_countries from processed data when combining jupyter notebooks
path_temp_countries_list = PATH_PROCESSED_TEMP_DATA_FOLDER + PATH_COUNTRIES_LIST_FILE

# Fail early with an actionable message: this notebook only reads the country
# list, it does not create it.
if not os.path.isfile(path_temp_countries_list):
    raise FileNotFoundError(
        f"Missing prerequisite file '{path_temp_countries_list}'. This notebook expects it to "
        "exist already in the processed-data folder; create it first, then re-run."
    )

# Country list with (at least) the columns 'Country' and 'ISO-alpha3' used further down.
temp_countries: pd.DataFrame = pd.read_csv(path_temp_countries_list, sep=',')

FileNotFoundError: [Errno 2] No such file or directory: 'data/processed/temperature/temp-countries-list.csv'

### Load temp-land-country data

In this section we load Berkeley Earth's land-temperature data for each country from the corresponding .txt files and combine them into a single dataframe. We also rename the countries according to the UN naming standard and add their ISO alpha-3 country code.

In [None]:
# Column layout of the combined country frame: the two identifier columns that
# are inserted per country, followed by the twelve columns parsed from a raw file.
temp_land_country_column_names = [
    "country_code",
    "country_name",
    "year",
    "month",
    "monthly_anomaly",
    "monthly_unc",
    "annual_anomaly",
    "annual_unc",
    "five_year_anomaly",
    "five_year_unc",
    "ten_year_anomaly",
    "ten_year_unc",
    "twenty_year_anomaly",
    "twenty_year_unc",
]

# Empty frame with the target column layout; filled by the loading loop.
temp_land_country = pd.DataFrame(columns=temp_land_country_column_names)

# Names of all countries to load.
temp_country_names = temp_countries['Country']

# Todo when combining use the dictionary in the data-integration notebook
# Key: country name as used for the raw .txt file name.
# Value: country name as it appears in temp_countries.
# The keys end up in file paths, so odd spellings such as "Turks and Caicas Islands"
# or the truncated "... Sandwich Isla" must stay exactly as they are.
new_country_names = {
    "Denmark (Europe)": "Denmark",
    "France (Europe)": "France",
    "Netherlands (Europe)": "Netherlands",
    "United Kingdom (Europe)": "United Kingdom",
    "Åland": "Åland Islands",
    "Czech Republic": "Czechia",
    "Turkey": "Türkiye",
    "Svalbard and Jan Mayen": "Svalbard and Jan Mayen Islands",
    "Cape Verde": "Cabo Verde",
    "Turks and Caicas Islands": "Turks and Caicos Islands",
    "Swaziland": "Eswatini",
    "Macedonia": "North Macedonia",
    "Côte d'Ivoire": "Côte d’Ivoire",
    "Federated States of Micronesia": "Micronesia (Federated States of)",
    "South Georgia and the South Sandwich Isla": "South Georgia and the South Sandwich Islands",
    "Bonaire, Saint Eustatius and Saba": "Bonaire, Sint Eustatius and Saba",
    "Congo (Democratic Republic of the)": "Democratic Republic of the Congo",
    "South Korea": "Korea, South",
    "North Korea": "Korea, North",
    "Palestina": "State of Palestine",
}

def swap_keys_values(d):
    """Return a new dict in which every value of ``d`` maps back to its key.

    If several keys share the same value, the key that comes last in the
    iteration order of ``d`` wins. ``d`` itself is not modified.
    """
    inverted = {}
    for key, value in d.items():
        inverted[value] = key
    return inverted

# Inverse lookup: country name as used in temp_countries -> name of the raw .txt file.
map_country_to_filename = swap_keys_values(new_country_names)

# Collect one frame per country and concatenate once after the loop. Calling
# pd.concat inside the loop copies all previously loaded rows on every iteration
# and makes the result dtypes depend on an empty object-dtype starting frame.
temp_land_country_frames = []

for temp_country_name in temp_country_names:
    # Countries without a special file name are stored under their own name.
    temp_country_file_name = map_country_to_filename.get(temp_country_name, temp_country_name)
    path_temp_country_txt_file = PATH_RAW_TEMP_DATA_FOLDER + PATH_COUNTRIES_LAND_FOLDER + temp_country_file_name + '.txt'

    # Everything from a "%" to the end of the line is ignored as a comment.
    # sep=r"\s+" is the documented replacement for the deprecated delim_whitespace=True.
    temp_land_one_country = pd.read_csv(path_temp_country_txt_file, comment="%", header=None, sep=r"\s+")

    # ISO code of the first temp_countries row whose name matches.
    country_iso_name = temp_countries.loc[temp_countries['Country'] == temp_country_name, 'ISO-alpha3'].iloc[0]

    temp_land_one_country.insert(0, 'country_code', country_iso_name)
    temp_land_one_country.insert(1, 'country_name', temp_country_name)

    temp_land_one_country.columns = temp_land_country_column_names

    temp_land_country_frames.append(temp_land_one_country)

# pd.concat rejects an empty list, so fall back to an empty frame with the expected columns.
# ignore_index=True gives the combined frame one continuous row index instead of
# repeating every file's 0..n index.
temp_land_country = (
    pd.concat(temp_land_country_frames, ignore_index=True)
    if temp_land_country_frames
    else pd.DataFrame(columns=temp_land_country_column_names)
)

temp_land_country.head()

### Create temp-land-region data from temp-land-country data

Berkeley Earth offers a dataset with regional temperature anomaly data. However, because we also use datasets for natural disasters and population, we have to ensure that the regions in all datasets include the same countries. The regions defined in Berkeley Earth's regional temperature anomaly datasets differ from the regions defined in the UN dataset we use. We therefore calculate the regional temperature anomalies ourselves: we take Berkeley Earth's temperature anomaly data for the individual countries and aggregate it by the regions defined in the UN dataset.

Temperature anomalies tend to be similar across large geographical regions: even if the absolute temperatures at two measuring points differ for the same time period, their anomalies tend to be quite similar. To create regional temperature anomaly data, we calculate, for every month, the (unweighted) mean of the temperature anomalies of all countries in a given region. Source: https://data.giss.nasa.gov/gistemp/faq/#q101

When computing the value of a region for a given month, some countries may not be included, because some countries started reporting earlier than others or stopped reporting for some time periods.

In [None]:
def calc_region_anomaly(temp_land_country_regions):
    """Average the monthly country anomalies per region and month.

    Expects a frame with the columns ``region_code``, ``region_name``, ``year``,
    ``month`` and ``monthly_anomaly``. Returns a new frame with one row per
    (region_code, region_name, year, month) combination, in which the mean of
    ``monthly_anomaly`` is stored in the column ``temperature_anomaly``.

    NOTE: pandas' groupby leaves out rows whose group keys are missing by
    default, so rows without a region do not contribute to the result.
    """
    group_keys = ['region_code', 'region_name', 'year', 'month']
    regional_mean = (
        temp_land_country_regions
        .groupby(group_keys, as_index=False)['monthly_anomaly']
        .mean()
    )
    return regional_mean.rename(columns={'monthly_anomaly': 'temperature_anomaly'})

In [None]:
# Todo: only load UN country codes once after combining notebook
# Read the UN country-code table, keep the two region columns plus the ISO3 code
# and rename them to the snake_case names used in this notebook.
un_country_codes = pd.read_csv(PATH_UN_COUNTRY_CODES_FILE, sep=";")
un_country_codes = un_country_codes[['Region Code', 'Region Name', 'ISO-alpha3 Code']].rename(
    columns={
        'Region Code': 'region_code',
        'Region Name': 'region_name',
        'ISO-alpha3 Code': 'country_code',
    }
)

# Antarctica is left out of the regional calculation.
temp_land_country_no_antarctica = temp_land_country.loc[temp_land_country['country_name'] != 'Antarctica']

# Attach the UN region to every country row; the left join keeps rows whose
# country code has no match in the UN table.
temp_land_country_regions = temp_land_country_no_antarctica.merge(
    un_country_codes, on='country_code', how='left'
)

# Taiwan gets its region assigned by hand ('Asia' together with region code 142).
# NOTE(review): presumably because it has no entry in the UN table - confirm.
is_taiwan = temp_land_country_regions['country_name'] == 'Taiwan'
temp_land_country_regions.loc[is_taiwan, 'region_name'] = 'Asia'
temp_land_country_regions.loc[is_taiwan, 'region_code'] = 142

# Mean monthly anomaly per UN region.
temp_land_region = calc_region_anomaly(temp_land_country_regions)

temp_land_region.head()

### Load Berkeley Earth's regional data

We load Berkeley Earth's regional temperature anomaly data to compare it with the regional temperature anomaly data that we calculated from the country-level temperature anomaly data combined with the UN regions.

We combine Berkeley Earth's North America and South America regions into a single region 'Americas' so that we can later compare it with the combined 'Americas' region of the UN data.

In [None]:
# Todo: make this code segment prettier; reuse other code segments

# Column layout of one regional file plus the inserted region name. This gets its
# own name on purpose: rebinding temp_land_country_column_names to this 13-column
# list would make a re-run of the country loading cell fail (14 columns expected).
temp_berkleys_region_column_names = ["region_name", "year", "month", "monthly_anomaly", "monthly_unc", "annual_anomaly", "annual_unc", "five_year_anomaly", "five_year_unc", "ten_year_anomaly", "ten_year_unc", "twenty_year_anomaly", "twenty_year_unc"]
temp_berkley_regions = ['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']

# Collect one frame per region and concatenate once, instead of re-copying the
# accumulated frame (starting from an empty object-dtype frame) on every iteration.
temp_berkleys_region_frames = []

for temp_berkley_region in temp_berkley_regions:
    region_file_name = temp_berkley_region
    path = PATH_RAW_TEMP_DATA_FOLDER + PATH_REGIONS_LAND_FOLDER + region_file_name + '.txt'
    # Everything from a "%" to the end of the line is ignored as a comment.
    # sep=r"\s+" is the documented replacement for the deprecated delim_whitespace=True.
    one_region = pd.read_csv(path, comment="%", header=None, sep=r"\s+")
    one_region.insert(0, 'region_name', temp_berkley_region)
    one_region.columns = temp_berkleys_region_column_names
    temp_berkleys_region_frames.append(one_region)

temp_berkleys_land_region = pd.concat(temp_berkleys_region_frames, ignore_index=True)

# Relabel both American regions as 'Americas'. The groupby below then averages
# the two series per month (plain mean of the rows available for that month).
is_american_region = temp_berkleys_land_region['region_name'].isin(['North America', 'South America'])
temp_berkleys_land_region.loc[is_american_region, 'region_name'] = 'Americas'

# Keep only the monthly anomaly and name it like in our own regional data.
temp_berkleys_land_region = (
    temp_berkleys_land_region[['region_name', 'year', 'month', 'monthly_anomaly']]
    .groupby(by=['region_name', 'year', 'month'])['monthly_anomaly']
    .mean()
    .reset_index()
    .rename(columns={'monthly_anomaly': 'temperature_anomaly'})
)

temp_berkleys_land_region.head()

### Load temp-land-ocean-global data

Two versions of this dataset exist; they differ in how temperature anomalies at locations with sea ice are treated:
1. Anomalies are extrapolated from land-surface air temperature anomalies.
2. Anomalies are extrapolated from sea-surface water temperature anomalies (usually collected from open water areas on the periphery of the sea ice).

We choose to use the air temperature version based on Berkeley Earth's remark:
"We believe that the use of air temperatures above sea ice provides a more natural means of describing changes in Earth's surface temperature."

In [None]:
# only include the dataset where anomalies are extrapolated from land-surface air temperature anomalies.
# Number of lines at the end of the raw file that are not read.
# NOTE(review): presumably this is the length of the second (water-temperature)
# section of the file - re-check the count whenever the raw file is replaced.
GLOBAL_LAND_OCEAN_SKIPFOOTER = 2079

temp_land_ocean_global = pd.read_csv(
    PATH_RAW_TEMP_DATA_FOLDER + PATH_GLOBAL_LAND_OCEAN_FILE,
    comment="%",  # everything from a "%" to the end of the line is ignored
    header=None,
    sep=r"\s+",  # documented replacement for the deprecated delim_whitespace=True
    engine='python',  # skipfooter is not supported by the C engine
    skipfooter=GLOBAL_LAND_OCEAN_SKIPFOOTER,
)
temp_land_ocean_global.columns = ["year", "month", "monthly_anomaly", "monthly_unc", "annual_anomaly", "annual_unc", "five_year_anomaly", "five_year_unc", "ten_year_anomaly", "ten_year_unc", "twenty_year_anomaly", "twenty_year_unc"]
temp_land_ocean_global.head()

### Format the data

We only keep entries from the year 1900 onward because our research questions focus on the past 100 years.

In [None]:
def cut_before_1900(temp_data):
    """Return only the rows of ``temp_data`` whose ``year`` is 1900 or later.

    The input frame is not modified; the original row index labels are kept.
    """
    from_1900_onward = temp_data['year'] >= 1900
    return temp_data.loc[from_1900_onward]

We only keep the monthly anomaly data and rename the column to `temperature_anomaly`.

In [None]:
def keep_monthly_anomalies(temp_data):
    """Drop the uncertainty and multi-year columns and rename the monthly anomaly.

    Removes ``monthly_unc`` and all annual / five / ten / twenty year columns,
    then returns a new frame in which ``monthly_anomaly`` is called
    ``temperature_anomaly``. All other columns are kept unchanged.
    """
    columns_to_remove = [
        "monthly_unc",
        "annual_anomaly",
        "annual_unc",
        "five_year_anomaly",
        "five_year_unc",
        "ten_year_anomaly",
        "ten_year_unc",
        "twenty_year_anomaly",
        "twenty_year_unc",
    ]
    monthly_only = temp_data.drop(columns=columns_to_remove)
    return monthly_only.rename(columns={'monthly_anomaly': 'temperature_anomaly'})

#### Format temp-land-country

In [None]:
# Restrict the country data to the years from 1900 onward (all columns are kept).
temp_land_country = temp_land_country.pipe(cut_before_1900)

#### Format temp-land-region and Berkeley Earth's regional data

In [None]:
# Apply the same 1900 cut-off to our own regional data and to the Berkeley regional data.
temp_land_region, temp_berkleys_land_region = (
    cut_before_1900(temp_land_region),
    cut_before_1900(temp_berkleys_land_region),
)

#### Format temp-land-ocean-global

In [None]:
# First cut everything before 1900, then reduce the frame to the monthly anomaly column.
temp_land_ocean_global = keep_monthly_anomalies(cut_before_1900(temp_land_ocean_global))

#### Compare Berkeley Earth's and our regional data

In [None]:
def temp_calc_region_corr(temp_single_region):
    """Print the correlation between our and Berkeley's anomalies for one region.

    Reads the module-level frames ``temp_land_region`` (our calculation) and
    ``temp_berkleys_land_region``, selects the rows of ``temp_single_region``
    from each, lines them up on (year, month) and prints the correlation of the
    two ``temperature_anomaly`` columns (pandas' default correlation method).
    Nothing is returned.
    """
    month_key = ['year', 'month']

    own_rows = temp_land_region['region_name'] == temp_single_region
    own_region = temp_land_region[own_rows].set_index(month_key)

    berkleys_rows = temp_berkleys_land_region['region_name'] == temp_single_region
    berkleys_region = temp_berkleys_land_region[berkleys_rows].set_index(month_key)

    # Berkeley's rows are the left side of the join; columns present in both
    # frames get the suffixes '_berkleys' and '_self'.
    joined = berkleys_region.join(own_region, on=month_key, lsuffix='_berkleys', rsuffix='_self')

    region_corr = joined['temperature_anomaly_berkleys'].corr(joined['temperature_anomaly_self'])

    print('Correlation for region ' + temp_single_region + ' ' + str(region_corr))

Correlation of the temperature anomalies between our regional data set and Berkeley Earth's regional data set:

In [None]:
# Regions to compare; 'Americas' stands for the merged North/South America data.
temp_berkley_regions = [
    'Africa',
    'Asia',
    'Europe',
    'Americas',
    'Oceania',
]

# Prints one correlation line per region.
for temp_berkley_region in temp_berkley_regions:
    temp_calc_region_corr(temp_berkley_region)

### Save the data as csv files

In [None]:
# Write the three processed data sets into the processed temperature folder,
# without the row index as a column.
_processed_outputs = {
    'temp-land-country.csv': temp_land_country,
    'temp-land-region.csv': temp_land_region,
    'temp-land-ocean-global.csv': temp_land_ocean_global,
}
for _csv_name, _frame in _processed_outputs.items():
    _frame.to_csv(PATH_PROCESSED_TEMP_DATA_FOLDER + _csv_name, index=False)