# Calculate indicators

This notebook enables the user to use GDP, population, or other sets of indicators to calculate combined indicators such as per capita emissions or emissions / final energy use.

It should be used on data that is already pre-processed for this toolset to ensure efficiency and avoid errors.

TODO: unit handling!


In [1]:
# import modules

# system
import sys, os, re

# analytic
import pandas as pd
import numpy as np

# open climate data
import countrygroups

# plotting
import seaborn
import matplotlib.pyplot as plt

# global stocktake tools
import gst_tools.gst_utils as utils


In [2]:
# User options


# Input data sets. The indicator is calculated as data_set_1 divided by data_set_2.
data_set_1 = 'PRIMAP-hist_v2.0_Energy-CO2.csv'
data_set_2 = 'UN-2017-population.csv'

# Use this to generate the prefix of the output data file; include the source names of the original data.
# The name of the calculated variable is automatically appended when the data is written to file.
new_source_name = 'PRIMAP-hist_UN-2017_calc'


In [5]:
# get and clean data

fname_in1 = os.path.join('proc-data', data_set_1)
fname_in2 = os.path.join('proc-data', data_set_2)

# read in the data
var1 = pd.read_csv(fname_in1)
var2 = pd.read_csv(fname_in2)

# make sure that the same countries and years are available
var1, var2 = utils.ensure_common_years(var1, var2)
var1, var2 = utils.ensure_common_countries(var1, var2)

# check the data format
check1 = utils.verify_data_format(var1)
check2 = utils.verify_data_format(var2)

# Stop here if either check fails. Only printing a message would leave the
# metadata below undefined (or stale from an earlier run of this cell), and the
# following cells would then fail with a NameError or use the wrong names/units.
if not check1 or not check2:
    raise ValueError('One of the dataframes is not correct! Please check and try again!')

# get metadata for later use and checking
# NOTE: only the first distinct value of 'variable' and 'unit' is used for each data set.
var1_name = var1['variable'].unique()[0]
var2_name = var2['variable'].unique()[0]

var1_unit = var1['unit'].unique()[0]
var2_unit = var2['unit'].unique()[0]


Common countries are: 
['GNB', 'TKM', 'PRK', 'ERI', 'SLV', 'BGR', 'GEO', 'BDI', 'BHS', 'GRC', 'LKA', 'EST', 'SDN', 'ARM', 'GNQ', 'ZMB', 'ISL', 'KEN', 'AFG', 'NLD', 'FRA', 'LVA', 'VNM', 'COD', 'BWA', 'UZB', 'VCT', 'LCA', 'HND', 'NIU', 'LSO', 'PLW', 'BEN', 'TGO', 'MYS', 'SAU', 'TCD', 'DZA', 'NRU', 'KNA', 'SLE', 'SYC', 'MNG', 'LTU', 'GBR', 'BRN', 'RUS', 'CMR', 'FJI', 'SRB', 'KOR', 'LUX', 'SUR', 'BRB', 'CIV', 'TJK', 'USA', 'KIR', 'YEM', 'STP', 'JOR', 'VUT', 'HTI', 'GTM', 'TON', 'NOR', 'DEU', 'DOM', 'GAB', 'HUN', 'PHL', 'MWI', 'MDV', 'TUR', 'WSM', 'LBY', 'AUS', 'EGY', 'PER', 'LBN', 'KWT', 'RWA', 'ECU', 'DJI', 'MNE', 'COM', 'CAF', 'CYP', 'POL', 'GRD', 'ISR', 'MAR', 'QAT', 'BOL', 'GHA', 'MUS', 'BHR', 'CHL', 'SWE', 'TUN', 'THA', 'AUT', 'BEL', 'CUB', 'IRL', 'FSM', 'HRV', 'COK', 'COL', 'FIN', 'PNG', 'SEN', 'URY', 'MCO', 'MOZ', 'ARG', 'MDA', 'PRT', 'IND', 'TUV', 'CAN', 'CZE', 'MRT', 'DNK', 'SWZ', 'ETH', 'AND', 'LAO', 'MDG', 'BRA', 'SMR', 'OMN', 'BLR', 'UKR', 'ZWE', 'AGO', 'JPN', 'MMR', 'SSD', 'CH

In [6]:
# combine data...

# For all of these, it is always var1 divided by var2, and the division must be aligned on countries.
# Everything else should be constant across the table.

def prep_df_for_division(df):
    """
    Reduce a data frame to its year columns, indexed by country.

    The 'country' column becomes the index. Every remaining column whose
    label (converted to a string) does not consist of 4 to 7 digits is
    dropped, so only the yearly data columns are left.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a 'country' column plus year and metadata columns.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by 'country' that holds only the year columns.
        The input frame is not modified.
    """

    def is_year(label):
        # a year column is labelled with 4 to 7 digits and nothing else
        return re.match(r"[0-9]{4,7}$", str(label)) is not None

    by_country = df.set_index('country')
    non_year_cols = [label for label in by_country.columns if not is_year(label)]

    return by_country.drop(columns=non_year_cols)
    
# strip original metadata
# The stripped frames get their own names so that var1 and var2 keep their
# 'country' column. Overwriting them would make this cell fail on a re-run,
# because prep_df_for_division needs 'country' as a column to set the index.
var1_values = prep_df_for_division(var1)
var2_values = prep_df_for_division(var2)

# calculate new variables (both frames are indexed by country, so the division is aligned per country and year)
new_df = var1_values / var2_values

# generate new metadata
new_variable_name = var1_name + '-per-' + var2_name
new_df['variable'] = new_variable_name
new_df['unit'] = var1_unit + ' / ' + var2_unit

# turn the country index back into a regular column
new_df = new_df.reset_index()

# reorganise dataframe
new_df = utils.check_column_order(new_df)


In [7]:
# take a look at your new data frame (first rows only, to keep the output short)

new_df.head(10)

Unnamed: 0,country,unit,variable,1990,1991,1992,1993,1994,1995,1996,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
0,AFG,Gg / ThousandPers,CO2-per-population,0.208995,0.183166,0.095128,0.085458,0.076054,0.069008,0.062841,...,0.062950,0.084909,0.153513,0.241034,0.293023,0.410656,0.348569,0.314827,0.338848,0.349770
1,AGO,Gg / ThousandPers,CO2-per-population,0.409976,0.395111,0.389410,0.421524,0.272373,0.763894,0.701526,...,1.066014,1.166795,1.139736,1.192929,1.210999,1.222203,1.299004,1.215462,1.244406,0.714304
2,ALB,Gg / ThousandPers,CO2-per-population,0.935561,1.502092,0.956617,0.940562,1.050756,1.023582,0.999279,...,1.227765,1.299643,1.330369,1.458161,1.516736,1.677681,1.476008,1.514229,1.787197,1.549591
3,AND,Gg / ThousandPers,CO2-per-population,7.136436,6.987701,6.843500,6.790113,6.637203,6.938136,7.349285,...,6.914349,6.688195,6.594245,6.275011,6.264136,5.982018,6.065679,6.015745,5.945243,
4,ARE,Gg / ThousandPers,CO2-per-population,23.492426,24.263639,23.338968,25.052041,26.195030,24.134073,18.903003,...,26.325669,24.817726,24.948225,24.261736,21.642708,21.216550,22.246059,21.207464,26.348088,28.074232
5,ARG,Gg / ThousandPers,CO2-per-population,3.055325,3.283734,3.357584,3.312722,3.414538,3.229050,3.472646,...,3.892930,4.178110,4.234519,4.093197,4.196596,4.369026,4.323375,4.348856,4.327442,4.260929
6,ARM,Gg / ThousandPers,CO2-per-population,5.935280,5.591611,7.145326,2.878299,2.938794,3.294645,2.253635,...,1.301335,1.558102,1.788035,1.478233,1.470123,1.742257,2.019486,1.942278,1.947547,2.214642
7,ATG,Gg / ThousandPers,CO2-per-population,4.228140,3.981755,3.858126,3.874861,3.736806,3.735449,3.874226,...,4.916889,5.132358,5.190424,5.449824,5.535543,5.359438,5.414510,5.356559,5.431100,5.564284
8,AUS,Gg / ThousandPers,CO2-per-population,15.198254,15.111273,15.212215,15.265036,15.210200,15.710795,15.871552,...,17.886996,17.854872,17.758000,17.756074,17.224182,16.815097,16.825981,16.241389,15.846869,16.008702
9,AUT,Gg / ThousandPers,CO2-per-population,6.693467,7.127509,6.487094,6.474841,6.404764,6.683253,7.196431,...,7.592277,7.158512,7.075653,6.642742,6.979828,6.654953,6.386815,6.306992,5.826331,6.003233


In [8]:
## write the data to file

# First ensure that years, unit, 'country', and variable are all in the data.
# If they are, we can proceed to write the data.
required_cols = ['country', 'unit', 'variable']
has_required_cols = all(col in new_df.columns for col in required_cols)

# year columns are labelled with 4 to 7 digits (same rule as in prep_df_for_division)
has_year_cols = any(re.match(r"[0-9]{4,7}$", str(col)) is not None for col in new_df.columns)

if not (has_required_cols and has_year_cols):

    print('Missing required information! Please check your input data and processing!')

else:

    # define filename as composite of variable and source name
    fname_out = new_source_name + '_' + new_variable_name + '.csv'
    fullfname_out = os.path.join('proc-data', fname_out)

    # make sure the output folder exists (no error if it is already there)
    os.makedirs('proc-data', exist_ok=True)

    # write to csv in proc data folder
    new_df.to_csv(fullfname_out, index=False)

    # celebrate success
    print('Processed data written to file! - ' + fullfname_out)
    

Processed data written to file! - proc-data/PRIMAP-hist_UN-2017_calc_CO2-per-population.csv
