# Collective progress

**Example notebook for creating anonymised, collective information on progress**

* Before running this notebook, you need to prepare the data you want to assess. To do so, please use the notebooks  "prepare-PRIMAP-hist-data-for-collective-progress-plots.ipynb" or "prepare-PRIMAP-data.ipynb" in case the data you need is not yet available.
* For testing, some example data is available in the folder "proc_data"
* Enter the name of the file that you wish to use in the second cell and some of the plotting parameters. After that you can run the full notebook with minimal changes. 

In [1]:
# import modules

# system 
import re
import os

# calculation
import pandas as pd
import numpy as np

# plotting
%matplotlib inline
import seaborn
import matplotlib

# global stocktake tools
from gst_tools.make_plots import *
import gst_tools.gst_utils as utils


In [2]:
# USER INPUT

# First, choose which file you want to plot the data for
data_file_name = 'PRIMAP-hist_v2.0_KyotoGHG-AR4-total-excl-LU.csv'
# other options include...
# 'UN-population-data-2017.csv'
# 'PRIMAP-hist_v2.0_Energy-CO2.csv'
# 'PRIMAP-hist_UN-2017_calc__CO2-per-population.csv'
# 'WDI2017_GDP-PPP.csv'

# Second, choose which years you are interested in analysing
years_of_interest = ['1990', '2000', '2016']

# Third, update data description display names!
# TODO - default to automatic if not specified.
variable_name_to_display = 'greenhouse gas emissions (excl. LULUCF)'
data_source_to_display = 'PRIMAP-hist v2.0'

# Save plots?
# Set the following to True if plots should be saved. 
# If False, plots will be shown on screen but not saved to a file.
save_opt = True

In [3]:
# DATA READING AND PREP

# read the data from file 
fname_in = os.path.join('proc-data', data_file_name)
data = pd.read_csv(fname_in)

# Check the data format
if not utils.verify_data_format(data):
    print('WARNING: The data is not correctly formatted! Please check before continuing!')

# extract the key information
variable = data['variable'].unique()[0]
unit = data['unit'].unique()[0]

# tidy up for next stesps
data_years = utils.set_countries_as_index(data)
data_years = data_years.dropna(axis=1, how='any')

# remove comment below to display the data
#data_years

In [4]:
# Plot 1 - make a histogram of absolute data

# how should the xaxis be labelled?
xaxlabel = variable_name_to_display

# make a plot for each year
for selected_year in years_of_interest:
    
    title = 'distribution of ' + variable_name_to_display + ' in ' + str(selected_year)
    
    make_histogram(data_years[selected_year], unit,
                   xlabel=xaxlabel, title=title, 
                   sourcename=data_source_to_display,
                   remove_outliers=True, ktuk=3,
                   save_plot=save_opt, plot_name=(variable + '-' + 'absolute' + '-' + str(selected_year)))


-----------
Identifying and removing outliers
lower outliers are:
Series([], Name: 1990, dtype: float64)
upper outliers are: 
country
AUS     423000.0
BRA     606000.0
CAN     607000.0
CHN    3600000.0
DEU    1260000.0
ESP     294000.0
FRA     554000.0
GBR     814000.0
IDN     354000.0
IND    1150000.0
IRN     382000.0
ITA     525000.0
JPN    1270000.0
KAZ     341000.0
KOR     316000.0
MEX     423000.0
NGA     302000.0
POL     477000.0
RUS    3780000.0
UKR     952000.0
USA    6510000.0
ZAF     374000.0
Name: 1990, dtype: float64
---
-----------
Identifying and removing outliers
lower outliers are:
Series([], Name: 2000, dtype: float64)
upper outliers are: 
country
AUS     488000.0
BRA     803000.0
CAN     736000.0
CHN    4890000.0
DEU    1060000.0
ESP     393000.0
FRA     558000.0
GBR     726000.0
IDN     525000.0
IND    1640000.0
IRN     525000.0
ITA     560000.0
JPN    1380000.0
KOR     517000.0
MEX     550000.0
POL     397000.0
RUS    2280000.0
SAU     345000.0
TUR     301000.0
UKR 

In [5]:
# Plot 2 - trends

# Calculate trends from the absolute data
# trends - % change in any given year
# rolling_trends - % annual change averaged over the specifiied number of years (here 5 is recommended)
# trends_unit - unit of the trend, here % change
trends, rolling_trends, trends_unit = utils.calculate_trends(data_years, num_years_trend=5)

# define some labels for the plots
trends_variable = 'average annual change' 
thistitle = "5-year rolling average trend in \n" + variable_name_to_display + "\nin " + str(data_years.columns[-1])

# make a plot showing the trend in the final year of available data
make_histogram(rolling_trends.iloc[:,-1], trends_unit, 
               xlabel=trends_variable,
               title=thistitle, 
               sourcename=data_source_to_display, 
               save_plot=save_opt, plot_name=(variable + '-' + 'rolling-average'))


Averaging trend over 5 years.
bins set to range(-15, 15)


In [6]:
# Plot 3 - change since year X

# run calculations - dataframe of differences in all years relative to the specified year
# the function returns both absolute and relative (%) values
df_abs_diff_1990, df_perc_diff_1990 = utils.calculate_diff_since_yearX(data_years, '1990')
df_abs_diff_2005, df_perc_diff_2005 = utils.calculate_diff_since_yearX(data_years, '2005')

# make plots

# a few selected years; difference from 1990
for selected_year in years_of_interest:
    make_histogram(df_perc_diff_1990[selected_year], "%", 
                   xlabel='change since 1990', 
                   title=('change in ' + variable_name_to_display + '\n from 1990 to ' + str(selected_year)), 
                   sourcename=data_source_to_display,
                   remove_outliers=True, ktuk=3, 
                   save_plot=save_opt, plot_name=(variable + '-' + 'change-since-1990' + '-in-' + str(selected_year)))

    
# decrease in the last year as compared to 2005 
make_histogram(df_perc_diff_2005.iloc[:,-1], '%', 
               xlabel='change since 2005', 
               title=('change in ' + variable_name_to_display + '\n from 2005 to ' + str(df_perc_diff_2005.columns[-1])), 
               sourcename=data_source_to_display,
               remove_outliers=False, 
               save_plot=save_opt, plot_name=(variable + '-' + 'change-since-2005'))

# same plot, but removing outliers to show difference...
make_histogram(df_perc_diff_2005.iloc[:,-1], '%', 
               xlabel='change since 2005', 
               title=('change in ' + variable_name_to_display + '\n from 2005 to ' + str(df_perc_diff_2005.columns[-1])), 
               sourcename=data_source_to_display,
               remove_outliers=True, 
               save_plot=save_opt, plot_name=(variable + '-' + 'change-since-2005-excl-outliers'))


Calculating difference compared to 1990
Calculating difference compared to 2005
---------
All values in the series are the same! Exiting plotting routine for KyotoGHG-AR4-total-excl-LU-change-since-1990-in-1990
---------
-----------
Identifying and removing outliers
lower outliers are:
Series([], Name: 2000, dtype: float64)
upper outliers are: 
country
BOL     188.439306
GNQ    2801.477833
NER     164.248705
SYC     191.517857
TLS     165.700483
ZWE     155.434783
Name: 2000, dtype: float64
---
bins set to range(-156, 156, 12)
-----------
Identifying and removing outliers
lower outliers are:
Series([], Name: 2016, dtype: float64)
upper outliers are: 
country
GNQ    6550.246305
MDV     579.611650
QAT     623.404255
Name: 2016, dtype: float64
---
bins set to range(-533, 533, 41)
bins set to range(-210, 210, 14)
-----------
Identifying and removing outliers
lower outliers are:
Series([], Name: 2016, dtype: float64)
upper outliers are: 
country
NIU    192.957746
SGP    187.234043
Name: 201

## Below here is code for testing and debugging!

In [7]:
# read the data from file 
fname_in = os.path.join('proc-data', data_file_name)
data = pd.read_csv(fname_in)

data.columns

Index(['category', 'country', 'scenario', 'source', 'unit', 'variable', '1990',
       '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999',
       '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008',
       '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016'],
      dtype='object')

In [8]:
data_years

Unnamed: 0_level_0,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AFG,12800.0,13000.0,12100.0,12200.0,12200.0,12800.0,13900.0,14900.0,15700.0,16600.0,...,18500.0,21700.0,25000.0,28900.0,32800.0,32500.0,31000.0,32400.0,32500.0,34500.0
AGO,68800.0,71000.0,73300.0,72700.0,66400.0,77800.0,82000.0,77300.0,80000.0,75900.0,...,84900.0,84500.0,86600.0,92500.0,94900.0,95300.0,92200.0,92700.0,97100.0,98200.0
ALB,6780.0,8090.0,5800.0,5890.0,6450.0,6390.0,6450.0,6520.0,7320.0,7770.0,...,8270.0,8180.0,8660.0,8930.0,9650.0,9290.0,9480.0,10300.0,10000.0,9940.0
AND,476.0,484.0,494.0,504.0,509.0,536.0,567.0,577.0,591.0,609.0,...,630.0,627.0,605.0,608.0,581.0,577.0,565.0,542.0,560.0,554.0
ARE,60200.0,66100.0,67600.0,74900.0,81600.0,83000.0,75800.0,79000.0,101000.0,101000.0,...,210000.0,238000.0,251000.0,245000.0,254000.0,269000.0,264000.0,315000.0,337000.0,347000.0
ARG,243000.0,252000.0,258000.0,261000.0,266000.0,260000.0,273000.0,274000.0,279000.0,281000.0,...,356000.0,352000.0,341000.0,316000.0,325000.0,327000.0,337000.0,329000.0,332000.0,334000.0
ARM,26000.0,24400.0,28700.0,13300.0,13200.0,14000.0,10500.0,10900.0,9460.0,7470.0,...,8900.0,9600.0,8470.0,8500.0,9340.0,10400.0,10200.0,10400.0,11300.0,11500.0
ATG,393.0,381.0,378.0,387.0,386.0,396.0,416.0,432.0,444.0,458.0,...,617.0,629.0,659.0,654.0,649.0,662.0,663.0,678.0,697.0,707.0
AUS,423000.0,424000.0,429000.0,429000.0,429000.0,439000.0,446000.0,458000.0,472000.0,477000.0,...,538000.0,541000.0,545000.0,542000.0,544000.0,547000.0,536000.0,530000.0,541000.0,552000.0
AUT,79500.0,83200.0,76500.0,76500.0,77000.0,80300.0,83700.0,83400.0,82800.0,80800.0,...,87700.0,87500.0,80600.0,85400.0,83000.0,80500.0,80700.0,76900.0,79300.0,80100.0
