# Collective progress

**Example notebook for creating anonymised, collective information on progress**

* Before running this notebook, you need to prepare the data you want to assess. To do so, please use the notebooks  "prepare-PRIMAP-hist-data-for-collective-progress-plots.ipynb" or "prepare-PRIMAP-data.ipynb" in case the data you need is not yet available.
* For testing, some example data is available in the folder "proc_data"
* Enter the name of the file that you wish to use in the second cell and some of the plotting parameters. After that you can run the full notebook with minimal changes. 

In [1]:
# import modules

# system 
import re
import os

# calculation
import pandas as pd
import numpy as np

# plotting
%matplotlib inline
import seaborn
import matplotlib

# global stocktake tools
from gst_tools.make_plots import *
import gst_tools.gst_utils as utils


In [25]:
# USER INPUT

# First, choose which file you want to plot the data for
data_file_name = 'R-Andrew-2018_cement-CO2.csv'


# other options include...
# 'PRIMAP-hist_v2.0_KyotoGHG-AR4-total-excl-LU.csv'
# 'UN-population-data-2017.csv'
# 'PRIMAP-hist_v2.0_Energy-CO2.csv'
# 'PRIMAP-hist_UN-2017_calc__CO2-per-population.csv'
# 'PRIMAP-hist_UN-2017_calc_CO2-total-excl-LU-per-population.csv'
# 'WDI2017_GDP-PPP.csv'

# Second, choose which years you are interested in analysing
years_of_interest = ['1990', '2005', '2016']

# Third, update data description display names!
# TODO - default to automatic if not specified.
variable_name_to_display = 'Cement CO2'
data_source_to_display = 'R. Andrew, 2018'

# Save plots?
# Set the following to True if plots should be saved. 
# If False, plots will be shown on screen but not saved to a file.
save_opt = True

In [50]:
# DATA READING AND PREP

# read the data from file 
fname_in = os.path.join('proc-data', data_file_name)
data = pd.read_csv(fname_in)

# Check the data format
if not utils.verify_data_format(data):
    print('WARNING: The data is not correctly formatted! Please check before continuing!')

# extract the key information
variable = data['variable'].unique()[0]
unit = data['unit'].unique()[0]

# tidy up for next stesps
data_years = utils.set_countries_as_index(data)
data_years = data_years.dropna(axis=1, how='all')
data_years = data_years.dropna(axis=0, how='any')

# remove comment below to display the data
#data_years

In [51]:
# Plot 1 - make a histogram of absolute data

# how should the xaxis be labelled?
xaxlabel = variable_name_to_display

# make a plot for each year
for selected_year in years_of_interest:
    
    title = 'distribution of ' + variable_name_to_display + ' in ' + str(selected_year)
    
    make_histogram(data_years[selected_year], unit,
                   xlabel=xaxlabel, title=title, 
                   sourcename=data_source_to_display,
                   remove_outliers=True, ktuk=3,
                   save_plot=save_opt, plot_name=(variable + '-' + 'absolute' + '-' + str(selected_year)))


---------
Making  CO2-absolute-1990 plot.
---------
-----------
Identifying and removing outliers
lower outliers are:
Series([], Name: 1990, dtype: float64)
upper outliers are: 
country
BRA    11060.0
CHN    84510.0
DEU    15300.0
ESP    12280.0
FRA    11170.0
IND    22350.0
ITA    15850.0
JPN    38700.0
KOR    17440.0
MEX    12110.0
RUS    34610.0
USA    33480.0
Name: 1990, dtype: float64
---
---------
Making  CO2-absolute-2005 plot.
---------
-----------
Identifying and removing outliers
lower outliers are:
Series([], Name: 2005, dtype: float64)
upper outliers are: 
country
BRA     14350.0
CHN    411600.0
EGY     14140.0
ESP     16790.0
IDN     18770.0
IND     60840.0
ITA     17400.0
JPN     32280.0
KOR     22410.0
MEX     18590.0
RUS     22260.0
THA     15910.0
TUR     19120.0
USA     46190.0
Name: 2005, dtype: float64
---
---------
Making  CO2-absolute-2016 plot.
---------
-----------
Identifying and removing outliers
lower outliers are:
Series([], Name: 2016, dtype: float64)
upper

In [52]:
# Plot 2 - trends

# Calculate trends from the absolute data
# trends - % change in any given year
# rolling_trends - % annual change averaged over the specifiied number of years (here 5 is recommended)
# trends_unit - unit of the trend, here % change
trends, rolling_trends, trends_unit = utils.calculate_trends(data_years, num_years_trend=5)

# define some labels for the plots
trends_variable = 'average annual change' 
thistitle = "5-year rolling average trend in \n" + variable_name_to_display + "\nin " + str(data_years.columns[-1])

# make a plot showing the trend in the final year of available data
make_histogram(rolling_trends.iloc[:,-1], trends_unit, 
               xlabel=trends_variable,
               title=thistitle,
               remove_outliers=True, ktuk=2,
               sourcename=data_source_to_display, 
               save_plot=save_opt, plot_name=(variable + '-' + 'rolling-average'))


Averaging trend over 5 years.
---------
Making  CO2-rolling-average plot.
---------
-----------
Identifying and removing outliers
lower outliers are:
country
EST   -11.502942
Name: 2017, dtype: float64
upper outliers are: 
country
AFG    24.411672
ETH    18.239834
MWI    27.287851
RWA    15.603777
TJK    54.215560
Name: 2017, dtype: float64
---
bins set to range(-15, 15)


In [53]:
# Plot 3 - change since year X

# run calculations - dataframe of differences in all years relative to the specified year
# the function returns both absolute and relative (%) values
df_abs_diff_1990, df_perc_diff_1990 = utils.calculate_diff_since_yearX(data_years, '1990')
df_abs_diff_2005, df_perc_diff_2005 = utils.calculate_diff_since_yearX(data_years, '2005')

# make plots

# a few selected years; difference from 1990
for selected_year in years_of_interest:
    make_histogram(df_perc_diff_1990[selected_year], "%", 
                   xlabel='change since 1990', 
                   title=('change in ' + variable_name_to_display + '\n from 1990 to ' + str(selected_year)), 
                   sourcename=data_source_to_display,
                   remove_outliers=True, ktuk=3, 
                   save_plot=save_opt, plot_name=(variable + '-' + 'change-since-1990' + '-in-' + str(selected_year)))

    
# decrease in the last year as compared to 2005 
make_histogram(df_perc_diff_2005.iloc[:,-1], '%', 
               xlabel='change since 2005', 
               title=('change in ' + variable_name_to_display + '\n from 2005 to ' + str(df_perc_diff_2005.columns[-1])), 
               sourcename=data_source_to_display,
               remove_outliers=False, 
               save_plot=save_opt, plot_name=(variable + '-' + 'change-since-2005'))

# same plot, but removing outliers to show difference...
make_histogram(df_perc_diff_2005.iloc[:,-1], '%', 
               xlabel='change since 2005', 
               title=('change in ' + variable_name_to_display + '\n from 2005 to ' + str(df_perc_diff_2005.columns[-1])), 
               sourcename=data_source_to_display,
               remove_outliers=True, 
               save_plot=save_opt, plot_name=(variable + '-' + 'change-since-2005-excl-outliers'))


Calculating difference compared to 1990
Calculating difference compared to 2005
---------
Making  CO2-change-since-1990-in-1990 plot.
---------
---------
All values in the series are the same! Exiting plotting routine for CO2-change-since-1990-in-1990
---------
---------
Making  CO2-change-since-1990-in-2005 plot.
---------
-----------
Identifying and removing outliers
lower outliers are:
Series([], Name: 2005, dtype: float64)
upper outliers are: 
country
BGD    1360.087083
LBN    2460.000000
UGA    1976.222038
VNM    1092.345437
Name: 2005, dtype: float64
---
bins set to range(-528, 528, 48)
---------
Making  CO2-change-since-1990-in-2016 plot.
---------
-----------
Identifying and removing outliers
lower outliers are:
Series([], Name: 2016, dtype: float64)
upper outliers are: 
country
BGD    4884.760522
ETH    1983.573487
LBN    1977.209302
MOZ    1735.240964
NPL    1599.821985
QAT    1795.927602
SDN    1970.008643
UGA    7375.559238
VNM    2772.423945
Name: 2016, dtype: float64
---


## Below here is space for code for testing and debugging!

In [32]:
# show example data
data.columns


Index(['GWP', 'category', 'categoryCode', 'categorySet', 'country',
       'countryName', 'scenario', 'source', 'unit', 'variable', '1990', '1991',
       '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000',
       '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
       '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017'],
      dtype='object')

In [49]:
data_1 = data
data_1 = data_1.dropna(axis=1, how='all')
data_1 = data_1.dropna(axis=0, how='any')
data_1

Unnamed: 0,category,categoryCode,categorySet,country,countryName,scenario,source,unit,variable,1990,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
0,CementProduction,IPC2A1,ipcc2006,AFG,Afghanistan,HISTORY,ANDREW2018V2I,kt,CO2,44.94,...,14.04,11.23,14.03,13.91,13.86,19.68,39.28,31.61,31.61,31.61
1,CementProduction,IPC2A1,ipcc2006,AGO,Angola,HISTORY,ANDREW2018V2I,kt,CO2,105.60,...,684.40,693.70,578.40,573.70,612.90,778.40,968.70,987.70,987.70,987.70
3,CementProduction,IPC2A1,ipcc2006,ARE,United arab emirates,HISTORY,ANDREW2018V2I,kt,CO2,1359.00,...,7834.00,6831.00,6453.00,6369.00,5354.00,5467.00,5588.00,5588.00,5588.00,5588.00
4,CementProduction,IPC2A1,ipcc2006,ARG,Argentina,HISTORY,ANDREW2018V2I,kt,CO2,1633.00,...,3925.00,3728.00,4117.00,4549.00,4184.00,4657.00,4497.00,4336.00,4336.00,4336.00
5,CementProduction,IPC2A1,ipcc2006,ARM,Armenia,HISTORY,ANDREW2018V2I,kt,CO2,365.80,...,412.20,240.30,288.40,264.30,277.90,273.30,268.60,265.50,265.50,265.50
6,CementProduction,IPC2A1,ipcc2006,AUS,Australia,HISTORY,ANDREW2018V2I,kt,CO2,3463.00,...,3863.00,3829.00,3549.00,3496.00,3518.00,3294.00,3138.00,3076.00,2931.00,2931.00
7,CementProduction,IPC2A1,ipcc2006,AUT,Austria,HISTORY,ANDREW2018V2I,kt,CO2,2033.00,...,2133.00,1799.00,1622.00,1666.00,1673.00,1656.00,1639.00,1701.00,1729.00,1729.00
8,CementProduction,IPC2A1,ipcc2006,AZE,Azerbaijan,HISTORY,ANDREW2018V2I,kt,CO2,478.00,...,770.80,621.00,617.40,688.10,949.10,1109.00,1386.00,1265.00,1265.00,1265.00
10,CementProduction,IPC2A1,ipcc2006,BEL,Belgium,HISTORY,ANDREW2018V2I,kt,CO2,2824.00,...,3033.00,2795.00,2582.00,2762.00,2643.00,2541.00,2643.00,2348.00,2436.00,2436.00
11,CementProduction,IPC2A1,ipcc2006,BEN,Benin,HISTORY,ANDREW2018V2I,kt,CO2,123.80,...,517.50,506.80,501.80,559.60,531.30,552.40,541.30,698.00,698.00,698.00


In [48]:
data.loc[data['country'] == 'AFG']


Unnamed: 0,GWP,category,categoryCode,categorySet,country,countryName,scenario,source,unit,variable,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
0,,CementProduction,IPC2A1,ipcc2006,AFG,Afghanistan,HISTORY,ANDREW2018V2I,kt,CO2,...,14.04,11.23,14.03,13.91,13.86,19.68,39.28,31.61,31.61,31.61
