# PRIMAP data processor

Unlike the 'PRIMAP-hist' data processor, this script operates on data exported from the PRIMAP emissions module using the `write_csv_bulkplus_py` function. It is intended primarily for users of the PRIMAP emissions module, to make PRIMAP data accessible to this tool. It was used to pre-process some of the standard datasets available for the gst toolset.

Primarily, it:
* checks for data completeness and removes unwanted countries
* renames the countryISO column to 'country'
* simplifies and standardises the filename
* reduces the number of years contained (where applicable)
* checks the formatting of the data

In [11]:
# import modules

# system
import re
import sys
import os

# data handling
import pandas as pd
import numpy as np

# open climate data packages
from countrygroups import UNFCCC, EUROPEAN_UNION, ANNEX_ONE, NON_ANNEX_ONE
from shortcountrynames import to_name

# global stocktake tools
import gst_tools.gst_utils as utils

In [1]:
# user options

# Raw data file to be processed.
raw_data_file = 'WDI2017P_POP_04-Apr-2019.csv'

# Labels describing the dataset. Pick meaningful values: they are combined
# to generate the new filename.
new_source_name = 'WDI2017'
new_variable_name = 'population'

# Group of countries to extract, selected from the countrygroups package.
# The raw data includes groups as well, not only individual countries.
needed_countries = UNFCCC

# Earliest year of data needed for further plotting.
start_year = 1990
 

NameError: name 'UNFCCC' is not defined

In [2]:
# get the data
raw_data_folder = os.path.join('input-data', 'PRIMAP')
fname = os.path.join(raw_data_folder, raw_data_file)
print('reading ' + fname)
raw_data = pd.read_csv(fname)

# rename some columns
new_data = raw_data.rename(columns={'countryISO': 'country'})

# rename() silently ignores a missing 'countryISO' column, so fail early with
# a clear message if the data ends up without any country column at all
if 'country' not in new_data.columns:
    raise KeyError("neither a 'countryISO' nor a 'country' column was found in " + fname)

# reduce the countries or regions to only those desired
new_data = new_data.loc[new_data['country'].isin(needed_countries)]

# tell the user if any of the needed countries are missing and, if yes, which ones
# (sorted, because the order of a set difference is arbitrary and would
# otherwise make the report differ from run to run)
missing_countries = sorted(set(needed_countries) - set(new_data['country'].unique()))
if missing_countries:
    print('Not all countries requested were available in the raw data. You are missing the following:')
    for country in missing_countries:
        print('   ' + to_name(country))
    print('---------')

# reduce to only required years
new_data = utils.change_first_year(new_data, start_year)

# make the columns strings
new_data.columns = new_data.columns.astype(str)

# last expression of the cell: displays the processed table in the notebook
new_data

NameError: name 'os' is not defined

In [27]:
## write the data to file

# First ensure that years, unit, 'country', and variable are all in the data.
# If they are, we can proceed to write the data.

# folder that receives the processed files
out_dir = 'proc-data'

# Check the data format
if not utils.verify_data_format(new_data):

    print('WARNING: The data is not correctly formatted! Please check your input data and processing!')

else:

    # define filename as composite of source and variable name
    fname_out = new_source_name + '_' + new_variable_name + '.csv'
    fullfname_out = os.path.join(out_dir, fname_out)

    # make sure the output folder exists; exist_ok avoids the race between
    # checking for the folder and creating it
    os.makedirs(out_dir, exist_ok=True)

    # write to csv in proc data folder
    new_data.to_csv(fullfname_out, index=False)

    # celebrate success
    print('Processed data written to file! - ' + fullfname_out)
    

Processed data written to file! - proc-data/WDI2017_population.csv
