# Data processing

This notebook can be used for processing various data formats found in this folder. 

Some of these datasets have already been lightly edited by hand before being processed here 
(e.g. reformatted to consistent columns), but only as much as was strictly necessary.

In [2]:
# import modules

# system
import re
import sys
import os

# data handling
import pandas as pd
import numpy as np

from countrynames import to_code_3

# open climate data packages
from countrygroups import UNFCCC, EUROPEAN_UNION, ANNEX_ONE, NON_ANNEX_ONE
from shortcountrynames import to_name

# global stocktake tools
import gst_tools.gst_utils as utils



ModuleNotFoundError: No module named 'countrynames'

In [None]:
# 1 EIA Energy data
#
# Part of the EIA data has already been pre-processed by hand into a single,
# easy to read .csv file. This cell splits that file into separate .csv files
# (one per variable/fuel combination). It does not need to be repeated, but is
# retained here for documentation.
#
# NOTE(review): an earlier note here stated that all available countries are
# retained because the EIA data has country names and no ISO codes. The code
# further down in this cell maps the names to ISO-3 codes and filters on
# `needed_countries` - confirm that the earlier note is obsolete.

# --- settings -----------------------------------------------------------

# label used as prefix for the output file names
new_source_name = 'EIA'

raw_data_file = "EIA-International_data-energy-production-consumption-by-country.csv"

# the raw data starts in 1980, but more data is available for later years
start_year = 1990

# Group of countries to extract, taken from the countrygroups package.
# Note that the raw data may also include groups.
needed_countries = UNFCCC

# --- load the raw data --------------------------------------------------

fname = os.path.join('input-data', raw_data_file)
print('reading', fname)
raw_data = pd.read_csv(fname)

# only keep rows without any missing value
new_data = raw_data.dropna(axis=0, how='any')

# Convert the country names to ISO-3 codes, then reduce the data to only the
# requested countries. The user is told which entries are removed and which
# of the requested countries are not available in the raw data.

# keep the original names, so that entries without an ISO-3 match can still be
# reported by name once the column has been overwritten
country_names = new_data['country']
country_codes = country_names.apply(to_code_3)

# assign() returns a new frame instead of writing into the result of dropna(),
# which avoids pandas' SettingWithCopyWarning
new_data = new_data.assign(country=country_codes)

# assumes to_code_3 yields None for names it cannot resolve (the truthiness
# guard in the loop below suggests so) - TODO confirm
unmatched_names = sorted(country_names[country_codes.isna()].unique(), key=str)
if unmatched_names:
    print('No ISO-3 code found for the following entries (they will be dropped):')
    for name in unmatched_names:
        print('   ' + str(name))
    print('---------')

# tell the user which countries or regions are being removed
# (sorted with key=str so the order is reproducible even if None is present)
all_countries = new_data['country'].unique()
removed_countries = sorted(set(all_countries) - set(needed_countries), key=str)
if removed_countries:
    print('Some countries being trimmed from dataset:')
    for country in removed_countries:
        # entries without a code cannot be passed to to_name
        if country:
            print('   ' + to_name(country))
    print('---------')
new_data = new_data.loc[new_data['country'].isin(needed_countries)]

# tell the user if any of the needed countries are missing and, if yes, which ones:
missing_countries = sorted(set(needed_countries) - set(new_data['country'].unique()), key=str)
if missing_countries:
    print('Not all countries requested were available in the raw data. You are missing the following:')
    for country in missing_countries:
        print('   ' + to_name(country))
    print('---------')
    
# Write one .csv file per variable/fuel combination to the proc-data folder.

# Check for available variables and fuels
variables = new_data['variable'].unique()
fuels = new_data['fuel'].unique()

# make a new file with each one...

for var in variables:
    for fuel in fuels:

        # .copy() gives an independent frame, so that setting the 'variable'
        # column below does not trigger pandas' SettingWithCopyWarning
        data_selected = new_data.loc[(new_data['variable'] == var) &
                                     (new_data['fuel'] == fuel)].copy()

        # the two loops cover every variable x fuel pairing, but not every
        # pairing has rows in the data; there is nothing to write for those
        if data_selected.empty:
            continue

        print('getting data for ' + var + ' and ' + fuel)

        # Check the data format
        if not utils.verify_data_format(data_selected):

            print('WARNING: The data is not correctly formatted! Please check your input data and processing!')

        else:

            # define the variable name, e.g. "<variable>-<fuel>" in lower case
            # with spaces replaced by dashes
            new_variable_name = (var + '-' + fuel).replace(' ', '-').lower()
            data_selected['variable'] = new_variable_name

            # make nans where appropriate
            # ('(s)' and '--' are presumably EIA placeholders for values that
            # are not reported - TODO confirm)
            # NOTE(review): the replacement is the string 'nan', not a float
            # NaN, so the text "nan" ends up in the csv. Kept as is so that
            # the output files do not change.
            data_selected = data_selected.replace(['(s)', '--'], 'nan')

            # reduce to only required years
            # (presumably drops the year columns before start_year - see gst_utils)
            data_selected = utils.change_first_year(data_selected, start_year)

            # make column names strings
            data_selected.columns = data_selected.columns.astype(str)

            # define filename as composite of variable and source name
            fname_out = new_source_name + '_' + new_variable_name + '.csv'
            fullfname_out = os.path.join('proc-data', fname_out)

            # make sure the output folder exists (no error if it already does)
            os.makedirs('proc-data', exist_ok=True)

            # write to csv in proc data folder
            data_selected.to_csv(fullfname_out, index=False)

            # celebrate success
            print('Processed data written to file! - ' + fullfname_out)


# preview of the last variable/fuel combination handled by the loop
data_selected.head()

In [None]:
# show the entries that were trimmed from the EIA data by the cell above
removed_countries

