In [126]:
import pandas as pd
import xlrd 
import os
from ddf_utils.str import to_concept_id, format_float_digits


In [127]:
from functools import partial

In [128]:
# Shared Excel reader: pd.read_excel with the first two rows of every sheet skipped.
# NOTE(review): presumably those rows are title lines above the header row - confirm against the workbook.
sheet_loader = partial(pd.read_excel, skiprows=2)

In [129]:
# Formatter applied to every datapoint value: format_float_digits with `digits` fixed to 8.
number_formatter = partial(format_float_digits, digits=8)

In [130]:
# Source workbook, given as a path relative to the directory the notebook is run from.
source_file = '../source/bp-stats-review-2020-all-data.xlsx'

In [131]:
# Collects one array of unique 'geo_name' values per processed table; filled by the export loop.
countries4 = []

In [132]:
def preprocess4(data,i):
    """Cut one fuel's table out of a raw sheet and tag each row with a geo id.

    Steps:
    1. rename the first column to ``geo_name``
    2. derive an alphanumeric ``geo`` column from it with ``to_concept_id``
    3. keep the rows up to and including one 'total_world' row, namely the
       one at position ``biofuels_index[i]`` among all 'total_world' rows.
       If a row whose geo id equals ``i`` exists, the kept range starts right
       after the first such row; otherwise it starts at the top of the sheet.
    4. drop the rows in which every cell is missing

    Note: this only applies to tabs with countries as rows and years as
    columns. Row labels are used as row positions, so ``data`` is expected
    to carry the default 0..n-1 index.

    Parameters
    ----------
    data : pandas.DataFrame
        The raw sheet.
    i : str
        Fuel name; must be a key of ``biofuels_index``.

    Returns
    -------
    pandas.DataFrame
        The selected rows, with ``geo`` moved to the first column.
    """
    table = data.rename(columns={data.columns[0]: 'geo_name'})
    table['geo'] = table['geo_name'].map(to_concept_id)

    world_rows = table.index[table['geo'] == 'total_world']
    fuel_header_rows = table.index[table['geo'] == i]

    # The fuel's position in biofuels_index selects which 'total_world' row
    # closes its section.
    last_row = world_rows[biofuels_index[i]]
    first_row = 0 if fuel_header_rows.empty else fuel_header_rows[0] + 1

    section = table.iloc[first_row:last_row + 1].dropna(how='all')
    # set_index + reset_index moves 'geo' to the front of the columns.
    return section.set_index('geo').reset_index()

In [133]:
def make_dict4(sheet, unit, fuel):
    """Bundle a sheet name, a unit and a fuel name into one mapping entry."""
    return {'sheet': sheet, 'unit': unit, 'fuel': fuel}

In [134]:
def process_4(data, ddf_id, latest_year=2019):
    """Reshape a wide geo-by-year table into long datapoints.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding a ``geo`` column, a ``geo_name`` column and the value
        columns. The labels of the value columns end up in the ``year``
        column of the result.
    ddf_id : str
        Name given to the value column of the result.
    latest_year : default 2019
        Label of the last column to keep. Every column to the right of it
        is discarded. The label must match the column label exactly
        (including its type).

    Returns
    -------
    pandas.DataFrame
        Columns ``geo``, ``year`` and ``ddf_id``; missing values removed,
        values passed through ``number_formatter``, rows sorted by
        ``geo`` then ``year``.

    Raises
    ------
    ValueError
        If ``latest_year`` is not among the columns left after dropping the
        all-empty columns and ``geo_name``.
    """
    data = data.dropna(axis=1, how='all')
    data = data.drop('geo_name', axis=1)

    columns = list(data.columns)
    try:
        last_position = columns.index(latest_year)
    except ValueError:
        raise ValueError(
            'latest_year {!r} is not a column of the table; available columns: {}'.format(
                latest_year, columns)
        ) from None
    # Drop the columns that follow the latest year of the sheet.
    data = data.iloc[:, :last_position + 1]

    data = data.set_index('geo')

    # Wide -> long: one row per (geo, column label) pair.
    d = data.T.unstack()
    d = d.dropna()
    d = d.reset_index()
    d.columns = ['geo', 'year', ddf_id]
    d[ddf_id] = d[ddf_id].map(number_formatter)

    return d.sort_values(by=['geo', 'year'])

In [135]:
# Tables to export, grouped by indicator kind. Each sheet is listed once per
# fuel, fuel-major: biofuel/kboed, biofuel/petajoules, biogasoline/kboed, ...
tabs_indicator_mapping = {
    'production': [
        make_dict4(sheet=sheet, unit=unit, fuel=fuel)
        for fuel in ('biofuel', 'biogasoline', 'biodiesel')
        for sheet, unit in (
            ('Biofuels Production - Kboed', 'kboed'),
            ('Biofuels Production - PJ', 'petajoules'),
        )
    ],
    'consumption': [
        make_dict4(sheet=sheet, unit=unit, fuel=fuel)
        for fuel in ('biofuel', 'biogasoline', 'biodiesel')
        for sheet, unit in (
            ('Biofuels Consumption - Kboed', 'kboed'),
            ('Biofuels Consumption - PJ', 'petajoules'),
        )
    ],
}

In [136]:
# For each fuel, the position of its closing 'total_world' row among all
# 'total_world' rows of a sheet (looked up by preprocess4).
biofuels_index = {
    fuel: position
    for position, fuel in enumerate(('biofuel', 'biogasoline', 'biodiesel'))
}

In [137]:
# Write one datapoints CSV per (indicator kind, sheet/unit, fuel) entry of
# tabs_indicator_mapping. `i` is the indicator kind ('production' / 'consumption').
for i, ms in tabs_indicator_mapping.items():
    for m in ms:
        indicator_name = m['fuel'] + '_' + i + '_' + m['unit']

        d = sheet_loader(source_file, sheet_name=m['sheet'])
        d = preprocess4(d, m['fuel'])
        # Remember the geo names seen in this table for the entity list built afterwards.
        countries4.append(d['geo_name'].unique())

        d = process_4(d, indicator_name)

        # Copy the column selection so the assignment below targets an
        # independent frame and does not raise SettingWithCopyWarning.
        df = d[['geo', 'year', indicator_name]].copy()
        # NOTE(review): process_4 already maps number_formatter over this column.
        # The second pass is kept so the written files stay unchanged - confirm
        # it is redundant and remove it.
        df[indicator_name] = df[indicator_name].map(number_formatter)
        df = df.sort_values(by=['geo', 'year'])
        df.to_csv('../../ddf--datapoints--{}--by--geo--year.csv'.format(indicator_name), index=False)

In [138]:
#print(countries4)

In [139]:
import numpy as np

In [140]:
# Join the per-table arrays of geo names into a single array.
c4 = np.concatenate(countries4)

In [141]:
# Wrap the names in a one-column DataFrame ('name') for cleaning.
c4 = pd.DataFrame({'name': c4})

In [142]:
# Remove leading/trailing whitespace from the names before they are de-duplicated.
c4['name'] = c4['name'].str.strip()

In [143]:
# Keep the first occurrence of every name, then derive its concept id.
c4 = (
    c4
    .drop_duplicates(subset='name')
    .assign(geo=lambda names: names['name'].map(to_concept_id))
)

In [144]:
# Print the geo id / name pairs as CSV text (no index column) to the cell output.
print(c4[['geo', 'name']].to_csv(index=False))

geo,name
canada,Canada
mexico,Mexico
us,US
total_north_america,Total North America
argentina,Argentina
brazil,Brazil
colombia,Colombia
other_s_cent_america,Other S. & Cent. America
total_s_cent_america,Total S. & Cent. America
austria,Austria
belgium,Belgium
finland,Finland
france,France
germany,Germany
italy,Italy
netherlands,Netherlands
poland,Poland
portugal,Portugal
spain,Spain
sweden,Sweden
united_kingdom,United Kingdom
other_europe,Other Europe
total_europe,Total Europe
total_cis,Total CIS
total_middle_east,Total Middle East
total_africa,Total Africa
australia,Australia
china,China
india,India
indonesia,Indonesia
south_korea,South Korea
thailand,Thailand
other_asia_pacific,Other Asia Pacific
total_asia_pacific,Total Asia Pacific
total_world,Total World
canada_mexico,Canada & Mexico
europe,Europe
cis,CIS
middle_east,Middle East
africa,Africa
asia_pacific,Asia Pacific

