In [1]:
import pandas as pd
import xlrd 
import os
from ddf_utils.str import to_concept_id, format_float_digits

In [2]:
from functools import partial

In [3]:
sheet_loader = partial(pd.read_excel, skiprows=2)

In [4]:
number_formatter = partial(format_float_digits, digits=8)

In [5]:
source_file = '../source/bp-stats-review-2019-all-data.xlsx'

In [6]:
# Read the sheet names through pandas instead of calling xlrd directly:
# xlrd >= 2.0 no longer opens .xlsx workbooks, and the context manager
# closes the file handle that the bare open_workbook() call left open.
with pd.ExcelFile(source_file) as workbook:
    sheets = list(workbook.sheet_names)

In [7]:
sheets

['Contents',
 'Primary Energy Consumption',
 'Primary Energy - Cons by fuel',
 'Primary Energy - Cons capita',
 'Oil - Proved reserves',
 'Oil - Proved reserves history',
 'Oil Production - Barrels',
 'Oil Production - Tonnes',
 'Oil Production - Crude Conds',
 'Oil Production - NGLs',
 'Oil Consumption - Barrels',
 'Oil Consumption - Tonnes',
 'Oil Consumption - Mtoe',
 'Oil - Regional Consumption ',
 'Oil - Spot crude prices',
 'Oil - Crude prices since 1861',
 'Oil - Refinery throughput',
 'Oil - Refining capacity',
 'Oil - Regional refining margins',
 'Oil - Trade movements',
 'Oil - Inter-area movements ',
 'Oil - Trade 2017 - 2018',
 'Gas - Proved reserves',
 'Gas - Proved reserves history ',
 'Gas Production - Bcm',
 'Gas Production - Bcf',
 'Gas Production - Mtoe',
 'Gas Consumption - Bcm',
 'Gas Consumption - Bcf',
 'Gas Consumption - Mtoe',
 'Gas - Prices ',
 'Gas - Inter-regional trade',
 'Gas - LNG imports',
 'Gas - LNG exports',
 'Gas - Trade movts LNG',
 'Gas - Trade movt

In [8]:
contents = pd.read_excel(source_file, sheet_name='Contents', header=None)

In [9]:
contents

Unnamed: 0,0
0,BP Statistical Review of World Energy June 2019
1,This workbook contains information presented ...
2,"BP Statistical Review of World Energy, which ..."
3,internet at:
4,http://www.bp.com/statisticalreview
5,Please use the contents or the tabs at the bot...
6,Primary Energy: Consumption - Mtoe (from 1965)
7,Primary Energy: Consumption by fuel type - Mto...
8,Primary Energy: Consumption per capita - Gigaj...
9,Oil: Proved reserves


In [10]:
tabs_to_parse = contents.loc[6:77, 0]

In [11]:
tabs_to_parse

6        Primary Energy: Consumption - Mtoe (from 1965)
7     Primary Energy: Consumption by fuel type - Mto...
8     Primary Energy: Consumption per capita - Gigaj...
9                                  Oil: Proved reserves
10           Oil: Proved reserves - Barrels (from 1980)
11                Oil: Production - Barrels (from 1965)
12                 Oil: Production - Tonnes (from 1965)
13    Oil: Crude oil and condensate production - Bar...
14    Oil: Natural Gas Liquids production - Barrels ...
15               Oil: Consumption - Barrels (from 1965)
16                Oil: Consumption - Tonnes (from 1965)
17                  Oil: Consumption - Mtoe (from 1965)
18    Oil: Regional consumption - by product - Barre...
19                              Oil: Spot crude prices 
20                         Oil: Crude prices since 1861
21                 Oil: Refinery throughput (from 1980)
22                   Oil: Refining capacity (from 1965)
23           Oil: Regional refining margins (fro

In [12]:
len(sheets[1:-2])

72

In [13]:
tabs = pd.DataFrame({'full_name': tabs_to_parse, 'tab_name': sheets[1:-2]})

In [14]:
tabs

Unnamed: 0,full_name,tab_name
6,Primary Energy: Consumption - Mtoe (from 1965),Primary Energy Consumption
7,Primary Energy: Consumption by fuel type - Mto...,Primary Energy - Cons by fuel
8,Primary Energy: Consumption per capita - Gigaj...,Primary Energy - Cons capita
9,Oil: Proved reserves,Oil - Proved reserves
10,Oil: Proved reserves - Barrels (from 1980),Oil - Proved reserves history
11,Oil: Production - Barrels (from 1965),Oil Production - Barrels
12,Oil: Production - Tonnes (from 1965),Oil Production - Tonnes
13,Oil: Crude oil and condensate production - Bar...,Oil Production - Crude Conds
14,Oil: Natural Gas Liquids production - Barrels ...,Oil Production - NGLs
15,Oil: Consumption - Barrels (from 1965),Oil Consumption - Barrels


In [15]:
tabs.to_csv('tabs.csv', index=False)

In [16]:
# Plan:
# production, consumption: create indicators with country/year/fuel/unit dimension (because they have different units)

In [17]:
def preprocess(data):
    """Clean a country-by-year sheet and add a ``geo`` id column.

    Steps:
    1. the first column is renamed to ``geo_name``;
    2. a ``geo`` column is derived by mapping ``to_concept_id`` over
       ``geo_name``;
    3. rows that are completely empty are dropped and everything below the
       ``total_world`` row is cut off.

    Note: only intended for tabs laid out with countries as rows and years
    as columns.
    """
    name_column = data.columns[0]
    labelled = data.rename(columns={name_column: 'geo_name'})
    labelled['geo'] = labelled['geo_name'].map(to_concept_id)
    # Slice by label while `geo` is the index, then turn it back into a column.
    kept = labelled.set_index('geo').dropna(how='all').loc[:'total_world']
    return kept.reset_index()

In [18]:
def apply_map(x, m):
    """Return ``m[x]`` when `x` has an entry in `m`; otherwise return `x` unchanged.

    Meant to be mapped over a Series to replace selected values.
    """
    return m[x] if x in m else x

The Excel workbook uses a few different sheet layouts, so we will create one handler for each layout.

```
indicators with only geo, time dimension:

Primary Energy: Consumption - Mtoe (from 1965),Primary Energy Consumption
Carbon Dioxide Emissions (from 1965),Carbon Dioxide Emissions
Oil: Refinery throughput (from 1980),Oil - Refinery throughput
Oil: Refining capacity (from 1965),Oil - Refining capacity
Electricity generation - TWh (from 1985),Electricity Generation
Primary Energy: Consumption per capita - Gigajoule per capita (from 1965),Primary Energy - Cons capita


geo/time/fuel, indicators with unit: 

Oil: Production - Barrels (from 1965),Oil Production - Barrels
Oil: Production - Tonnes (from 1965),Oil Production - Tonnes
Oil: Consumption - Barrels (from 1965),Oil Consumption - Barrels
Oil: Consumption - Tonnes (from 1965),Oil Consumption - Tonnes
Oil: Consumption - Mtoe (from 1965),Oil Consumption - Mtoe
Oil: Refinery throughput (from 1980),Oil - Refinery throughput
Oil: Refining capacity (from 1965),Oil - Refining capacity
Gas: Proved reserves - Bcm (from 1980),Gas - Proved reserves history 
Gas: Production - Bcm (from 1970),Gas Production - Bcm
Gas: Production - Bcf (from 1970),Gas Production - Bcf
Gas: Production - Mtoe (from 1970),Gas Production - Mtoe
Gas: Consumption - Bcm (from 1965),Gas Consumption - Bcm
Gas: Consumption - Bcf (from 1965),Gas Consumption - Bcf
Gas: Consumption - Mtoe (from 1965),Gas Consumption - Mtoe
Coal: Production - Tonnes (from 1981),Coal Production - Tonnes
Coal: Production - Mtoe (from 1981),Coal Production - Mtoe
Coal: Consumption - Mtoe (from 1965),Coal Consumption - Mtoe
Nuclear Energy - Generation - TWh (from 1965),Nuclear Generation - TWh
Nuclear Energy - Consumption - Mtoe (from 1965),Nuclear Consumption - Mtoe
Hydroelectricity - Generation - TWh (from 1965),Hydro Generation - TWh
Hydroelectricity - Consumption - Mtoe (from 1965),Hydro Consumption - Mtoe
Renewables - Other renewables generation -Twh (from 1965),Other renewables - TWh
Renewables - Other renewables consumption - Mtoe (from 1965),Other renewables - Mtoe
Renewables - Solar generation - TWh (from 1965),Solar Consumption - TWh
Renewables - Solar consumption - Mtoe (from 1965),Solar Consumption - Mtoe
Renewables - Wind generation - TWh (from 1965),Wind Consumption - TWh 
Renewables - Wind consumption - Mtoe (from 1965),Wind Consumption - Mtoe
"Renewables - Geothermal, Biomass and Other generation - TWh  (from 1965)",Geo Biomass Other - TWh
"Renewables - Geothermal, Biomass and Other - Mtoe  (from 1965)",Geo Biomass Other - Mtoe
Renewables - Biofuels production - Kboe/d (from 1990),Biofuels Production - Kboed
Renewables - Biofuels production - Ktoe (from 1990),Biofuels Production - Ktoe

Electricity generation from oil -TWh (from 1985),Elec Gen from Oil
Electricity generation from gas - TWh (from 1985),Elec Gen from Gas
Electricity generation from coal - TWh (from 1985),Elec Gen from Coal
Electricity generation from other - TWh (from 1985),Elec Gen from Other

Production - Reserves by key materials:

Key materials - Cobalt Production - Reserves (from 1995),Cobalt Production-Reserves
Key materials - Lithium Production - Reserves (from 1995),Lithium Production-Reserves
Key materials - Graphite Production - Reserves (from 1995),Graphite Production-Reserves
Key materials - Rare Earth Production - Reserves (from 1995),Rare Earth Production-Reserves
Key materials - Cobalt and Lithium Prices,Cobalt and Lithium - Prices

Regional consumption - by product:

Oil: Regional consumption - by product - Barrels (from 1965),Oil - Regional Consumption 


history prices:

Oil: Spot crude prices ,Oil - Spot crude prices
Oil: Crude prices since 1861,Oil - Crude prices since 1861
Gas: Prices ,Gas - Prices 
Coal: Prices,Coal - Prices


class 4:

Oil: Regional refining margins (from 1992),Oil - Regional refining margins

class 5:

Oil: Trade movements (from 1980),Oil - Trade movements
Oil: Inter-area movements ,Oil - Inter-area movements
Oil: Trade 2016-2017,Oil - Trade 2016- 2017
Gas: Trade movements pipeline,Gas - Trade - pipeline
Gas: Trade movements LNG,Gas - Trade movements LNG
Gas: Trade 2016-2017,Gas - Trade 2016-2017


```

In [19]:
def process_1(data, ddf_id, latest_year=2018):
    """Reshape a preprocessed wide sheet into long geo/year datapoints.

    Parameters
    ----------
    data : DataFrame
        Frame with a ``geo`` column, a ``geo_name`` column and one column
        per year (the layout returned by ``preprocess``).
    ddf_id : str
        Name given to the value column of the result.
    latest_year : int, default 2018
        Label of the last year column to keep. Every column positioned
        after it is discarded.

    Returns
    -------
    DataFrame
        Columns ``geo``, ``year`` and `ddf_id`, sorted by geo then year,
        with every value passed through ``number_formatter``.

    Raises
    ------
    ValueError
        If `latest_year` is not one of the column labels.
    """
    data = data.dropna(axis=1, how='all')
    data = data.drop('geo_name', axis=1)

    column_labels = list(data.columns)
    if latest_year not in column_labels:
        raise ValueError(
            'year column {!r} not found in sheet columns'.format(latest_year))
    # drop columns after latest year of each sheet.
    data = data.iloc[:, :column_labels.index(latest_year) + 1]

    data = data.set_index('geo')

    # Transposing and unstacking yields one row per (geo, year) pair.
    d = data.T.unstack()
    d = d.dropna()
    d = d.reset_index()
    d.columns = ['geo', 'year', ddf_id]
    d[ddf_id] = d[ddf_id].map(number_formatter)

    return d.sort_values(by=['geo', 'year'])

In [20]:
tabs.iloc[0]

full_name    Primary Energy: Consumption - Mtoe (from 1965)
tab_name                         Primary Energy Consumption
Name: 6, dtype: object

In [97]:
data1 = sheet_loader(source_file, sheet_name='Primary Energy Consumption')

In [98]:
data1 = preprocess(data1)

In [99]:
data1.head()

Unnamed: 0,geo,geo_name,1965,1966,1967,1968,1969,1970,1971,1972,...,2014,2015,2016,2017,2018,2018.1,2007-17,2018.2,Unnamed: 58,Unnamed: 59
0,canada,Canada,115.910849,122.991657,129.026541,137.695457,145.303023,155.587992,159.794037,170.69824,...,341.813863,339.043046,338.223484,343.696739,344.424157,0.002116,0.006524,0.024841,,
1,mexico,Mexico,24.972964,26.432788,26.922222,29.291127,32.196346,34.320386,36.094722,39.916724,...,184.141764,183.970404,186.358373,189.300608,186.91382,-0.012608,0.012453,0.013481,,
2,us,US,1249.642424,1320.03795,1365.733676,1448.932114,1526.191928,1577.850993,1610.270681,1692.88818,...,2232.931428,2213.209941,2212.662102,2222.480694,2300.640296,0.035168,-0.004035,0.165933,,
3,total_north_america,Total North America,1390.526237,1469.462395,1521.68244,1615.918698,1703.691297,1767.759372,1806.15944,1903.503145,...,2758.887056,2736.22339,2737.243959,2755.478041,2831.978273,0.027763,-0.001726,0.204255,,
4,argentina,Argentina,26.914442,27.809713,28.635373,29.623572,30.644712,28.937326,30.747979,31.299226,...,84.127455,86.0989,85.917292,86.103674,85.052439,-0.012209,0.016172,0.006134,,


In [100]:
process_1(data1, to_concept_id('Primary Energy: Consumption - Mtoe')).head()

Unnamed: 0,geo,year,primary_energy_consumption_mtoe
3451,algeria,1965,2.12252123
3452,algeria,1966,2.55687492
3453,algeria,1967,2.41659511
3454,algeria,1968,2.59205343
3455,algeria,1969,2.87406513


In [101]:
df = process_1(data1, to_concept_id('Primary Energy: Consumption - Mtoe'))

In [104]:
data1['geo_name'].unique()

array(['Canada', 'Mexico', 'US', 'Total North America', 'Argentina',
       'Brazil', 'Chile', 'Colombia', 'Ecuador', 'Peru',
       'Trinidad & Tobago', 'Venezuela', 'Central America',
       'Other Caribbean', 'Other South America',
       'Total S. & Cent. America', 'Austria', 'Belgium', 'Bulgaria',
       'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'Estonia',
       'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland',
       'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg',
       'Netherlands', 'North Macedonia', 'Norway', 'Poland', 'Portugal',
       'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden',
       'Switzerland', 'Turkey', 'Ukraine', 'United Kingdom',
       'Other Europe', 'Total Europe', 'Azerbaijan', 'Belarus',
       'Kazakhstan', 'Russian Federation', 'Turkmenistan', 'USSR',
       'Uzbekistan', 'Other CIS', 'Total CIS', 'Iran', 'Iraq', 'Israel',
       'Kuwait', 'Oman', 'Qatar', 'Saudi Arabia', 'United Arab Emirates',
       'Other Middle

In [26]:
# df.to_csv('../../ddf--datapoints--primary_energy_consumption_mtoe--by--geo--year.csv', index=False)

In [27]:
countries = []

In [28]:
# Sheets that hold a single indicator by geo and year: maps the workbook tab
# name to the concept id used for the exported datapoints.
# NOTE(review): two keys end with a space ('Gas - Proved reserves history ',
# 'Electricity Generation '). Presumably they mirror the tab names in the
# workbook exactly; confirm before trimming them.
tabs_indicator_mapping1 = {
    'Primary Energy Consumption': to_concept_id('Primary Energy Consumption'),
    'Primary Energy - Cons capita': to_concept_id('Primary Energy Consumption per capita'),
    'Carbon Dioxide Emissions': to_concept_id('Carbon Dioxide Emissions'),
    'Oil - Refinery throughput': to_concept_id('Oil - Refinery throughput'),
    'Oil - Refining capacity': to_concept_id('Oil - Refining capacity'),
    'Oil - Proved reserves history': to_concept_id('Oil - Proved reserves'),
    'Gas - Proved reserves history ': to_concept_id('Gas - Proved reserves'),
    'Electricity Generation ': to_concept_id('Electricity Generation')
}

In [29]:
# Export every geo/year sheet: load the tab, clean it, reshape it to long
# format and write one datapoints CSV per indicator.
for t, indicator in tabs_indicator_mapping1.items():
    data = sheet_loader(source_file, sheet_name=t)
    data = preprocess(data)
    # Collect the geo names seen in this sheet; the arrays are concatenated
    # further down to build the geo entities.
    countries.append(data['geo_name'].unique())
    df = process_1(data, indicator)
    df.to_csv('../../ddf--datapoints--{}--by--geo--year.csv'.format(indicator), index=False)

In [30]:
def make_dict1(sheet, unit, fuel):
    """Bundle a sheet name with its unit and fuel into a plain dict."""
    return {'sheet': sheet, 'unit': unit, 'fuel': fuel}

In [31]:
# Indicators that are split by fuel and unit. Each key is an indicator id;
# each list entry names the sheet that holds that indicator for one fuel
# expressed in one unit.
tabs_indicator_mapping2 = {
    'fuel_production': [
        make_dict1(sheet='Gas Production - Bcm', unit='bcm', fuel='gas'),
        make_dict1(sheet='Gas Production - Bcf', unit='bcf', fuel='gas'),
        make_dict1('Gas Production - Mtoe', 'mtoe', 'gas'),
        make_dict1('Coal Production - Tonnes', 'tonne', 'coal'),
        make_dict1('Coal Production - Mtoe', 'mtoe', 'coal'),
        make_dict1('Biofuels Production - Kboed', 'kboed', 'biofuel'),
        make_dict1('Biofuels Production - Ktoe', 'ktoe', 'biofuel'),
        make_dict1('Oil Production - Barrels', 'barrel', 'oil'),
        make_dict1('Oil Production - Tonnes', 'tonne', 'oil')
    ],
    'fuel_consumption': [
        make_dict1('Gas Consumption - Bcm', 'bcm', 'gas'),
        make_dict1('Gas Consumption - Bcf', 'bcf', 'gas'),
        make_dict1('Gas Consumption - Mtoe', 'mtoe', 'gas'),
        make_dict1('Coal Consumption - Mtoe', 'mtoe', 'coal'),
        make_dict1('Nuclear Consumption - Mtoe', 'mtoe', 'nuclear'),
        make_dict1('Hydro Consumption - Mtoe', 'mtoe', 'hydro'),
        # make_dict1('Other renewables - Mtoe', 'mtoe', 'other_renewables'),
        make_dict1('Solar Consumption - Mtoe', 'mtoe', 'solar'),
        make_dict1('Wind Consumption - Mtoe', 'mtoe', 'wind'),
        make_dict1('Geo Biomass Other - Mtoe', 'mtoe', 'geo_biomass_other'),
        make_dict1('Oil Consumption - Barrels', 'barrel', 'oil'),
        make_dict1('Oil Consumption - Tonnes', 'tonne', 'oil'),
        make_dict1('Oil Consumption - Mtoe', 'mtoe', 'oil')
    ],
    'electricity_generation':[
        make_dict1('Elec Gen from Oil', 'twh', 'oil'),
        make_dict1('Elec Gen from Gas', 'twh', 'gas'),
        make_dict1('Elec Gen from Coal', 'twh', 'coal'),
        make_dict1('Elec Gen from Other', 'twh', 'other'),
        make_dict1('Nuclear Generation - TWh', 'twh', 'nuclear'),
        make_dict1('Hydro Generation - TWh', 'twh', 'hydro'),
        # make_dict1('Other renewables - TWh', 'twh', 'other_renewables'),
        make_dict1('Solar Generation - TWh', 'twh', 'solar'),
        make_dict1('Wind Generation - TWh ', 'twh', 'wind'),
        make_dict1('Geo Biomass Other - TWh', 'twh', 'geo_biomass_other')
    ]
}

In [32]:
# Export the indicators that are split by fuel and unit: every sheet listed
# for an indicator is loaded, cleaned, reshaped and tagged with its fuel and
# unit, then all sheets of that indicator are written to a single CSV.
for i, ms in tabs_indicator_mapping2.items():
    frames = []  # one long-format frame per (fuel, unit) sheet
    for m in ms:
        d = sheet_loader(source_file, sheet_name=m['sheet'])
        d = preprocess(d)
        # Collect the geo names seen in this sheet for the geo entities.
        countries.append(d['geo_name'].unique())
        d = process_1(d, i)
        d['fuel'] = m['fuel']
        d['unit'] = m['unit']
        frames.append(d)
    data = pd.concat(frames, ignore_index=True)
    # Take an explicit copy of the column selection so the assignment below
    # writes to an independent frame and does not raise SettingWithCopyWarning.
    df = data[['geo', 'fuel', 'year', 'unit', i]].copy()
    df[i] = df[i].map(number_formatter)
    df = df.sort_values(by=['fuel', 'unit', 'geo', 'year'])
    df.to_csv('../../ddf--datapoints--{}--by--geo--fuel--year--unit.csv'.format(i), index=False)

In [33]:
import numpy as np

In [34]:
countries = np.concatenate(countries)

In [105]:
countries

array(['Canada', 'Mexico', 'US', ..., 'Other Asia Pacific',
       'Total Asia Pacific', 'Total World'], dtype=object)

In [35]:
c1 = pd.DataFrame({'name': countries})

In [36]:
c1['name'] = c1['name'].str.strip()

In [37]:
c1 = c1.drop_duplicates(subset='name')

In [38]:
c1['geo'] = c1['name'].map(to_concept_id)

In [39]:
c1 = c1[['geo', 'name']].sort_values(by='geo')

In [106]:
c1.head()

Unnamed: 0,geo,name
70,algeria,Algeria
516,angola,Angola
4,argentina,Argentina
80,australia,Australia
16,austria,Austria


In [107]:
c1[c1['name'] == 'North Macedonia']

Unnamed: 0,geo,name
36,north_macedonia,North Macedonia


In [41]:
!open ../../

In [42]:
# Reserves by key materials

In [43]:
def preprocess_2(data):
    """Clean a country-by-year sheet whose last row may be ``world``.

    Steps:
    1. the first column is renamed to ``geo_name``;
    2. a ``geo`` column is derived by mapping ``to_concept_id`` over
       ``geo_name``;
    3. rows that are completely empty are dropped;
    4. everything below the ``world`` row is cut off when such a row exists,
       otherwise everything below ``total_world``.

    Note: only intended for tabs laid out with countries as rows and years
    as columns.
    """
    labelled = data.rename(columns={data.columns[0]: 'geo_name'})
    labelled['geo'] = labelled['geo_name'].map(to_concept_id)
    indexed = labelled.set_index('geo').dropna(how='all')
    # Pick whichever total-row label this sheet uses as the slice end.
    last_label = 'world' if 'world' in indexed.index else 'total_world'
    return indexed.loc[:last_label].reset_index()

def process_2(data, ddf_id, latest_year=2018):
    """Reshape a preprocessed wide sheet into long geo/year datapoints.

    Unlike ``process_1`` the values are returned as-is, without number
    formatting.

    Parameters
    ----------
    data : DataFrame
        Frame with a ``geo`` column, a ``geo_name`` column and one column
        per year.
    ddf_id : str
        Name given to the value column of the result.
    latest_year : int, default 2018
        Label of the last year column to keep. Every column positioned
        after it is discarded.

    Returns
    -------
    DataFrame
        Columns ``geo``, ``year`` and `ddf_id`, sorted by geo then year,
        with missing values removed.

    Raises
    ------
    ValueError
        If `latest_year` is not one of the column labels.
    """
    data = data.dropna(axis=1, how='all')
    data = data.drop('geo_name', axis=1)

    column_labels = list(data.columns)
    if latest_year not in column_labels:
        raise ValueError(
            'year column {!r} not found in sheet columns'.format(latest_year))
    # drop columns after latest year of each sheet.
    data = data.iloc[:, :column_labels.index(latest_year) + 1]

    data = data.set_index('geo')

    # Transposing and unstacking yields one row per (geo, year) pair.
    d = data.T.unstack()
    d = d.dropna()
    d = d.reset_index()
    d.columns = ['geo', 'year', ddf_id]

    return d.sort_values(by=['geo', 'year'])

In [44]:
countries2 = []

In [45]:
def create_datapoint_2(data_tabs, fuels, indicator):
    """Build geo/material/year datapoints from several sheets.

    Each tab in `data_tabs` is loaded with ``sheet_loader``, cleaned with
    ``preprocess_2``, reshaped with ``process_2`` and tagged with the
    matching entry of `fuels` in a ``material`` column. The per-sheet frames
    are then concatenated.

    Side effect: the unique ``geo_name`` values of every sheet are appended
    to the module-level ``countries2`` list.

    Parameters
    ----------
    data_tabs : sequence of str
        Sheet names to read.
    fuels : sequence of str
        Material id for each sheet, paired with `data_tabs` by position.
    indicator : str
        Name of the value column.

    Returns
    -------
    DataFrame
        Columns ``geo``, ``material``, ``year`` and `indicator`, with the
        values passed through ``number_formatter``.
    """
    frames = []

    for t, f in zip(data_tabs, fuels):
        d = sheet_loader(source_file, sheet_name=t)
        d = preprocess_2(d)
        countries2.append(d['geo_name'].unique())
        d = process_2(d, indicator)
        d['material'] = f
        frames.append(d)
    data = pd.concat(frames, ignore_index=True)
    # Take an explicit copy of the column selection so the assignment below
    # writes to an independent frame and does not raise SettingWithCopyWarning.
    df = data[['geo', 'material', 'year', indicator]].copy()
    df[indicator] = df[indicator].map(number_formatter)

    return df

In [46]:
data_tabs = ['Cobalt Production-Reserves', 
             'Lithium Production-Reserves',
             'Graphite Production-Reserves',
             'Rare Earth Production-Reserves'
            ]
fuels  = ['cobalt', 'lithium', 'graphite', 'rare_earth']
indicator = 'production_reserve'

In [47]:
df = create_datapoint_2(data_tabs, fuels, indicator)

In [48]:
countries2 = np.concatenate(countries2)

In [49]:
c2 = pd.DataFrame({'name': countries2})

In [50]:
c2['name'] = c2['name'].str.strip()

In [51]:
c2 = c2.drop_duplicates(subset='name')

In [52]:
c2['geo'] = c2['name'].map(to_concept_id)

In [53]:
c2[~c2.geo.isin(c1.geo)]

Unnamed: 0,name,geo
2,DR Congo,dr_congo
3,Cuba,cuba
4,Madagascar,madagascar
6,New Caledonia,new_caledonia
11,Zambia,zambia
12,Rest of World,rest_of_world
24,Brazil1,brazil1
27,India2,india2
30,Mozambique,mozambique
35,Rest of World3,rest_of_world3


In [54]:
# Fold geo ids that carry a trailing digit back into the plain id, so that
# e.g. 'brazil1' rows are stored under 'brazil'.
# NOTE(review): the digits presumably come from footnote markers on the
# sheet's country labels ('Brazil1', 'India2', 'Rest of World3'); confirm in
# the workbook. 'rest_of_world2' does not appear among the ids displayed
# above, so that line looks like a no-op for this release.
df.loc[df.geo == 'brazil1', 'geo'] = 'brazil'
df.loc[df.geo == 'india2', 'geo'] = 'india'
df.loc[df.geo == 'rest_of_world2', 'geo'] = 'rest_of_world'
df.loc[df.geo == 'rest_of_world3', 'geo'] = 'rest_of_world'

In [55]:
df.loc[df.geo == 'world', 'geo'] = 'total_world'

In [56]:
# df = df[~df.geo.isin(['rest_of_world'])]

In [57]:
df.sample(10)

Unnamed: 0,geo,material,year,production_reserve
458,rest_of_world,lithium,2006,0.707
474,total_world,lithium,2005,21.0205083
304,argentina,lithium,1996,0.008
184,rest_of_world,cobalt,1996,2.204
734,russian_federation,graphite,2018,17.0
653,madagascar,graphite,2016,9.23
509,us,lithium,2016,0.9
915,malaysia,rare_earth,2002,0.24
596,china,graphite,2007,800.0
333,australia,lithium,2001,2.22578906


In [58]:
df.geo.unique() 

array(['australia', 'canada', 'cuba', 'dr_congo', 'madagascar', 'morocco',
       'new_caledonia', 'papua_new_guinea', 'philippines',
       'rest_of_world', 'russian_federation', 'south_africa',
       'total_world', 'zambia', 'argentina', 'brazil', 'chile', 'china',
       'portugal', 'us', 'zimbabwe', 'india', 'mexico', 'mozambique',
       'sri_lanka', 'ukraine', 'malaysia', 'thailand'], dtype=object)

In [59]:
(df.sort_values(by=['material', 'geo', 'year'])
 .to_csv('../../ddf--datapoints--production_reserve--by--geo--material--year.csv', index=False))

In [60]:
sheet_loader2 = partial(pd.read_excel, skiprows=3)

In [61]:
countries3 = []

In [62]:
def create_datapoint_3(data_tabs, fuels, indicator):
    """Build geo/renewable/year datapoints from several sheets.

    Each tab in `data_tabs` is loaded with ``sheet_loader2``, cleaned with
    ``preprocess_2``, reshaped with ``process_2`` and tagged with the
    matching entry of `fuels` in a ``renewable`` column. The per-sheet
    frames are then concatenated.

    Side effect: the unique ``geo_name`` values of every sheet are appended
    to the module-level ``countries3`` list.

    Parameters
    ----------
    data_tabs : sequence of str
        Sheet names to read.
    fuels : sequence of str
        Renewable id for each sheet, paired with `data_tabs` by position.
    indicator : str
        Name of the value column.

    Returns
    -------
    DataFrame
        Columns ``geo``, ``renewable``, ``year`` and `indicator`, with the
        values passed through ``number_formatter``.
    """
    frames = []

    for t, f in zip(data_tabs, fuels):
        d = sheet_loader2(source_file, sheet_name=t)
        d = preprocess_2(d)
        countries3.append(d['geo_name'].unique())
        d = process_2(d, indicator)
        d['renewable'] = f
        frames.append(d)
    data = pd.concat(frames, ignore_index=True)
    # Take an explicit copy of the column selection so the assignment below
    # writes to an independent frame and does not raise SettingWithCopyWarning.
    df = data[['geo', 'renewable', 'year', indicator]].copy()
    df[indicator] = df[indicator].map(number_formatter)

    return df

In [63]:
data_tabs = ['Geothermal Capacity', 
             'Solar Capacity',
             'Wind Capacity'
            ]
fuels  = ['geothermal', 'solar', 'wind']
indicator = 'installed_capacity'

In [64]:
df = create_datapoint_3(data_tabs, fuels, indicator)

In [65]:
df.sample(10)

Unnamed: 0,geo,renewable,year,installed_capacity
572,total_cis,geothermal,2003,70.0
421,other_asia_pacific,geothermal,1995,0.0
1428,other_middle_east,solar,1998,0.0
1303,netherlands,solar,2011,149.0
1962,turkey,solar,2003,1.0
2485,india,wind,2016,28700.4
1093,honduras,solar,2008,3.2
2533,italy,wind,1998,197.0
2006,united_arab_emirates,solar,2001,0.0
1880,total_middle_east,solar,2013,617.6


In [66]:
c3 = np.concatenate(countries3)

In [67]:
c3 = pd.DataFrame({'name': c3})

In [68]:
c3['name'] = c3['name'].str.strip()

In [69]:
c3 = c3.drop_duplicates(subset='name')
c3['geo'] = c3['name'].map(to_concept_id)

In [70]:
print(c3[~c3.geo.isin(c1.geo)][['geo', 'name']].to_csv(index=False))

geo,name
costa_rica,Costa Rica
el_salvador,El Salvador
guadeloupe,Guadeloupe
guatemala,Guatemala
honduras,Honduras
nicaragua,Nicaragua
ethiopia,Ethiopia
kenya,Kenya
jordan,Jordan
uruguay,Uruguay
russian_fed,Russian Fed



In [71]:
df.loc[df.geo == 'russian_fed', 'geo'] = 'russian_federation'

In [72]:
(df.sort_values(by=['renewable', 'geo', 'year'])
 .to_csv('../../ddf--datapoints--{}--by--geo--renewable--year.csv'.format(indicator), index=False))

In [73]:
m2 = {'brazil1': 'brazil', 'india2': 'india', 
      'rest_of_world2': 'rest_of_world', 
      'rest_of_world3': 'rest_of_world'}
c2['geo'] = c2['geo'].map(partial(apply_map, m=m2))

In [74]:
m3 = {'russian_fed': 'russian_federation'}
c3['geo'] = c3['geo'].map(partial(apply_map, m=m3))

In [108]:
geo_ent = pd.concat([c1, c2, c3], ignore_index=True, sort=True)

In [110]:
geo_ent[geo_ent['name'] == 'North Macedonia']

Unnamed: 0,geo,name
62,north_macedonia,North Macedonia


In [111]:
(geo_ent
 .drop_duplicates(subset='geo')
 .sort_values(by='name')
 .to_csv('../../ddf--entities--geo.csv', index=False))

In [77]:
units = ['Bcm', 'Bcf', 'Mtoe', 'Barrel', 'Tonne', 'TWh', 'Kboed', 'Ktoe']

In [78]:
units_df = pd.DataFrame({'unit': list(map(to_concept_id, units)), 'name': units})

In [79]:
units_df

Unnamed: 0,unit,name
0,bcm,Bcm
1,bcf,Bcf
2,mtoe,Mtoe
3,barrel,Barrel
4,tonne,Tonne
5,twh,TWh
6,kboed,Kboed
7,ktoe,Ktoe


In [80]:
units_df.to_csv('../../ddf--entities--unit.csv', index=False)

In [81]:
# 

In [112]:
!cp fuel.csv ../../ddf--entities--fuel.csv

In [83]:
materials = pd.DataFrame({'material': ['cobalt', 'lithium', 'graphite', 'rare_earth'], 
                     'name': ['Cobalt', 'Lithium', 'Graphite', 'Rare Earth']})

In [84]:
materials.to_csv('../../ddf--entities--material.csv', index=False)

In [85]:
# Concept ids of the geo/year indicators, in mapping order.
conc1 = list(tabs_indicator_mapping1.values())
# The tab names double as the concepts' display names. Some tab names end
# with a space, so strip them to keep stray whitespace out of the concepts
# file (the geo names are stripped the same way).
conc1_name = [tab_name.strip() for tab_name in tabs_indicator_mapping1]

In [86]:
concs2 = ['fuel_production', 'fuel_consumption', 'electricity_generation', 'production_reserve', 'installed_capacity']
concs2_name = ['Fuel Production', 'Fuel Consumption', 'Electricity Generation', 'Production Reserve', 'Installed Capacity']

In [87]:
concs = [*conc1, *concs2]
concs_name = [*conc1_name, *concs2_name]

In [88]:
concs

['primary_energy_consumption',
 'primary_energy_consumption_per_capita',
 'carbon_dioxide_emissions',
 'oil_refinery_throughput',
 'oil_refining_capacity',
 'oil_proved_reserves',
 'gas_proved_reserves',
 'electricity_generation',
 'fuel_production',
 'fuel_consumption',
 'electricity_generation',
 'production_reserve',
 'installed_capacity']

In [89]:
measures = pd.DataFrame({'concept': concs, 'name': concs_name})

In [90]:
measures['concept_type'] = 'measure'

In [91]:
measures = measures.drop_duplicates(subset='concept')

In [92]:
measures.to_csv('../../ddf--concepts--continuous.csv', index=False)

In [93]:
disc = pd.DataFrame([
    ['name', 'Name', 'string', ''],
    ['year', 'Year', 'time', ''],
    ['geo', 'Geo', 'entity_domain', ''],
    ['unit', 'Unit', 'entity_domain', ''],
    ['fuel', 'Fuel', 'entity_domain', ''],
    ['renewable', 'Renewables', 'entity_set', 'fuel'],
    ['material', 'Material', 'entity_domain', ''],
    ['domain', 'Domain', 'string', '']
], columns=['concept', 'name', 'concept_type', 'domain'])

In [94]:
disc

Unnamed: 0,concept,name,concept_type,domain
0,name,Name,string,
1,year,Year,time,
2,geo,Geo,entity_domain,
3,unit,Unit,entity_domain,
4,fuel,Fuel,entity_domain,
5,renewable,Renewables,entity_set,fuel
6,material,Material,entity_domain,
7,domain,Domain,string,


In [95]:
disc.to_csv('../../ddf--concepts--discrete.csv', index=False)