In [228]:
import pandas as pd
import xlrd 
import os
from ddf_utils.str import to_concept_id, format_float_digits

In [30]:
from functools import partial

In [31]:
# Excel reader preconfigured to skip the first 2 rows on every call.
sheet_loader = partial(pd.read_excel, skiprows=2)

In [229]:
# Value formatter: format_float_digits with digits fixed to 8.
number_formatter = partial(format_float_digits, digits=8)

In [2]:
source_file = '../source/bp-stats-review-2018-all-data.xlsx'

In [3]:
# List the sheet names through pandas instead of calling xlrd directly:
# pandas picks a reader engine that supports .xlsx (xlrd >= 2.0 does not),
# and the context manager closes the workbook once the names are read.
with pd.ExcelFile(source_file) as workbook:
    sheets = workbook.sheet_names

In [4]:
sheets

['Contents',
 'Primary Energy Consumption',
 'Primary Energy - Cons by fuel',
 'Oil - Proved reserves',
 'Oil - Proved reserves history',
 'Oil Production - Barrels',
 'Oil Production - Tonnes',
 'Oil Consumption - Barrels',
 'Oil Consumption - Tonnes',
 'OIl Consumption - Mtoe',
 'Oil - Regional Consumption ',
 'Oil - Spot crude prices',
 'Oil - Crude prices since 1861',
 'Oil - Refinery throughput',
 'Oil - Refining capacity',
 'Oil - Regional refining margins',
 'Oil - Trade movements',
 'Oil - Inter-area movements ',
 'Oil - Trade 2016- 2017',
 'Gas - Proved reserves',
 'Gas - Proved reserves history ',
 'Gas Production - Bcm',
 'Gas Production - Bcf',
 'Gas Production - Mtoe',
 'Gas Consumption - Bcm',
 'Gas Consumption - Bcf',
 'Gas Consumption - Mtoe',
 'Gas - Trade - pipeline',
 'Gas - Trade movements LNG',
 'Gas - Trade 2016-2017',
 'Gas - Prices ',
 'Coal - Reserves',
 'Coal - Prices',
 'Coal Production - Tonnes',
 'Coal Production - Mtoe',
 'Coal Consumption - Mtoe',
 'Nucle

In [6]:
contents = pd.read_excel(source_file, sheet_name='Contents', header=None)

In [7]:
contents

Unnamed: 0,0
0,BP Statistical Review of World Energy June 2018
1,This workbook contains information presented ...
2,"BP Statistical Review of World Energy, which ..."
3,internet at:
4,http://www.bp.com/statisticalreview
5,Please use the contents or the tabs at the bot...
6,Primary Energy: Consumption - Mtoe (from 1965)
7,Primary Energy: Consumption by fuel type - Mto...
8,Oil: Proved reserves
9,Oil: Proved reserves - Barrels (from 1980)


In [15]:
# Label-based slice (both ends inclusive): rows 6..69 of column 0 of the
# Contents sheet, i.e. the full titles of the data tabs.
# NOTE(review): the row bounds are hard-coded for this workbook — confirm
# them when a new edition of the file is used.
tabs_to_parse = contents.loc[6:69, 0]

In [16]:
tabs_to_parse

6        Primary Energy: Consumption - Mtoe (from 1965)
7     Primary Energy: Consumption by fuel type - Mto...
8                                  Oil: Proved reserves
9            Oil: Proved reserves - Barrels (from 1980)
10                Oil: Production - Barrels (from 1965)
11                 Oil: Production - Tonnes (from 1965)
12               Oil: Consumption - Barrels (from 1965)
13                Oil: Consumption - Tonnes (from 1965)
14                  Oil: Consumption - Mtoe (from 1965)
15    Oil: Regional consumption - by product - Barre...
16                              Oil: Spot crude prices 
17                         Oil: Crude prices since 1861
18                 Oil: Refinery throughput (from 1980)
19                   Oil: Refining capacity (from 1965)
20           Oil: Regional refining margins (from 1992)
21                     Oil: Trade movements (from 1980)
22                           Oil: Inter-area movements 
23                                 Oil: Trade 20

In [17]:
len(sheets[1:-2])

64

In [22]:
# Pair every full title with its sheet name. sheets[1:-2] leaves out the
# first sheet ('Contents') and the last two sheets.
# NOTE(review): assumes titles and sheet names are listed in the same order.
tabs = pd.DataFrame({'full_name': tabs_to_parse, 'tab_name': sheets[1:-2]})

In [23]:
tabs

Unnamed: 0,full_name,tab_name
6,Primary Energy: Consumption - Mtoe (from 1965),Primary Energy Consumption
7,Primary Energy: Consumption by fuel type - Mto...,Primary Energy - Cons by fuel
8,Oil: Proved reserves,Oil - Proved reserves
9,Oil: Proved reserves - Barrels (from 1980),Oil - Proved reserves history
10,Oil: Production - Barrels (from 1965),Oil Production - Barrels
11,Oil: Production - Tonnes (from 1965),Oil Production - Tonnes
12,Oil: Consumption - Barrels (from 1965),Oil Consumption - Barrels
13,Oil: Consumption - Tonnes (from 1965),Oil Consumption - Tonnes
14,Oil: Consumption - Mtoe (from 1965),OIl Consumption - Mtoe
15,Oil: Regional consumption - by product - Barre...,Oil - Regional Consumption


In [42]:
tabs.to_csv('tabs.csv', index=False)

In [24]:
# Plan:
# production, consumption: create indicators with country/year/fuel/unit dimension (because they have different units)

In [220]:
def preprocess(data):
    """Prepare a country-by-year sheet for reshaping.

    Steps:
    1. the first column is renamed to ``geo_name``
    2. a ``geo`` column is added by passing ``geo_name`` through
       ``to_concept_id``
    3. rows that are entirely empty are removed and every row after
       ``total_world`` is cut off

    Note: only meant for tabs that have countries as rows and years as
    columns.
    """
    name_column = data.columns[0]
    labelled = (
        data.rename(columns={name_column: 'geo_name'})
        .assign(geo=lambda frame: frame['geo_name'].map(to_concept_id))
    )
    # Label slice on the geo index: keeps everything up to and including
    # the 'total_world' row.
    up_to_world = labelled.set_index('geo').dropna(how='all').loc[:'total_world']
    return up_to_world.reset_index()

There are a few different sheet layouts in the Excel workbook; we will create a handler for each layout.

```
indicators with only geo, time dimension:

Primary Energy: Consumption - Mtoe (from 1965),Primary Energy Consumption
Carbon Dioxide Emissions (from 1965),Carbon Dioxide Emissions
Oil: Refinery throughput (from 1980),Oil - Refinery throughput
Oil: Refining capacity (from 1965),Oil - Refining capacity
Electricity generation - TWh (from 1985),Electricity Generation

geo/time/fuel, indicators with unit: 

Oil: Production - Barrels (from 1965),Oil Production - Barrels
Oil: Production - Tonnes (from 1965),Oil Production - Tonnes
Oil: Consumption - Barrels (from 1965),Oil Consumption - Barrels
Oil: Consumption - Tonnes (from 1965),Oil Consumption - Tonnes
Oil: Consumption - Mtoe (from 1965),OIl Consumption - Mtoe
Oil: Refinery throughput (from 1980),Oil - Refinery throughput
Oil: Refining capacity (from 1965),Oil - Refining capacity
Gas: Proved reserves - Bcm (from 1980),Gas - Proved reserves history 
Gas: Production - Bcm (from 1970),Gas Production - Bcm
Gas: Production - Bcf (from 1970),Gas Production - Bcf
Gas: Production - Mtoe (from 1970),Gas Production - Mtoe
Gas: Consumption - Bcm (from 1965),Gas Consumption - Bcm
Gas: Consumption - Bcf (from 1965),Gas Consumption - Bcf
Gas: Consumption - Mtoe (from 1965),Gas Consumption - Mtoe
Coal: Production - Tonnes (from 1981),Coal Production - Tonnes
Coal: Production - Mtoe (from 1981),Coal Production - Mtoe
Coal: Consumption - Mtoe (from 1965),Coal Consumption - Mtoe
Nuclear Energy - Generation - TWh (from 1965),Nuclear Generation - TWh
Nuclear Energy - Consumption - Mtoe (from 1965),Nuclear Consumption - Mtoe
Hydroelectricity - Generation - TWh (from 1965),Hydro Generation - TWh
Hydroelectricity - Consumption - Mtoe (from 1965),Hydro Consumption - Mtoe
Renewables - Other renewables generation -Twh (from 1965),Other renewables - TWh
Renewables - Other renewables consumption - Mtoe (from 1965),Other renewables - Mtoe
Renewables - Solar generation - TWh (from 1965),Solar Consumption - TWh
Renewables - Solar consumption - Mtoe (from 1965),Solar Consumption - Mtoe
Renewables - Wind generation - TWh (from 1965),Wind Consumption - TWh 
Renewables - Wind consumption - Mtoe (from 1965),Wind Consumption - Mtoe
"Renewables - Geothermal, Biomass and Other generation - TWh  (from 1965)",Geo Biomass Other - TWh
"Renewables - Geothermal, Biomass and Other - Mtoe  (from 1965)",Geo Biomass Other - Mtoe
Renewables - Biofuels production - Kboe/d (from 1990),Biofuels Production - Kboed
Renewables - Biofuels production - Ktoe (from 1990),Biofuels Production - Ktoe

Electricity generation from oil -TWh (from 1985),Elec Gen from Oil
Electricity generation from gas - TWh (from 1985),Elec Gen from Gas
Electricity generation from coal - TWh (from 1985),Elec Gen from Coal
Electricity generation from other - TWh (from 1985),Elec Gen from Other

Production - Reserves by key materials:

Key materials - Cobalt Production - Reserves (from 1995),Cobalt Production-Reserves
Key materials - Lithium Production - Reserves (from 1995),Lithium Production-Reserves
Key materials - Graphite Production - Reserves (from 1995),Graphite Production-Reserves
Key materials - Rare Earth Production - Reserves (from 1995),Rare Earth Production-Reserves
Key materials - Cobalt and Lithium Prices,Cobalt and Lithium - Prices

Regional consumption - by product:

Oil: Regional consumption - by product - Barrels (from 1965),Oil - Regional Consumption 


history prices:

Oil: Spot crude prices ,Oil - Spot crude prices
Oil: Crude prices since 1861,Oil - Crude prices since 1861
Gas: Prices ,Gas - Prices 
Coal: Prices,Coal - Prices


class 4:

Oil: Regional refining margins (from 1992),Oil - Regional refining margins

class 5:

Oil: Trade movements (from 1980),Oil - Trade movements
Oil: Inter-area movements ,Oil - Inter-area movements
Oil: Trade 2016-2017,Oil - Trade 2016- 2017
Gas: Trade movements pipeline,Gas - Trade - pipeline
Gas: Trade movements LNG,Gas - Trade movements LNG
Gas: Trade 2016-2017,Gas - Trade 2016-2017


```

In [230]:
def process_1(data, ddf_id, latest_year=2017):
    """Reshape a preprocessed wide sheet into long geo/year/value rows.

    Parameters
    ----------
    data : DataFrame
        Must contain ``geo`` and ``geo_name`` columns plus one column per
        year; any columns positioned after ``latest_year`` are discarded.
    ddf_id : str
        Name given to the value column of the result.
    latest_year : optional
        Label of the last year column to keep. Defaults to 2017, the value
        that used to be hard-coded.

    Returns
    -------
    DataFrame
        Columns ``geo``, ``year`` and ``ddf_id``; missing values are dropped,
        values are passed through ``number_formatter`` and rows are sorted
        by ``geo`` then ``year``.

    Raises
    ------
    ValueError
        If ``latest_year`` is not one of the column labels.
    """
    data = data.dropna(axis=1, how='all')
    data = data.drop('geo_name', axis=1)

    columns = list(data.columns)
    if latest_year not in columns:
        raise ValueError(
            'column {!r} not found; cannot locate the last year column'.format(latest_year))
    # Keep everything up to and including the latest year; the columns after
    # it are not yearly values.
    data = data.iloc[:, :columns.index(latest_year) + 1]

    data = data.set_index('geo')

    # Transpose + unstack yields a Series indexed by (geo, year).
    d = data.T.unstack()
    d = d.dropna()
    d = d.reset_index()
    d.columns = ['geo', 'year', ddf_id]
    d[ddf_id] = d[ddf_id].map(number_formatter)

    return d.sort_values(by=['geo', 'year'])

In [29]:
tabs.iloc[0]

full_name    Primary Energy: Consumption - Mtoe (from 1965)
tab_name                         Primary Energy Consumption
Name: 6, dtype: object

In [115]:
data1 = sheet_loader(source_file, sheet_name='Primary Energy Consumption')

In [116]:
data1 = preprocess(data1)

In [117]:
data1.head()

Unnamed: 0,geo,geo_name,1965,1966,1967,1968,1969,1970,1971,1972,...,2011,2012,2013,2014,2015,2016,2017,2017.1,2006-16,2017.2
0,us,US,1249.642424,1320.03795,1365.733676,1448.932114,1526.191928,1577.850993,1610.270681,1692.88818,...,2216.708138,2161.0065,2221.071059,2246.190385,2226.972734,2228.02397,2234.851921,0.005813,-0.002554251,0.165408
1,canada,Canada,115.910849,122.991657,129.026541,137.695457,145.303023,155.587992,159.794037,170.69824,...,323.181437,319.914365,331.533513,335.370963,331.134014,338.958111,348.690219,0.03153,0.009036799,0.025808
2,mexico,Mexico,24.972964,26.432788,26.922222,29.291127,32.196346,34.320386,36.094722,39.916724,...,183.054676,184.181945,185.672286,185.248667,181.560619,194.884327,189.252719,-0.026237,0.01597613,0.014007
3,total_north_america,Total North America,1390.526237,1469.462395,1521.68244,1615.918698,1703.691297,1767.759372,1806.15944,1903.503145,...,2722.944251,2665.10281,2738.276859,2766.810015,2739.667367,2761.866408,2772.794859,0.006707,3.720036e-07,0.205222
4,argentina,Argentina,26.914442,27.809713,28.635373,29.623572,30.644712,28.937326,30.747979,31.299226,...,78.937055,81.210041,84.479924,84.559039,86.423654,86.556688,85.904038,-0.004821,0.02124238,0.006358


In [119]:
process_1(data1, to_concept_id('Primary Energy: Consumption - Mtoe')).head()

Unnamed: 0,geo,year,primary_energy_consumption_mtoe
3382,algeria,1965,2.122521
3383,algeria,1966,2.556875
3384,algeria,1967,2.416595
3385,algeria,1968,2.592053
3386,algeria,1969,2.874065


In [120]:
df = process_1(data1, to_concept_id('Primary Energy: Consumption - Mtoe'))

In [121]:
df.to_csv('../../ddf--datapoints--primary_energy_consumption_mtoe--by--geo--year.csv', index=False)

In [308]:
countries = []

In [245]:
# Sheet name -> indicator concept id for the tabs exported as plain
# geo/year datapoints. Keys are used verbatim as sheet_name when loading,
# so trailing spaces in a key are part of the lookup.
tabs_indicator_mapping1 = {
    'Primary Energy Consumption': to_concept_id('Primary Energy Consumption'),
    'Carbon Dioxide Emissions': to_concept_id('Carbon Dioxide Emissions'),
    'Oil - Refinery throughput': to_concept_id('Oil - Refinery throughput'),
    'Oil - Refining capacity': to_concept_id('Oil - Refining capacity'),
    'Oil - Proved reserves history': to_concept_id('Oil - Proved reserves'),
    'Gas - Proved reserves history ': to_concept_id('Gas - Proved reserves'),
    'Electricity Generation ': to_concept_id('Electricity Generation')
}

In [246]:
# For every geo/year tab: load and clean the sheet, record its geo names,
# reshape it with process_1 and write one datapoints CSV per indicator.
for t, indicator in tabs_indicator_mapping1.items():
    data = sheet_loader(source_file, sheet_name=t)
    data = preprocess(data)
    # Collected so the country name table (c1) can be built later.
    countries.append(data['geo_name'].unique())
    df = process_1(data, indicator)
    df.to_csv('../../ddf--datapoints--{}--by--geo--year.csv'.format(indicator), index=False)

In [233]:
def make_dict1(sheet, unit, fuel):
    """Bundle a sheet name with its unit and fuel into one mapping entry."""
    return {'sheet': sheet, 'unit': unit, 'fuel': fuel}

In [431]:
# Indicator id -> list of source sheets, each tagged with the unit and fuel
# it contributes. Sheet names are copied verbatim from the workbook, so the
# odd capitalisation ('OIl ...') and the trailing space in
# 'Wind Consumption - TWh ' are intentional.
tabs_indicator_mapping2 = {
    'fuel_production': [
        make_dict1(sheet='Gas Production - Bcm', unit='bcm', fuel='gas'),
        make_dict1(sheet='Gas Production - Bcf', unit='bcf', fuel='gas'),
        make_dict1('Gas Production - Mtoe', 'mtoe', 'gas'),
        make_dict1('Coal Production - Tonnes', 'tonne', 'coal'),
        make_dict1('Coal Production - Mtoe', 'mtoe', 'coal'),
        make_dict1('Biofuels Production - Kboed', 'kboed', 'biofuel'),
        make_dict1('Biofuels Production - Ktoe', 'ktoe', 'biofuel'),
        make_dict1('Oil Production - Barrels', 'barrel', 'oil'),
        make_dict1('Oil Production - Tonnes', 'tonne', 'oil')
    ],
    'fuel_consumption': [
        make_dict1('Gas Consumption - Bcm', 'bcm', 'gas'),
        make_dict1('Gas Consumption - Bcf', 'bcf', 'gas'),
        make_dict1('Gas Consumption - Mtoe', 'mtoe', 'gas'),
        make_dict1('Coal Consumption - Mtoe', 'mtoe', 'coal'),
        make_dict1('Nuclear Consumption - Mtoe', 'mtoe', 'nuclear'),
        make_dict1('Hydro Consumption - Mtoe', 'mtoe', 'hydro'),
        make_dict1('Other renewables - Mtoe', 'mtoe', 'other_renewables'),
        make_dict1('Solar Consumption - Mtoe', 'mtoe', 'solar'),
        make_dict1('Wind Consumption - Mtoe', 'mtoe', 'wind'),
        make_dict1('Geo Biomass Other - Mtoe', 'mtoe', 'geo_biomass'),
        make_dict1('Oil Consumption - Barrels', 'barrel', 'oil'),
        make_dict1('Oil Consumption - Tonnes', 'tonne', 'oil'),
        make_dict1('OIl Consumption - Mtoe', 'mtoe', 'oil')
    ],
    'electricity_generation':[
        make_dict1('Elec Gen from Oil', 'twh', 'oil'),
        make_dict1('Elec Gen from Gas', 'twh', 'gas'),
        make_dict1('Elec Gen from Coal', 'twh', 'coal'),
        make_dict1('Elec Gen from Other', 'twh', 'other'),
        make_dict1('Nuclear Generation - TWh', 'twh', 'nuclear'),
        make_dict1('Hydro Generation - TWh', 'twh', 'hydro'),
        make_dict1('Other renewables - TWh', 'twh', 'other_renewables'),
        # The two tabs below are named "... Consumption - TWh" in the workbook
        # but are listed as solar/wind *generation* in the contents sheet.
        make_dict1('Solar Consumption - TWh', 'twh', 'solar'),
        make_dict1('Wind Consumption - TWh ', 'twh', 'wind'),
        make_dict1('Geo Biomass Other - TWh', 'twh', 'geo_biomass')
    ]
}

In [433]:
# Build one datapoints file per indicator, with fuel and unit as additional
# dimensions next to geo and year.
for i, ms in tabs_indicator_mapping2.items():
    data = []
    for m in ms:
        d = sheet_loader(source_file, sheet_name=m['sheet'])
        d = preprocess(d)
        d = process_1(d, i)
        d['fuel'] = m['fuel']
        d['unit'] = m['unit']
        data.append(d)
    data = pd.concat(data, ignore_index=True)
    # Take an explicit copy: assigning a column on a column subset of `data`
    # raises pandas' SettingWithCopyWarning.
    df = data[['geo', 'fuel', 'year', 'unit', i]].copy()
    # NOTE(review): process_1 already maps the values through
    # number_formatter; this second pass is kept as in the original —
    # confirm whether it is still needed.
    df[i] = df[i].map(number_formatter)
    df = df.sort_values(by=['fuel', 'unit', 'geo', 'year'])
    df.to_csv('../../ddf--datapoints--{}--by--geo--fuel--year--unit.csv'.format(i), index=False)

In [249]:
import numpy as np

In [310]:
# Build the geo/name table from the names collected while exporting the
# geo/year tabs. `countries` itself is left untouched (still a list of
# arrays), so this cell and the collecting loop can both be re-run safely.
c1 = (
    pd.DataFrame({'name': np.concatenate(countries)})
    .assign(name=lambda frame: frame['name'].str.strip())
    .drop_duplicates(subset='name')
    .assign(geo=lambda frame: frame['name'].map(to_concept_id))
    [['geo', 'name']]
    .sort_values(by='geo')
)

In [316]:
c1

Unnamed: 0,geo,name
44,algeria,Algeria
398,angola,Angola
4,argentina,Argentina
50,australia,Australia
297,austria,Austria
23,azerbaijan,Azerbaijan
32,bahrain,Bahrain
51,bangladesh,Bangladesh
537,belarus,Belarus
298,belgium,Belgium


In [101]:
!open ../../

In [186]:
# Reserves by key materials

In [199]:
def preprocess_2(data):
    """Prepare a country-by-year sheet whose last row may be named 'World'.

    Steps:
    1. the first column is renamed to ``geo_name``
    2. a ``geo`` column is added by passing ``geo_name`` through
       ``to_concept_id``
    3. rows that are entirely empty are removed
    4. rows after the ``world`` row are cut off; when there is no ``world``
       row the cut-off is ``total_world`` instead

    Note: only meant for tabs that have countries as rows and years as
    columns.
    """
    name_column = data.columns[0]
    labelled = data.rename(columns={name_column: 'geo_name'})
    labelled['geo'] = labelled['geo_name'].map(to_concept_id)

    indexed = labelled.set_index('geo').dropna(how='all')
    # Pick whichever label marks the final row of this sheet.
    last_row = 'world' if 'world' in indexed.index else 'total_world'
    return indexed.loc[:last_row].reset_index()

def process_2(data, ddf_id, latest_year=2017):
    """Reshape a preprocessed wide sheet into long geo/year/value rows.

    Unlike ``process_1`` the values are returned as they are (no number
    formatting).

    Parameters
    ----------
    data : DataFrame
        Must contain ``geo`` and ``geo_name`` columns plus one column per
        year; any columns positioned after ``latest_year`` are discarded.
    ddf_id : str
        Name given to the value column of the result.
    latest_year : optional
        Label of the last year column to keep. Defaults to 2017, the value
        that used to be hard-coded.

    Returns
    -------
    DataFrame
        Columns ``geo``, ``year`` and ``ddf_id``; missing values are dropped
        and rows are sorted by ``geo`` then ``year``.

    Raises
    ------
    ValueError
        If ``latest_year`` is not one of the column labels.
    """
    data = data.dropna(axis=1, how='all')
    data = data.drop('geo_name', axis=1)

    columns = list(data.columns)
    if latest_year not in columns:
        raise ValueError(
            'column {!r} not found; cannot locate the last year column'.format(latest_year))
    # Keep everything up to and including the latest year; the columns after
    # it are not yearly values.
    data = data.iloc[:, :columns.index(latest_year) + 1]

    data = data.set_index('geo')

    # Transpose + unstack yields a Series indexed by (geo, year).
    d = data.T.unstack()
    d = d.dropna()
    d = d.reset_index()
    d.columns = ['geo', 'year', ddf_id]

    return d.sort_values(by=['geo', 'year'])

In [295]:
countries2 = []

In [396]:
def create_datapoint_2(data_tabs, fuels, indicator):
    """Combine several key-material sheets into one long datapoints table.

    Parameters
    ----------
    data_tabs : sequence of str
        Sheet names to load with ``sheet_loader``.
    fuels : sequence of str
        Material id for each sheet, paired positionally with ``data_tabs``.
    indicator : str
        Name of the value column.

    Returns
    -------
    DataFrame
        Columns ``geo``, ``material``, ``year`` and ``indicator``, with the
        values passed through ``number_formatter``. The frame is an
        independent copy, so callers may modify it in place.

    Side effect: the geo names of every sheet are appended to the
    module-level ``countries2`` list.
    """
    data = []

    for t, f in zip(data_tabs, fuels):
        d = sheet_loader(source_file, sheet_name=t)
        d = preprocess_2(d)
        countries2.append(d['geo_name'].unique())
        d = process_2(d, indicator)
        d['material'] = f
        data.append(d)
    data = pd.concat(data, ignore_index=True)
    # Explicit copy: assigning a column on a column subset of `data` raises
    # pandas' SettingWithCopyWarning, here and in later in-place fixes made
    # by the caller.
    df = data[['geo', 'material', 'year', indicator]].copy()
    df[indicator] = df[indicator].map(number_formatter)

    return df

In [424]:
data_tabs = ['Cobalt Production-Reserves', 
             'Lithium Production-Reserves',
             'Graphite Production-Reserves',
             'Rare Earth Production-Reserves'
            ]
fuels  = ['cobalt', 'lithium', 'graphite', 'rare_earth']
indicator = 'production_reserve'

In [425]:
df = create_datapoint_2(data_tabs, fuels, indicator)

In [426]:
df.loc[df.geo == 'brazil1', 'geo'] = 'brazil'
df.loc[df.geo == 'india2', 'geo'] = 'india'

In [427]:
df.loc[df.geo == 'world', 'geo'] = 'total_world'

In [428]:
df = df[~df.geo.isin(['rest_of_world'])]

In [429]:
df.sample(10)

Unnamed: 0,geo,material,year,production_reserve
424,china,lithium,2005,2.82
136,morocco,cobalt,2016,2.72986882
888,brazil,rare_earth,2009,0.17
864,australia,rare_earth,2008,0.0
243,russian_federation,cobalt,2008,6.2
1008,russian_federation,rare_earth,2014,2.1338
753,russian_federation,graphite,2012,14.0
171,papua_new_guinea,cobalt,2005,0.0
551,zimbabwe,lithium,2017,1.0
954,malaysia,rare_earth,2006,0.43


In [303]:
df.geo.unique() 

array(['australia', 'canada', 'cuba', 'democratic_republic_of_congo',
       'madagascar', 'morocco', 'new_caledonia', 'papua_new_guinea',
       'philippines', 'russian_federation', 'south_africa', 'total_world',
       'zambia', 'argentina', 'brazil', 'chile', 'china', 'portugal',
       'us', 'zimbabwe', 'india', 'mexico', 'mozambique', 'sri_lanka',
       'ukraine', 'malaysia', 'thailand'], dtype=object)

In [430]:
(df.sort_values(by=['material', 'geo', 'year'])
 .to_csv('../../ddf--datapoints--production_reserve--by--geo--material--year.csv', index=False))

In [375]:
sheet_loader2 = partial(pd.read_excel, skiprows=3)

In [406]:
countries3 = []

In [407]:
def create_datapoint_3(data_tabs, fuels, indicator):
    """Combine several installed-capacity sheets into one long datapoints table.

    Parameters
    ----------
    data_tabs : sequence of str
        Sheet names to load with ``sheet_loader2``.
    fuels : sequence of str
        Renewable id for each sheet, paired positionally with ``data_tabs``.
    indicator : str
        Name of the value column.

    Returns
    -------
    DataFrame
        Columns ``geo``, ``renewable``, ``year`` and ``indicator``, with the
        values passed through ``number_formatter``. The frame is an
        independent copy, so callers may modify it in place.

    Side effect: the geo names of every sheet are appended to the
    module-level ``countries3`` list.
    """
    data = []

    for t, f in zip(data_tabs, fuels):
        d = sheet_loader2(source_file, sheet_name=t)
        d = preprocess_2(d)
        countries3.append(d['geo_name'].unique())
        d = process_2(d, indicator)
        d['renewable'] = f
        data.append(d)
    data = pd.concat(data, ignore_index=True)
    # Explicit copy: assigning a column on a column subset of `data` raises
    # pandas' SettingWithCopyWarning.
    df = data[['geo', 'renewable', 'year', indicator]].copy()
    df[indicator] = df[indicator].map(number_formatter)

    return df

In [408]:
data_tabs = ['Geothermal Capacity', 
             'Solar Capacity',
             'Wind Capacity'
            ]
fuels  = ['geothermal', 'solar', 'wind']
indicator = 'installed_capacity'

In [409]:
df = create_datapoint_3(data_tabs, fuels, indicator)

In [379]:
df.sample(10)

Unnamed: 0,geo,renewable,year,installed_capacity
2742,total_africa,wind,2004,232.46
1759,ukraine,solar,2010,2.5
2576,portugal,wind,2006,1716.0
388,new_zealand,geothermal,2010,723.425
42,chile,geothermal,2000,0.0
303,italy,geothermal,2005,790.5
412,nicaragua,geothermal,2014,159.5
2624,south_africa,wind,2012,10.16
1341,portugal,solar,2010,134.0
1913,brazil,wind,2015,7633.0


In [413]:
c3 = np.concatenate(countries3)

In [414]:
c3 = pd.DataFrame({'name': c3})

In [415]:
c3['name'] = c3['name'].str.strip()

In [417]:
c3 = c3.drop_duplicates(subset='name')
c3['geo'] = c3['name'].map(to_concept_id)

In [422]:
print(c3[~c3.geo.isin(c1.geo)][['geo', 'name']].to_csv(index=False))

geo,name
costa_rica,Costa Rica
el_salvador,El Salvador
ethiopia,Ethiopia
guadeloupe,Guadeloupe
guatemala,Guatemala
honduras,Honduras
kenya,Kenya
nicaragua,Nicaragua
papua_new_guinea,Papua New Guinea
russia,Russia
jordan,Jordan
uruguay,Uruguay



In [381]:
(df.sort_values(by=['renewable', 'geo', 'year'])
 .to_csv('../../ddf--datapoints--{}--by--geo--renewable--year.csv'.format(indicator), index=False))

In [280]:
# Build the name/geo table for the key-material sheets. `countries2` is left
# untouched (still a list of arrays), so create_datapoint_2 can keep
# appending to it and this cell can be re-run safely.
c2 = (
    pd.DataFrame({'name': np.concatenate(countries2)})
    .assign(name=lambda frame: frame['name'].str.strip())
    .drop_duplicates(subset='name')
    .assign(geo=lambda frame: frame['name'].map(to_concept_id))
)

In [286]:
c2[~c2.geo.isin(c1.geo)]

Unnamed: 0,name,geo
2,Democratic Republic of Congo,democratic_republic_of_congo
3,Cuba,cuba
4,Madagascar,madagascar
6,New Caledonia,new_caledonia
11,Zambia,zambia
12,Rest of World,rest_of_world
13,World,world
24,Brazil1,brazil1
27,India2,india2
30,Mozambique,mozambique


In [320]:
units = ['Bcm', 'Bcf', 'Mtoe', 'Barrel', 'Tonne', 'TWh', 'Kboed', 'Ktoe']

In [321]:
units_df = pd.DataFrame({'unit': list(map(to_concept_id, units)), 'name': units})

In [323]:
units_df

Unnamed: 0,unit,name
0,bcm,Bcm
1,bcf,Bcf
2,mtoe,Mtoe
3,barrel,Barrel
4,tonne,Tonne
5,twh,TWh
6,kboed,Kboed
7,ktoe,Ktoe


In [324]:
units_df.to_csv('../../ddf--entities--unit.csv', index=False)

In [423]:
!cp countries.csv ../../ddf--entities--geo.csv

In [405]:
!cp fuel.csv ../../ddf--entities--fuel.csv

In [384]:
materials = pd.DataFrame({'material': ['cobalt', 'lithium', 'graphite', 'rare_earth'], 
                     'name': ['Cobalt', 'Lithium', 'Graphite', 'Rare Earth']})

In [385]:
materials.to_csv('../../ddf--entities--material.csv', index=False)

In [334]:
# Concept ids and display names for the geo/year indicators.
# The display names come from the sheet names, some of which carry trailing
# spaces in the workbook; strip them so they do not end up in the concepts file.
conc1 = list(tabs_indicator_mapping1.values())
conc1_name = [tab_name.strip() for tab_name in tabs_indicator_mapping1]

In [386]:
concs2 = ['fuel_production', 'fuel_consumption', 'electricity_generation', 'production_reserve', 'installed_capacity']
concs2_name = ['Fuel Production', 'Fuel Consumption', 'Electricity Generation', 'Production Reserve', 'Installed Capacity']

In [336]:
# Show the indicator ids collected from tabs_indicator_mapping1
# (the list is named `conc1`; `concs1` is never defined in this notebook).
conc1

['primary_energy_consumption',
 'carbon_dioxide_emissions',
 'oil_refinery_throughput',
 'oil_refining_capacity',
 'oil_proved_reserves',
 'gas_proved_reserves',
 'electricity_generation']

In [387]:
concs = [*conc1, *concs2]
concs_name = [*conc1_name, *concs2_name]

In [388]:
concs

['primary_energy_consumption',
 'carbon_dioxide_emissions',
 'oil_refinery_throughput',
 'oil_refining_capacity',
 'oil_proved_reserves',
 'gas_proved_reserves',
 'electricity_generation',
 'fuel_production',
 'fuel_consumption',
 'electricity_generation',
 'production_reserve',
 'installed_capacity']

In [389]:
measures = pd.DataFrame({'concept': concs, 'name': concs_name})

In [390]:
measures['concept_type'] = 'measure'

In [391]:
measures = measures.drop_duplicates(subset='concept')

In [392]:
measures.to_csv('../../ddf--concepts--continuous.csv', index=False)

In [393]:
disc = pd.DataFrame([
    ['name', 'Name', 'string', ''],
    ['year', 'Year', 'time', ''],
    ['geo', 'Geo', 'entity_domain', ''],
    ['unit', 'Unit', 'entity_domain', ''],
    ['fuel', 'Fuel', 'entity_domain', ''],
    ['renewable', 'Renewables', 'entity_set', 'fuel'],
    ['material', 'Material', 'entity_domain', ''],
    ['domain', 'Domain', 'string', '']
], columns=['concept', 'name', 'concept_type', 'domain'])

In [394]:
disc

Unnamed: 0,concept,name,concept_type,domain
0,name,Name,string,
1,year,Year,time,
2,geo,Geo,entity_domain,
3,unit,Unit,entity_domain,
4,fuel,Fuel,entity_domain,
5,renewable,Renewables,entity_set,fuel
6,material,Material,entity_domain,
7,domain,Domain,string,


In [395]:
disc.to_csv('../../ddf--concepts--discrete.csv', index=False)