In [1]:
import pandas as pd
import numpy as np
import xlrd
import os
import re
from tqdm import tqdm

In [2]:
class DataDetailed:
    """Turn sheets of an Excel workbook into long-format ``datapoints_<id>.csv`` files.

    Every ``process_*`` method reads one sheet with ``pandas.read_excel``
    (treating ``'n/a'`` as missing), reshapes it into three columns named
    ``country``, ``year`` and ``value``, strips trailing non-letter characters
    from the ``country`` column and writes the result as CSV under ``out_path``.
    """

    def __init__(self, data_path, out_path, last_year=2018):
        """Store the paths and the static sheet configuration.

        Args:
            data_path: Path of the xlsx workbook handed to ``pandas.read_excel``.
            out_path: String prefix for the output files. It is concatenated
                directly with ``"datapoints_<id>.csv"``, so a directory has to
                include its trailing separator (e.g. ``"csvs/"``).
            last_year: Column label (an int such as ``2018``) of the last year
                column kept by ``process_sheet``; its string form is also the
                year written by ``process_coal_reserves``. Defaults to 2018.
        """
        # path to xlsx file
        self.data_path = data_path
        self.out_path = out_path
        self.last_year = last_year

        # list of sheets (names are kept verbatim, including trailing spaces)
        self.sheets = [
            "Biofuels Production - Kboed",
            "Biofuels Production - Ktoe",
            "Carbon Dioxide Emissions",
            "Coal - Prices",
            "Coal - Reserves",
            "Coal Consumption - Mtoe",
            "Coal Production - Mtoe",
            "Coal Production - Tonnes",
            "Electricity Generation ",
            "Gas - Prices ",
            "Gas - Proved reserves",
            "Gas - Proved reserves history ",
            "Gas Consumption - Bcf",
            "Gas Consumption - Bcm",
            "Gas Consumption - Mtoe",
            "Gas Production - Bcf",
            "Gas Production - Bcm",
            "Gas Production - Mtoe",
            "Geo Biomass Other - Mtoe",
            "Geo Biomass Other - TWh",
            "Geothermal Capacity",
            "Hydro Consumption - Mtoe",
            "Hydro Generation - TWh",
            "Nuclear Consumption - Mtoe",
            "Nuclear Generation - TWh",
            "Oil - Proved reserves",
            "Oil - Proved reserves history",
            "Oil - Refinery throughput",
            "Oil - Refining capacity",
            "Oil - Spot crude prices",
            "Oil Consumption - Barrels",
            "Oil Consumption - Tonnes",
            "Oil Production - Barrels",
            "Oil Production - Tonnes",
            "Primary Energy Consumption",
            "Renewables - Mtoe",
            "Renewables - TWh",
            "Solar Capacity",
            "Solar Consumption - Mtoe",
            "Solar Generation - TWh",
            "Wind Capacity",
            "Wind Consumption - Mtoe",
            "Wind Generation - TWh "
        ]

        # sheets with custom skiprows argument (sheet name -> rows to skip)
        self.names_custom_start_row = {
            "Geothermal Capacity": 3,
            "Solar Capacity": 3,
            "Wind Capacity": 3
        }

    def normalize_country(self, row):
        """Strip trailing non-letter characters from the ``country`` column.

        Despite the parameter name, callers pass a whole DataFrame. The frame
        is modified in place and also returned.

        The pattern removes, at the end of each value, optional whitespace
        followed by any run of characters that are neither ASCII letters nor
        whitespace (e.g. ``"Canada 1"`` -> ``"Canada"``); presumably these are
        footnote markers in the workbook.

        Args:
            row: DataFrame with a string ``country`` column.

        Returns:
            The same DataFrame object with ``country`` cleaned.
        """
        # regex=True is required: since pandas 2.0 ``Series.str.replace``
        # treats the pattern as a literal string by default, which would leave
        # every country name untouched.
        row['country'] = row['country'].str.replace(r'\s*[^A-Za-z\s]*$', '', regex=True)
        return row

    def _read_sheet(self, sh, skiprows):
        """Read one sheet, map ``'n/a'`` to NaN and drop rows that are entirely empty."""
        data = pd.read_excel(self.data_path, na_values=['n/a'],
                             sheet_name=sh,
                             skiprows=skiprows)
        return data.dropna(how='all')

    def _write_datapoints(self, frame, sheet_id):
        """Name the first three columns country/year/value, clean countries, write the CSV."""
        first, second, third = frame.columns[:3]
        frame = frame.rename(columns={first: "country", second: "year", third: "value"})
        frame = self.normalize_country(frame)
        frame.to_csv(self.out_path + "datapoints_%s.csv" % str(sheet_id), index=False)

    def process_sheet(self, sheet_id, sh, skiprows, custom=False):
        """Export a regular country-by-year sheet.

        Rows are kept up to and including the ``'Total World'`` row and columns
        up to and including the ``last_year`` column.

        Args:
            sheet_id: Identifier used in the output file name.
            sh: Name of the sheet to read.
            skiprows: Number of leading rows skipped by ``read_excel``.
            custom: If True, the column named ``"Total proved reserves"`` is
                used as the index instead of the first column.

        Raises:
            KeyError: If the ``'Total World'`` row or the ``last_year`` column
                is missing from the sheet.
        """
        data = self._read_sheet(sh, skiprows)
        index_name = "Total proved reserves" if custom else data.columns[0]
        data = data.set_index(index_name)
        # label-based row slice: everything after 'Total World' is discarded
        data = data[:'Total World']
        last_year = data.columns.get_loc(self.last_year)
        data = data[data.columns[:last_year + 1]]
        self._write_datapoints(data.T.unstack().reset_index(), sheet_id)

    def process_gas_prices(self, sheet_id, sh, skiprows):
        """Export the gas prices sheet.

        Keeps the first column plus every column whose label does not contain
        ``"Unnamed"``, uses the first column as index, drops rows with any
        missing value and then discards the first remaining row.

        Args:
            sheet_id: Identifier used in the output file name.
            sh: Name of the sheet to read.
            skiprows: Number of leading rows skipped by ``read_excel``.
        """
        data = self._read_sheet(sh, skiprows)
        index_name = data.columns[0]
        # str() keeps the filter safe for non-string labels (e.g. int years)
        kept = [index_name] + [c for c in data.columns[1:] if "Unnamed" not in str(c)]
        data = data[kept].set_index(index_name)
        data = data.dropna(how='any')
        data = data.iloc[1:]
        self._write_datapoints(data.unstack().reset_index(), sheet_id)

    def process_coal_prices(self, sheet_id, sh, skiprows):
        """Export the coal prices sheet.

        Drops columns whose label contains ``"Unnamed"`` or equals three
        spaces, uses the first remaining column as index and drops rows with
        any missing value.

        Args:
            sheet_id: Identifier used in the output file name.
            sh: Name of the sheet to read.
            skiprows: Number of leading rows skipped by ``read_excel``.
        """
        data = self._read_sheet(sh, skiprows)
        kept = [c for c in data.columns if "Unnamed" not in str(c) and c != '   ']
        data = data[kept]
        data = data.set_index(data.columns[0]).dropna(how='any')
        self._write_datapoints(data.unstack().reset_index(), sheet_id)

    def process_proved_reserves(self, sheet_id, sh, skiprows):
        """Export a proved-reserves sheet.

        Rows are taken from index label 3 onwards and up to the
        ``'Total World'`` row; only columns whose label contains ``"at end"``
        are kept.

        Args:
            sheet_id: Identifier used in the output file name.
            sh: Name of the sheet to read.
            skiprows: Number of leading rows skipped by ``read_excel``.
        """
        data = self._read_sheet(sh, skiprows)
        index_name = data.columns[0]
        data = data.loc[3:]
        data = data.set_index(index_name)
        data = data[[c for c in data.columns if "at end" in str(c)]]
        data = data[:'Total World']
        self._write_datapoints(data.T.unstack().reset_index(), sheet_id)

    def process_crude_prices(self, sheet_id, sh, skiprows):
        """Export the spot crude prices sheet.

        Rows with index labels 3 to 49 are kept, the first column becomes the
        index and columns whose label contains ``"Unnamed"`` are dropped.

        Args:
            sheet_id: Identifier used in the output file name.
            sh: Name of the sheet to read.
            skiprows: Number of leading rows skipped by ``read_excel``.
        """
        data = self._read_sheet(sh, skiprows)
        index_name = data.columns[0]
        data = data.loc[3:49]
        data = data.set_index(index_name)
        data = data[[c for c in data.columns if "Unnamed" not in str(c)]]
        self._write_datapoints(data.unstack().reset_index(), sheet_id)

    def process_coal_reserves(self, sh, skiprows):
        """Export the coal reserves sheet as three single-year files.

        Rows with index labels 1 to 53 are kept. The columns at positions 1, 2
        and 3 are written to ``datapoints_5.csv``, ``datapoints_6.csv`` and
        ``datapoints_7.csv`` respectively, each with ``country``, ``year`` and
        ``value`` columns; ``year`` is the string form of ``last_year``.

        Args:
            sh: Name of the sheet to read.
            skiprows: Number of leading rows skipped by ``read_excel``.
        """
        data = self._read_sheet(sh, skiprows)
        # copy the slice so the assignments below never touch a view
        data = data.loc[1:53].copy()
        data['year'] = str(self.last_year)
        data = data.rename(columns={data.columns[0]: "country"})
        # normalise once so all three files carry identical country names
        data = self.normalize_country(data)
        for position, variable_id in enumerate("567", start=1):
            value_col = data.columns[position]
            subset = data[['country', 'year', value_col]].rename(columns={value_col: "value"})
            subset.to_csv(self.out_path + "datapoints_%s.csv" % variable_id, index=False)

In [3]:
# Variable lookup table: the driver loop below matches each sheet name against
# its 'name' column and takes the corresponding 'id' for the output file name.
final = pd.read_csv('variables.csv')

In [5]:
dat = DataDetailed('bp_stats.xlsx', "csvs/")

# Build the sheet-name -> id lookup once instead of masking `final` for every
# sheet; when a name occurs more than once the first row wins.
sheet_ids = final.drop_duplicates(subset='name').set_index('name')['id']

for sh in tqdm(dat.sheets):
    # Coal reserves writes its own fixed output ids, so it needs no lookup.
    if sh == "Coal - Reserves":
        dat.process_coal_reserves(sh, 3)
        continue

    # Fail with the offending sheet name rather than an opaque IndexError.
    if sh not in sheet_ids.index:
        raise KeyError("sheet %r has no matching 'name' row in variables.csv" % sh)
    id_val = sheet_ids.loc[sh]

    if sh == "Coal - Prices":
        dat.process_coal_prices(id_val, sh, 1)
    elif sh == "Gas - Prices ":
        dat.process_gas_prices(id_val, sh, 3)
    elif sh in ("Gas - Proved reserves", "Oil - Proved reserves"):
        dat.process_proved_reserves(id_val, sh, 1)
    elif sh == "Oil - Spot crude prices":
        dat.process_crude_prices(id_val, sh, 1)
    elif sh in dat.names_custom_start_row:
        # sheets whose header starts on a different row
        dat.process_sheet(id_val, sh, dat.names_custom_start_row[sh], custom=False)
    else:
        dat.process_sheet(id_val, sh, 2, custom=False)

100%|██████████| 43/43 [01:58<00:00,  3.13s/it]
