In [1]:
import pandas as pd
import xlrd
from tqdm import tqdm

In [2]:
class Data:
    """Collects per-sheet metadata (name, unit, notes, id) from an Excel workbook.

    Every call to ``process_sheet`` reads one sheet and appends exactly one
    entry to each of the parallel lists ``names``, ``units``, ``notes`` and
    ``ids``.
    """

    # Regex that marks the first row of a sheet's notes section.
    _NOTE_PATTERN = 'Notes:|Note:'

    # Column searched for notes on sheets processed with ``custom=True``.
    _CUSTOM_NOTES_COLUMN = "Total proved reserves"

    def __init__(self, data_path):
        """Store the workbook path and initialise sheet lists and accumulators.

        Args:
            data_path: Path to the xlsx file. It is only stored here; the file
                is read by ``process_sheet``.
        """
        # path to xlsx file
        self.data_path = data_path

        # list of sheets; each entry is passed verbatim as ``sheet_name`` to
        # pd.read_excel, so trailing spaces in some names are significant
        self.sheets = [
            "Biofuels Production - Kboed",
            "Biofuels Production - Ktoe",
            "Carbon Dioxide Emissions",
            "Coal - Prices",
            "Coal - Reserves",
            "Coal Consumption - Mtoe",
            "Coal Production - Mtoe",
            "Coal Production - Tonnes",
            "Electricity Generation ",
            "Gas - Prices ",
            "Gas - Proved reserves",
            "Gas - Proved reserves history ",
            "Gas Consumption - Bcf",
            "Gas Consumption - Bcm",
            "Gas Consumption - Mtoe",
            "Gas Production - Bcf",
            "Gas Production - Bcm",
            "Gas Production - Mtoe",
            "Geo Biomass Other - Mtoe",
            "Geo Biomass Other - TWh",
            "Geothermal Capacity",
            "Hydro Consumption - Mtoe",
            "Hydro Generation - TWh",
            "Nuclear Consumption - Mtoe",
            "Nuclear Generation - TWh",
            "Oil - Proved reserves",
            "Oil - Proved reserves history",
            "Oil - Refinery throughput",
            "Oil - Refining capacity",
            "Oil - Spot crude prices",
            "Oil Consumption - Barrels",
            "Oil Consumption - Tonnes",
            "Oil Production - Barrels",
            "Oil Production - Tonnes",
            "Primary Energy Consumption",
            "Renewables - Mtoe",
            "Renewables - TWh",
            "Solar Capacity",
            "Solar Consumption - Mtoe",
            "Solar Generation - TWh",
            "Wind Capacity",
            "Wind Consumption - Mtoe",
            "Wind Generation - TWh ",
        ]

        # counter for ids (the first processed sheet gets id 1)
        self.counter = 1

        # parallel accumulators, one entry per processed sheet
        self.names, self.units, self.notes, self.ids = [], [], [], []

        # sheets with custom skiprows argument (sheet name -> skiprows value)
        self.names_custom_start_row = {
            "Coal - Prices": 1,
            "Coal - Reserves": 3,
            "Gas - Prices ": 1,
            "Geothermal Capacity": 3,
            "Oil - Spot crude prices": 3,
            "Solar Capacity": 3,
            "Wind Capacity": 3,
        }

        # sheets with custom index column (sheet name -> unit to record)
        self.names_custom_index = {
            "Gas - Proved reserves": "Trillion cubic metres",
            "Oil - Proved reserves": "Thousand million barrels",
        }

    def _extract_note(self, data, column):
        """Return the notes text found in ``column`` of ``data``.

        The note starts at the first row whose cell matches ``_NOTE_PATTERN``
        and runs to the end of the column; the non-blank cells are joined with
        single spaces. Returns "" when the column is absent or no cell matches.
        """
        # Best effort: a sheet without the expected column simply has no note.
        if column not in data.columns:
            return ""

        cells = data[column]
        # Stringify first so purely numeric columns can be searched without
        # the ``.str`` accessor raising.
        is_note_start = cells.astype(str).str.contains(self._NOTE_PATTERN, na=False)
        note_rows = is_note_start[is_note_start].index
        if len(note_rows) == 0:
            return ""

        # Skip blank cells instead of joining a placeholder, and stringify the
        # rest so one numeric cell cannot discard the whole note.
        tail = cells.loc[note_rows[0]:].dropna()
        return " ".join(tail.astype(str))

    def process_sheet(self, sh, skiprows, custom=False):
        """Read one sheet and record its name, unit, notes and a sequential id.

        Args:
            sh: Sheet name, passed verbatim as ``sheet_name`` to
                ``pd.read_excel``.
            skiprows: Forwarded to ``pd.read_excel`` as ``skiprows``.
            custom: If True, notes are searched in the "Total proved reserves"
                column and the recorded unit comes from
                ``names_custom_index[sh]``. Otherwise the first column's header
                is both the recorded unit and the column searched for notes.

        Raises:
            KeyError: If ``custom`` is True and ``sh`` is not a key of
                ``names_custom_index``.
        """
        data = pd.read_excel(self.data_path, na_values=['n/a'],
                             sheet_name=sh,
                             skiprows=skiprows)
        unit = self._CUSTOM_NOTES_COLUMN if custom else data.columns[0]
        note = self._extract_note(data, unit)

        unit_to_add = self.names_custom_index[sh] if custom else unit

        self.names.append(sh)
        self.units.append(unit_to_add)
        self.notes.append(note)
        self.ids.append(self.counter)
        self.counter += 1

In [None]:
dat = Data('bp_stats.xlsx')

# Per-sheet read settings, in order of precedence:
#   1. sheet has a custom start row -> that row count, regular unit column
#   2. sheet has a custom index     -> skip 1 row, custom unit handling
#   3. anything else                -> skip 2 rows, regular unit column
for sh in tqdm(dat.sheets):
    has_custom_start = sh in dat.names_custom_start_row
    use_custom_index = (not has_custom_start) and sh in dat.names_custom_index
    fallback_skiprows = 1 if use_custom_index else 2
    dat.process_sheet(
        sh,
        dat.names_custom_start_row.get(sh, fallback_skiprows),
        custom=use_custom_index,
    )

 86%|████████▌ | 37/43 [01:21<00:13,  2.17s/it]

In [None]:
# One row per processed sheet; process_sheet appends to all four lists together,
# so they have equal length.
final = pd.DataFrame({
    'id': dat.ids,
    'name': dat.names,
    'unit': dat.units,
    'notes': dat.notes,
})

In [None]:
# Write the collected sheet metadata next to the notebook, without the row index.
final.to_csv(path_or_buf='variables.csv', index=False)