# ENGSCI 700 Geothermal Reservoir Optimisation

This workbook is for extracting Contact well data and recreating the plots.

(Unix) Launch with `cd src`, then `jupyter notebook`.

File structure: 
```
(root)
├── src
│    └── Python Test.ipynb
└── wairakei_data
     ├── Liquid wells (version 1).xlsx
     └── short version Generation Projection 2016.xlsx
```

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from IPython.display import display, HTML
import os
import pyjags
import warnings
%matplotlib inline
pyjags.load_module('glm')

# Reference year for numeric dates: dt2num() measures elapsed days from Jan-01 of this year.
base_year = '2000'    # numeric dates calculated from Jan-01
# Path of the config workbook that is opened (or regenerated by write_config) further down.
configpath = '../wairakei_data/config.xlsx'

def read_binary_solution(path='../wairakei_data/toy-network-v4.xlsm'):
    """Read the selected well / flash-plant pairings from Vida's toy model workbook.

    Parses the 'Full LP' sheet of the workbook at ``path`` and keeps the rows
    with more than 50 populated cells. Exactly two rows must survive, because
    they are transposed into the columns 'used' and 'combination'. Every
    combination whose 'used' value equals 1 is split on '-' into a well part
    and a flash-plant part.

    Args:
        path: location of the toy model workbook.

    Returns:
        DataFrame with columns 'well' (prefixed with 'wk') and 'fp'
        (prefixed with 'fp'), one row per used combination.
    """
    # Context manager so the workbook handle is released as soon as it is parsed.
    with pd.ExcelFile(path) as xlfile:
        sheet = xlfile.parse('Full LP')
    # Row filter on populated-cell count; the threshold is arbitrary, anything works.
    sheet = sheet.loc[sheet.count(axis=1) > 50].transpose()
    sheet.columns = ['used', 'combination']
    used_labels = sheet.query('used==1')['combination']
    combinations = pd.DataFrame([label.split('-') for label in used_labels],
                                columns=['well', 'fp'])
    combinations['well'] = 'wk' + combinations['well']
    combinations['fp'] = 'fp' + combinations['fp']
    return combinations

class MyModel(pyjags.Model):
    """
    pyjags.Model child class that accepts data entries the model does not use.

    Per the original author's note, this override exists so that unused data
    variables do not trigger a ValueError; the data dictionary is handed to
    the JAGS console without any unused-variable check.
    """
    def _init_compile(self, data, generate_data):
        """Convert ``data`` to JAGS form and compile the model.

        Args:
            data: mapping of variable name to value, or None for no data.
            generate_data: forwarded unchanged to ``self.console.compile``.
        """
        if data is None:
            data = {}
        data = pyjags.model.dict_to_jags(data)
        # Deliberately no check for data keys missing from the model: extra
        # entries are passed straight through to the compiler.
        self.console.compile(data, self.chains, generate_data)

def dt2num(my_datetime):
    """Return the (fractional) number of days from ``base_year``-01-01 to ``my_datetime``.

    Standard-library ``datetime`` arithmetic is tried first; if that raises,
    the same calculation is repeated with numpy ``datetime64`` /
    ``timedelta64`` values.

    Args:
        my_datetime: a date-like value or container supporting subtraction of
            a datetime (first attempt) or of a ``np.datetime64`` (fallback).

    Returns:
        Elapsed days since the first day of the module-level ``base_year``.
    """
    try:
        date_numeric = (my_datetime - datetime(int(base_year), 1, 1)) / timedelta(days=1)   # datetime implem
    except Exception:
        # Exception rather than a bare except, so Ctrl-C and SystemExit still propagate.
        date_numeric = (my_datetime - np.datetime64(base_year)) / np.timedelta64(1, 'D')  # numpy implem
    return date_numeric

def myprint(df):
    """Show ``df`` in the notebook output, rendered through its HTML table markup."""
    table_markup = df.to_html()
    display(HTML(table_markup))
    
def central(data, m=3.29):
    """Return the entries of ``data`` lying strictly within ``m`` standard deviations of its mean.

    Args:
        data: array-like supporting arithmetic and boolean-mask indexing.
        m: half-width of the accepted band, in standard deviations.
    """
    centre = np.mean(data)
    spread = np.std(data)
    keep = abs(data - centre) < m * spread
    return data[keep]

# Check if Excel file is already in memory (loading is slow)
try:
    xl
except NameError:
    # First run in this kernel: the workbook has not been opened yet.
    xl = pd.ExcelFile('../wairakei_data/Liquid wells (version 1).xlsx')
# Keep only the sheets whose name contains none of the characters F, t, T, ( or L.
sheetlist = [x for x in xl.sheet_names if set(x).isdisjoint('FtT(L')]
print("Sheets:", ', '.join(sheetlist))

## Prepare data

In [None]:
# sheets to load data from
sheets = ['WK27 curve', 'wk247', 'w253', 'w254', 'w258', 'w259', 'w267', 'w268', 'wk255', 'wk256', 'w269', 'WK270', 'WK271', 'WK272']
sheets = sheetlist                                               # overrides the hand-picked list above

dfs = []
for sheet in sheets:
    try:
        df = xl.parse(sheet)                                       # select well data
        df['well'] = sheet                                            # label data with well name
        dfs.append(df)
    except Exception:                                            # not a bare except: Ctrl-C must still interrupt
        print(f'Failed on sheet {sheet}')
df = pd.concat(dfs)

df = df[['date', 'whp', 'mf', 'h', 'well']].copy()               # only keep certain columns (copy: columns are reassigned below)
df['well'] = df['well'].str.lower()                              # remove 'WK' inconsistencies
# regex=True is required: without it pandas >= 2.0 treats the pattern as a literal string
df['well'] = df['well'].str.replace(r"^[^\d]*", "wk", regex=True)  # leading non-digits -> 'wk'
df['well'] = df['well'].str.strip()
df['mf'] = pd.to_numeric(df['mf'], errors='coerce')              # remove 'dummy' entries
df = df.dropna(subset=['date', 'whp', 'mf'])                     # remove NA

df['date_numeric'] = dt2num(df['date'])                          # days since base_year-01-01
regression_df = df.reset_index(drop=True)
wells = regression_df['well'].unique()
print(wells)

## Introduce Flash Plants

In [None]:
# import and process data
try:
    fpxl
except NameError:
    # First run in this kernel: the workbook has not been opened yet.
    fpxl = pd.ExcelFile('../wairakei_data/Data for AU.xlsx')

fpdf = pd.read_excel(fpxl, sheet_name='calculation', header=1, usecols="D:E, J:L, N:P")
fpdf = fpdf.rename(columns={"FP15": "well", "Unnamed: 1": "fp",
                            "hf": "hf_ip", "hg": "hg_ip", "hfg": "hfg_ip",
                            "hf.1": "hf_lp", "hg.1": "hg_lp", "hfg.1": "hfg_lp"})
# make sure it has the necessary data; copy so the column assignments below act on an independent frame
fpdf = fpdf[pd.to_numeric(fpdf['hf_ip'], errors='coerce').notnull()].copy()
for col in ['well', 'fp']:
    fpdf[col] = fpdf[col].str.lower()
# Convert every column that parses cleanly as numeric and leave the others
# untouched (same effect as the deprecated errors='ignore' option).
for col in fpdf.columns:
    try:
        fpdf[col] = pd.to_numeric(fpdf[col])
    except (ValueError, TypeError):
        pass  # non-numeric column: keep it as it is
fpdf.head()

In [None]:
def write_config(configpath):
    """Rebuild the config workbook at ``configpath`` and return it opened.

    Only use if the file gets lost: any existing file at ``configpath`` is
    deleted and rewritten. The workbook gets one sheet per table:
    'well_fp_map', 'fp_gen_map', 'operating_conditions', 'fp_constants' and
    'gen_constants'. The first two and the last are hard-coded below; the
    other two are derived from the module-level ``regression_df`` and
    ``fpdf`` frames.

    Args:
        configpath: destination path of the workbook.

    Returns:
        ``pd.ExcelFile`` opened on the freshly written workbook.
    """
    well_fp_map = pd.DataFrame({'well': ['wk27', 'wk242', 'wk247', 'wk253', 'wk254', 'wk255', 'wk256', 'wk258', 'wk259', 'wk267', 'wk268', 'wk269', 'wk270', 'wk271', 'wk272'],
                                'fp':   ['fp1',  'fp14',  'fp15', 'fp16',  'fp16',  'fp15',  'fp15',  'fp16',  'fp16',  'fp16',  'fp16',  'fp15',  'fp15',  'fp14',  'fp14']},
                               columns=['well', 'fp'])
    fp_gen_map = pd.DataFrame({'fp':     ['abandoned', 'poi dry', 'direct ip', 'fp1', 'fp14', 'fp15', 'fp16', 'fp2', 'fp4', 'fp5', 'fp9-10'],
                               'gen_ip': [ None,       'POI',      None,       'WRK', 'WRK',  'THI',  'POI',  'WRK', 'WRK', 'WRK', 'WRK'   ],
                               'gen_lp': [ None,       'POI',      None,       'WRK', 'WRK',  'THI',  'POI',  'WRK', 'WRK', 'WRK', 'WRK'   ],
                               'gen_w':  [ None,        None,      None,       'BIN',  None,   None,   None,  'BIN', 'BIN', 'BIN', 'BIN'   ]},
                              columns=['fp', 'gen_ip', 'gen_lp', 'gen_w'])
    gen_constants = pd.DataFrame({'gen':    ['WRK', 'THI', 'BIN', 'POI' ],
                                  'ip':     [ True,  True,  False, True ],
                                  'lp':     [ True,  True,  False, True ],
                                  'bin':    [ False, False, True,  False],
                                  'factor': [ 9.2,   8.22,  178.9, 7.76]},  # m3/MW
                                 columns=['gen', 'ip', 'lp', 'bin', 'factor'])
    # find details of the last known operating conditions
    # (idxmax returns index labels, so select with .loc rather than .iloc)
    last_idx = regression_df.groupby('well')['date_numeric'].idxmax()
    operating_conditions = regression_df.loc[last_idx, ['well', 'whp', 'h']]

    # set constants (could use median); numeric_only skips the string 'well' column
    fp_constants = fpdf.groupby('fp').mean(numeric_only=True).reset_index()

    if os.path.exists(configpath):
        os.remove(configpath)
    print("Writing config data to", configpath)
    configdata = {'well_fp_map': well_fp_map,
                  'fp_gen_map': fp_gen_map,
                  'operating_conditions': operating_conditions,
                  'fp_constants': fp_constants,
                  'gen_constants': gen_constants}
    # Write to the requested path (not a hard-coded one); leaving the with-block
    # saves and closes the workbook.
    with pd.ExcelWriter(configpath) as config_writer:
        for sheetname, frame in configdata.items():
            frame.to_excel(config_writer, sheet_name=sheetname, index=False)
    return pd.ExcelFile(configpath)


# Open the config workbook; when it does not exist, regenerate it first.
try:
    config = pd.ExcelFile(configpath)
except FileNotFoundError:
    print("Warning: are you sure you want to overwrite the config file?")
    config = write_config(configpath)

# sheet_name=None reads every sheet into a {sheet name: DataFrame} dict, and
# each entry is then injected into the namespace as a variable of that name.
configdata = pd.read_excel(config, sheet_name=None)
locals().update(configdata)

# The well -> flash plant mapping from the config is replaced by the toy model solution.
well_fp_map = read_binary_solution()
# remove absent wells and fps from data before proceeding
well_fp_map = well_fp_map.loc[lambda d: d['well'].isin(wells)].sort_values('well')
operating_conditions = operating_conditions.loc[lambda d: d['well'].isin(wells)].sort_values('well')
fp_constants = fp_constants.loc[lambda d: d['fp'].isin(well_fp_map['fp'])].sort_values('fp')
fp_gen_map = fp_gen_map.loc[lambda d: d['fp'].isin(well_fp_map['fp'])].sort_values('fp')
gen_constants = gen_constants.loc[lambda d: d['gen'].isin(fp_gen_map.values.ravel())]

In [None]:
def make_data(regression_df=regression_df, well_fp_map=well_fp_map, fp_gen_map=fp_gen_map, op_cond=operating_conditions,
              fp_constants=fp_constants, gen_constants=gen_constants, date_numeric_pred=None):
    """Assemble the data dictionary for the JAGS model, plus naming metadata.

    Wells, flash plants (fps) and generators are each given 1-based integer
    ids in order of first appearance, and the well -> fp and fp -> generator
    mappings are turned into integer index matrices padded with ones,
    together with per-row inflow counts.

    The DataFrame defaults are bound when this definition is executed, so
    re-run the definition after changing the upstream frames.

    Args:
        regression_df: well observations with columns 'well', 'date_numeric',
            'whp' and 'mf'.
        well_fp_map: frame with columns 'well' and 'fp'.
        fp_gen_map: frame whose first column is 'fp' and whose remaining
            columns ('gen_ip', 'gen_lp', 'gen_w') name a generator or are empty.
        op_cond: frame with a 'well' column and a 'whp' column (renamed to
            'whp_pred'); remaining columns are passed through.
        fp_constants: frame with an 'fp' column; remaining columns are passed through.
        gen_constants: frame with columns 'gen' and 'factor'.
        date_numeric_pred: numeric date used for predictions; None (default)
            means "now", evaluated at call time via ``dt2num``.

    Returns:
        (data, metadata): ``data`` maps JAGS variable names to lists, arrays
        and scalars; ``metadata`` holds the 'unique_wells', 'unique_fps' and
        'unique_gens' name lists in id order.
    """
    if date_numeric_pred is None:
        # Resolved per call; a default of dt2num(datetime.now()) in the
        # signature would be frozen at definition time.
        date_numeric_pred = dt2num(datetime.now())

    # make well regression data
    well_df = regression_df[['well', 'date_numeric', 'whp', 'mf']].copy()
    well_df['well_id'], unique_wells = well_df['well'].factorize()
    well_df = well_df.drop(columns='well')
    well_df['well_id'] += 1                                   # JAGS indices start at 1
    unique_wells_dict = {k: v for v, k in enumerate(unique_wells, start=1)}

    # set up current-date well conditions for predictions
    op_df = op_cond.copy().rename(columns={'whp': 'whp_pred'})
    op_df['well_id'] = op_df['well'].replace(unique_wells_dict)
    op_df = op_df.sort_values('well_id').drop(columns=['well', 'well_id'])

    # set up data for flash plant
    fp_df = fp_constants.copy()
    fp_df['fp_id'], unique_fps = fp_df['fp'].factorize()
    fp_df = fp_df.drop(columns='fp')
    fp_df['fp_id'] += 1
    unique_fps_dict = {k: v for v, k in enumerate(unique_fps, start=1)}

    # each fp draws from which wells: one list of well ids per fp_id
    well_map_df = well_fp_map.copy()
    well_map_df['fp_id'] = well_map_df['fp'].replace(unique_fps_dict)
    well_map_df['well_id'] = well_map_df['well'].replace(unique_wells_dict)
    well_map_well_id = well_map_df.groupby('fp_id')['well_id'].apply(list)
    n_fp_inflows = [len(x) for x in well_map_well_id]
    # use ones to avoid JAGS indexing trouble if no inflows (extras ignored with n_inflows)
    well_fp_matrix = np.ones((len(well_map_well_id), max(n_fp_inflows, default=0)), int)
    for i, inflows in enumerate(well_map_well_id):
        well_fp_matrix[i, :n_fp_inflows[i]] = inflows
    well_map_dict = {'n_fp_inflows': n_fp_inflows, 'well_fp_map': well_fp_matrix}

    # set up data for generators
    gen_df = gen_constants[['gen', 'factor']].copy()
    gen_df['gen_id'], unique_gens = gen_df['gen'].factorize()
    gen_df = gen_df.drop(columns='gen')
    gen_df['gen_id'] += 1
    unique_gens_dict = {k: v for v, k in enumerate(unique_gens, start=1)}

    # each gen draws from FPs
    gen_map_df = fp_gen_map.copy()
    gen_map_df.replace(unique_gens_dict, inplace=True)
    gen_map_df.replace(unique_fps_dict, inplace=True)
    gen_maps = [gen_map_df[['fp', k]].groupby(k)['fp'].apply(list).to_frame().rename(columns={'fp': f'{k}'})
                for k in gen_map_df.columns[1:]]
    # NOTE(review): row order follows the index produced by pd.concat, while the
    # JAGS model indexes these rows by generator id - confirm the two line up.
    gen_maps = pd.concat(gen_maps, axis=1)
    # Snapshot the column names first: the loop adds columns to gen_maps.
    for name in list(gen_maps.columns):
        # a cell is either a list of fp ids or NaN (no inflow of that kind)
        gen_maps[f'n_{name}_inflows'] = [np.sum(~np.isnan(x)) for x in gen_maps[name]]
    gen_map_dict = {}
    for k in ['gen_' + p for p in ['ip', 'lp', 'w']]:
        counts = gen_maps[f'n_{k}_inflows'].tolist()
        fp_matrix = np.ones((len(gen_maps), max(1, max(counts))), int)
        for i, inflows in enumerate(gen_maps[f'{k}'].values):
            if counts[i]:
                # rows without inflows keep their placeholder ones (and never
                # receive the NaN that marks "no inflow")
                fp_matrix[i, :counts[i]] = inflows
        gen_map_dict[f'fp_{k}_map'] = fp_matrix
        gen_map_dict[f'n_{k}_inflows'] = counts
    print(gen_map_dict)
    # collate data into one dictionary and add naming metadata
    data = {}
    for frame in (well_df, op_df, fp_df, gen_df):
        data.update(frame.to_dict('list'))
    data.update(well_map_dict)
    data.update(gen_map_dict)
    data.update({'n_data': len(well_df), 'n_wells': len(unique_wells),
                 'n_fps': len(unique_fps), 'n_gens': len(unique_gens),
                 'today_numeric': date_numeric_pred})
    metadata = {'unique_wells': list(unique_wells),
                'unique_fps': list(unique_fps),
                'unique_gens': list(unique_gens)}
    return data, metadata


_ = make_data()[0]

In [None]:
# JAGS model source, passed to MyModel together with the dictionary built by make_data().
# Sections, in order:
#   1. a linear regression of mf on whp and date_numeric for every observation,
#      with per-well coefficients and precision;
#   2. hierarchical priors tying the per-well coefficients to shared hyperparameters;
#   3. a predicted mass flow per well at whp_pred and today_numeric;
#   4. per flash plant: summed inflow, flow-weighted enthalpy, IP/LP steam and water flows;
#   5. per generator: summed steam/water inflows divided by `factor`, and the total `pw`.
# NOTE(review): in the gen_wf line, max(n_gen_w_inflows[l], 1) makes a generator with
# zero water inflows read column 1 of fp_gen_w_map, which make_data() fills with the
# placeholder value 1 - confirm that adding fp_wf[1] there is intended.
code = '''
model {
  # fit individual regressions to wells
  for (i in 1:n_data) {
    mu[i] <- Intercept[well_id[i]] + beta_whp[well_id[i]] * whp[i] + beta_date_numeric[well_id[i]] * date_numeric[i]
    mf[i] ~ dnorm(mu[i], tau[well_id[i]])
  }
  
  # HIERARCHICAL
  for (j in 1:n_wells) {
    Intercept[j] ~ dnorm(mu_Intercept, tau_Intercept)
    beta_whp[j] ~ dnorm(mu_beta_whp, tau_beta_whp)
    beta_date_numeric[j] ~ dnorm(mu_date_numeric, tau_date_numeric)
    tau[j] ~ dgamma(1e-8, 1e-8)
  }
  
  # set hyperparameters
  mu_Intercept ~ dnorm(0, 1e-8)
  mu_beta_whp ~ dnorm(0, 1e-8)
  mu_date_numeric ~ dnorm(0, 1e-8)
  tau_Intercept ~ dgamma(1e-8, 1e-8)
  tau_beta_whp ~ dgamma(1e-8, 1e-8)
  tau_date_numeric ~ dgamma(1e-8, 1e-8)
  
  # ... NOT... HIERARCHICAL?
  #for (j in 1:n_wells) {
  #  Intercept[j] ~ dnorm(0, 1e-8)
  #  beta_whp[j] ~ dnorm(0, 1e-8)
  #  beta_date_numeric[j] ~ dnorm(0, 1e-8)
  #}
  
  # make predictions
  for (i in 1:n_wells) {
    well_mu[i] <- Intercept[i] + beta_whp[i] * whp_pred[i] + beta_date_numeric[i] * today_numeric
    well_mf[i] <- well_mu[i]
  }
  
  # estimate steam at FPs using weighted sums
  for (k in 1:n_fps) {
    fp_mf[k] <- sum(well_mf[well_fp_map[k, 1:n_fp_inflows[k]]])
    denom[k] <- ifelse(fp_mf[k]!=0, fp_mf[k], 1)
    unnormed[k] <- sum(well_mf[well_fp_map[k, 1:n_fp_inflows[k]]] * h[well_fp_map[k, 1:n_fp_inflows[k]]])
    fp_h[k] <- unnormed[k] / denom[k]
    #fp_h[k] <- sum(well_mf[well_fp_map[k, 1:n_fp_inflows[k]]] * h[well_fp_map[k, 1:n_fp_inflows[k]]]) / max(fp_mf[k], 1e-4)
    fp_ip_sf[k] <- (fp_h[k] - hf_ip[k]) / hfg_ip[k] * fp_mf[k]
    fp_lp_sf[k] <- (hf_ip[k] - hf_lp[k]) / hfg_lp[k] * (fp_mf[k] - fp_ip_sf[k])
    fp_sf[k] <- fp_ip_sf[k] + fp_lp_sf[k]
    fp_wf[k] <- fp_mf[k] - fp_sf[k]
  }
  
  # estimate power output
  for (l in 1:n_gens) {
    gen_ip_sf[l] <- sum(fp_ip_sf[fp_gen_ip_map[l, 1:n_gen_ip_inflows[l]]])
    gen_lp_sf[l] <-sum(fp_lp_sf[fp_gen_lp_map[l, 1:n_gen_lp_inflows[l]]])
    gen_wf[l] <- sum(fp_wf[fp_gen_w_map[l, 1:max(n_gen_w_inflows[l], 1)]])
    gen_f[l] <- gen_ip_sf[l] + gen_lp_sf[l] + gen_wf[l]
    gen_pw[l] <- gen_f[l] / factor[l]
  }
  pw <- sum(gen_pw)
}
'''

In [None]:
# pyjags.load_module('glm')
n_chains = 1
burn_in = 1000
myvars = ['well_mf', 'fp_mf', 'fp_h', 'fp_sf', 'gen_pw']

data, metadata = make_data()
model_fp = MyModel(code, data=data, chains=n_chains)
model_fp.sample(burn_in)                      # burn-in draws: result is discarded
samples = model_fp.sample(5000, myvars)
# Reshape every monitored variable into a long frame: one row per draw and
# element, with the element label in 'name' and the value in a column named
# after the variable.
for k, v in list(samples.items()):
    frame = pd.DataFrame(v.squeeze().T)
    legend = None
    # Pick the name list from the variable's prefix ('well_mf' -> unique_wells).
    # Matching on length alone mislabels variables whenever two facility
    # types happen to have the same count, so it is only the fallback for
    # variables with an unknown prefix.
    facility_key = f"unique_{k.split('_')[0]}s"
    if facility_key in metadata:
        candidates = [metadata[facility_key]]
    else:
        candidates = list(metadata.values())
    for names in candidates:
        if len(frame.columns) == len(names):
            frame.columns = names
            legend = names
    frame = frame.melt(var_name='name', value_name=k)
    # Ad-hoc attributes carried on the frame; the plotting cell reads value_name.
    frame.legend = legend
    frame.value_name = k
    samples[k] = frame

In [None]:
print("Plotting...")
ncols = 3
# enough rows to give every sampled variable its own axis (derived from ncols, not a literal 3)
nrows = np.ceil(len(samples) / ncols).astype(int)
fig, axes = plt.subplots(nrows, ncols, figsize=[14, 4*nrows])

# One axis per variable, one density curve per element of that variable.
# The loop variable is not called `df`, so the data-preparation frame is left intact.
for sample_df, ax in zip(samples.values(), fig.axes):
    for name in sample_df['name'].unique():
        # central() trims extreme draws before the density estimate
        # NOTE(review): newer seaborn releases deprecate `shade` in favour of `fill` -
        # switch once the minimum supported seaborn version allows it.
        sns.kdeplot(central(sample_df[sample_df.value_name].loc[sample_df['name']==name]), shade=True, ax=ax)
    for facility in ['well', 'fp', 'gen']:
        if facility in sample_df.value_name:
            ax.legend(metadata[f'unique_{facility}s'])
    ax.set_xlabel(sample_df.value_name)

for ax in fig.axes:
    ax.set_xlim(left=0)
    ax.get_yaxis().set_visible(False)

In [None]:
# Minimal check that MyModel can sum a slice of a data vector.
# Uses its own variable names so the main model's `data` and `samples`
# are not overwritten when this cell is run.
modeltext = '''
model {
a <- sum(x[1:3])
}
'''
toy_data = {'x': [10, 20, 40]}
foo = MyModel(modeltext, toy_data, chains=1)
toy_samples = foo.sample(1, ['a'])
toy_samples

In [None]:
# regression_df.to_excel('../wairakei_data/data.xlsx', 'data', index=False)

# Read dry wells data

In [None]:
# Check if Excel file is already in memory (loading is slow)
try:
    xl2
except NameError:
    # First run in this kernel: the workbook has not been opened yet.
    xl2 = pd.ExcelFile('../wairakei_data/short version Generation Projection 2016.xlsx')
sheetlist = list(xl2.sheet_names)
print("Sheets:", ', '.join(sheetlist))

In [None]:
# Parse the 'AIR' sheet once per kernel session (header on the 4th row).
try:
    air_raw
except NameError:
    air_raw = xl2.parse('AIR', header=3)
air = air_raw.dropna(thresh=10)      # keep rows with at least 10 non-missing cells
air.to_excel('air.xlsx')             # written to the current working directory
air

In [None]:
# Parse the 'SUMMARY' sheet once per kernel session (header on the 4th row).
try:
    summary_raw
except NameError:
    summary_raw = xl2.parse('SUMMARY', header=3)
summary = summary_raw.dropna(thresh=1)   # drop rows that are entirely empty
summary