In [None]:
%pylab notebook
import numpy as np
import pandas as pd
import re
pd.options.display.max_rows = 20000

In [None]:
def move_header_to_top(filename):
    """Move the column-header line of a text file to the first line, in place.

    The header is the first line that begins with the text 'DRAINAGE'.
    If it is found below the first line, it is removed from its current
    position, inserted at the top, and the file is rewritten.  If the header
    is already the first line, or no line matches, the file is left untouched.

    Parameters
    ----------
    filename : str
        Path of the file to modify.  The whole file is read into memory.

    Returns
    -------
    None
        Progress messages are printed to stdout.
    """
    # Read the whole file
    with open(filename) as f:
        lines = f.readlines()

    # Find the header and its position; match() anchors at the start of
    # the line, so only lines that begin with 'DRAINAGE' qualify.
    re_header = re.compile(r'DRAINAGE')
    header_pos = next(
        (i for i, line in enumerate(lines) if re_header.match(line)),
        None)

    if header_pos is None:
        print("No header found; file left unchanged.")
        return

    print("Found header at position=%d" % header_pos)
    if header_pos == 0:
        print("Header is already at top of file.")
        return

    # Delete the header from its original position
    # and insert it at the beginning
    lines.insert(0, lines.pop(header_pos))

    # Write out the new file; the context manager guarantees the data is
    # flushed and the handle closed even if a write fails.
    with open(filename, 'w') as f:
        f.writelines(lines)

    print("Moved header to top of file.")

In [None]:
# Change into the directory that holds the calibration output files and list
# its contents.
# NOTE(review): this is an absolute, user-specific path; the notebook will not
# run on another machine without editing it.
%cd /Users/brodzik/projects/CHARIS/charistools_test_data/calibration_stats/
%ls
# Choose the calibration output file to analyze.  Exactly one `filename`
# assignment below should be left uncommented; the others are alternatives.
#filename = 'AM_Vakhsh_calibration-1510950-0.out'
filename = 'IN_Hunza_calibration-1511444-1.out'
#filename = 'GA_Narayani_calibration-1511446-1.out'
#filename = 'GA_SaptaKosi_calibration-1511447-1.out'
#filename

In [None]:
# Optional one-time preprocessing: uncomment to rewrite `filename` in place so
# that its 'DRAINAGE...' header line becomes the first line of the file.
# move_header_to_top(filename)

## Read the calibration stats

Concatenate the DDFs into a model string for each row.  
Since we're calibrating on multiple years, there will be multiple rows (one per year) with the same model string.

In [None]:
# Read the whitespace-delimited calibration stats file.  The raw string keeps
# '\s' as a regular-expression escape rather than an (invalid) string escape.
df = pd.read_table(filename, sep=r'\s+')

# Keep only the identifying columns, the four DDF parameters and the two
# statistics used for ranking.  The explicit copy makes `subdf` independent of
# `df`, so adding columns to it later does not raise SettingWithCopyWarning.
subdf = df[[
    'DRAINAGEID',
    'YYYY',
    'min_snow_ddf',
    'max_snow_ddf',
    'min_ice_ddf',
    'max_ice_ddf',
    'Monthly_rmse_km3',
    'Annual_voldiff_pcent',
]].copy()

In [None]:
# Build one model-identifier string per row by joining the four DDF values
# with underscores; rows that share the same DDFs get the same string.
# assign() returns a new frame instead of writing into a slice of `df`,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
subdf = subdf.assign(
    model=(
        subdf["min_snow_ddf"].map(str) + "_" +
        subdf["max_snow_ddf"].map(str) + "_" +
        subdf["min_ice_ddf"].map(str) + "_" +
        subdf["max_ice_ddf"].map(str)))

Calculate average volDiff and RMSE by modelid (over multiple years)

In [None]:
# Average each statistic over all rows that share a model string.
# The column is selected before .mean() so that only that column is
# aggregated: this avoids averaging every column twice, and avoids the
# TypeError recent pandas versions raise when a grouped frame contains a
# non-numeric column (NOTE(review): DRAINAGEID may be such a column - confirm).
stats_by_model = subdf.groupby(['model'])
mean_vol_diff = stats_by_model['Annual_voldiff_pcent'].mean()
mean_rmse = stats_by_model['Monthly_rmse_km3'].mean()

Collect the averaged stats into a new DataFrame

In [None]:
# One row per model string: mean RMSE first, then mean volume difference.
new = mean_rmse.to_frame().assign(Annual_voldiff_pcent=mean_vol_diff)

In [None]:
# Show the largest and smallest value of each averaged statistic.
extreme_rows = ['max', 'min']
averaged_stats = ['Monthly_rmse_km3', 'Annual_voldiff_pcent']
new.describe().loc[extreme_rows, averaged_stats]

<h2>Now, normalize the two variables so they range from 0.0 to 1.0</h2>

Note that Annual voldiff is signed, and we are looking for a voldiff close to zero (on either side of zero).

For the volume difference, this should map 0.0 to 0.0 and max(|min_vol_diff|, |max_vol_diff|) to 1.0;

for the RMSE, it should map min_rmse to 0.0 and max_rmse to 1.0:


In [None]:
# Take the absolute value of volumetric difference
new['Abs_voldiff'] = np.abs(new['Annual_voldiff_pcent'])
biggest_vol_diff = np.max(new['Abs_voldiff'])
print("biggest_vol_diff=%f" % biggest_vol_diff)

In [None]:
# Extremes of the averaged RMSE across all models.
min_rmse = new['Monthly_rmse_km3'].min()
max_rmse = new['Monthly_rmse_km3'].max()
rmse_span = max_rmse - min_rmse

# Scale |voldiff| by its largest value, and shift/scale RMSE by its range.
new['z_Vol_Diff'] = new['Abs_voldiff'].div(biggest_vol_diff)
new['z_RMSE'] = new['Monthly_rmse_km3'].sub(min_rmse).div(rmse_span)

# Combined score: the sum of the two normalized statistics.
new['z'] = new['z_Vol_Diff'].add(new['z_RMSE'])

The combined statistic (z = z_vol_diff + z_rmse) was calculated above; now check the normalized ranges and find the minimum:


In [None]:
# Report the range of each normalized statistic.
print("min/max z_Vol_Diff=%f - %f" % tuple(new['z_Vol_Diff'].agg(['min', 'max'])))
print("min/max z_RMSE=%f - %f" % tuple(new['z_RMSE'].agg(['min', 'max'])))

In [None]:
# Rank the models from lowest to highest combined score.
# NOTE(review): the name `sorted` shadows Python's built-in sorted(); it is
# kept because the cells that follow refer to it.
sorted = new.sort_values('z')

In [None]:
# Largest and smallest value of each DDF parameter present in the input rows.
print("DDF ranges included in this file:")
ddf_columns = ['min_snow_ddf', 'max_snow_ddf', 'min_ice_ddf', 'max_ice_ddf']
subdf.describe().loc[['max', 'min'], ddf_columns]

In [None]:
# The first row after sorting has the smallest combined score; its index
# label is the model string.
best_row = sorted.iloc[0]
print(best_row)
print("Best model is %s" % best_row.name)

In [None]:
# Plot the combined score and its two components for the first 400 models in
# the ranking, all on one set of axes.
fig, ax = plt.subplots(1)
top_models = sorted.iloc[:400]
for column in ('z', 'z_Vol_Diff', 'z_RMSE'):
    top_models[column].plot(ax=ax)
ax.legend(loc='best')
ax.set_title('Best calibration stats')

In [None]:
# Display the ten models with the lowest combined score.
sorted.head(10)