In [None]:
%pylab notebook
from __future__ import print_function
import numpy as np
import pandas as pd
import re
import sys
pd.options.display.max_rows = 100

In [None]:
def move_header_to_top(filename, verbose=False):
    """Rewrite ``filename`` in place so that its header line comes first.

    The header is the first line that starts with ``DRAINAGE``. If that line
    is not already the first line of the file, it is removed from its current
    position and re-inserted at the top; all other lines keep their relative
    order. The file is only rewritten when the header actually has to move.

    Parameters
    ----------
    filename : str
        Path of the text file to fix; it is modified in place.
    verbose : bool, optional
        If True, print what was (or was not) done to the file.

    Returns
    -------
    None
    """
    # Read the whole file
    with open(filename) as f:
        lines = f.readlines()

    # Find the position of the first line that starts with the header token
    # (re.match anchors at the beginning of the line)
    re_header = re.compile(r'DRAINAGE')
    header_pos = next(
        (i for i, line in enumerate(lines) if re_header.match(line)), None)

    if header_pos is None:
        if verbose:
            print("%s: No header found, file left unchanged." % filename)
        return

    print("Found header at position=%d" % header_pos)

    if header_pos == 0:
        # Nothing to move, so do not truncate and rewrite the file needlessly
        if verbose:
            print("%s: Header already at top of file." % filename)
        return

    # Delete the header from its original position
    # and insert it at the beginning
    lines.insert(0, lines.pop(header_pos))

    # Write out the new file; the context manager guarantees the data is
    # flushed and the handle closed even if a write fails
    with open(filename, 'w') as f:
        f.writelines(lines)

    if verbose:
        print("%s: Moved header to top of file." % filename)

In [None]:
def cat_and_remove_dup_headers(outfilename, filenames):
    """Concatenate several files into one, keeping a single header line.

    In each input file the header is the first line that starts with
    ``DRAINAGE``. That line is removed from every input; the header taken
    from the first input file is written as the first line of the output.
    All remaining lines are copied in order, file by file. An input file
    with no header line is copied unchanged.

    Parameters
    ----------
    outfilename : str
        Path of the concatenated file to create (overwritten if it exists).
        It is opened for writing before any input is read, so it must not
        also appear in ``filenames``.
    filenames : iterable of str
        Paths of the input files, in the order they should be concatenated.

    Returns
    -------
    None
    """
    re_header = re.compile(r'DRAINAGE')

    # The context manager flushes and closes the output even on error
    with open(outfilename, 'w') as outFile:

        for i, filename in enumerate(filenames):

            with open(filename) as f:
                lines = f.readlines()

            # Find the header position; None means this file has no header,
            # in which case no line may be deleted
            header_pos = next(
                (j for j, line in enumerate(lines) if re_header.match(line)),
                None)

            if header_pos is None:
                print("%s: No header found, copying all lines" % filename)
            else:
                # Delete the header from its original position
                print("%s: Deleting header at position=%d" % (filename, header_pos))
                header = lines.pop(header_pos)

                # Insert the header back at the beginning of the first file only
                if i == 0:
                    print("%s: Writing header to top of file" % filename)
                    lines.insert(0, header)

            outFile.writelines(lines)


In [None]:
# Change into the working directory and list its contents.
# NOTE(review): this is an absolute, user-specific path, so the cell only
# runs as-is on a machine that has this directory.
%cd /Users/brodzik/charis/calibration_stats/modscag_v09_3strikes
%ls
# File processed by the cells below; the commented-out assignments are
# alternative inputs that can be enabled instead.
filename = 'AM_Vakhsh_calibration.out'
#filename = 'IN_Hunza_calibration.out'
#filename = 'GA_Narayani_calibration-1511446-1.out'
#filename = 'GA_SaptaKosi_calibration-1511447-1.out'
#filename

In [None]:
# Modify the selected file on disk so that its DRAINAGE header line comes first
move_header_to_top(filename)

In [None]:
def _normalize_by(values, scale):
    """Return ``values / scale``, or zeros when ``scale`` is exactly zero.

    A zero scale means every model scored the same on this statistic, so the
    statistic cannot discriminate between models; dividing would fill the
    column with NaN and make the combined score unusable.
    """
    if scale == 0:
        # Multiplying keeps the index and propagates any NaN already present
        return values * 0.0
    return values / scale


def find_best_model(filename, verbose=False):
    """Rank the calibration models in ``filename`` by a combined error score.

    The file is read as a whitespace-delimited table. Each distinct
    combination of (min_snow_ddf, max_snow_ddf, min_ice_ddf, max_ice_ddf) is
    one model. Monthly RMSE and annual volume difference are averaged per
    model over all its rows, each mean is scaled to the range 0..1, and the
    two scaled values are summed into ``z``. The model with the smallest
    ``z`` is the best one.

    Parameters
    ----------
    filename : str
        Path of the calibration statistics file. Its first line must hold
        the column names, including ``DRAINAGEID``, ``YYYY``, the four DDF
        columns, ``Monthly_rmse_km3`` and ``Annual_voldiff_pcent``.
    verbose : bool, optional
        If True, print a summary of the input and of the best model.

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by the model string
        ``"<min_snow>_<max_snow>_<min_ice>_<max_ice>"`` and sorted by
        ascending ``z``. Columns, in order: ``Monthly_rmse_km3``,
        ``Annual_voldiff_pcent``, ``Abs_voldiff``, ``z_Vol_Diff``,
        ``z_RMSE``, ``z``.
    """
    if verbose:
        print("Finding best model from: %s" % filename, file=sys.stdout)

    # Read the calibration stats and keep only the columns we plan to use.
    # The explicit copy makes subdf independent of df, so adding a column
    # below does not write into a slice of another frame.
    df = pd.read_table(filename, sep=r'\s+')
    subdf = df[['DRAINAGEID', 'YYYY',
                'min_snow_ddf', 'max_snow_ddf', 'min_ice_ddf', 'max_ice_ddf',
                'Monthly_rmse_km3', 'Annual_voldiff_pcent']].copy()

    # Make a new column with DDFs concatenated into a model string
    # (when multiple years are used for calibrations there will be multiple
    # rows with the same model string)
    subdf["model"] = (
        subdf["min_snow_ddf"].map(str) + "_" +
        subdf["max_snow_ddf"].map(str) + "_" +
        subdf["min_ice_ddf"].map(str) + "_" +
        subdf["max_ice_ddf"].map(str))

    # Remove duplicates that may have been produced on multiple
    # calibration runs
    nrows_before = subdf.shape[0]
    subdf = subdf[~subdf.duplicated(keep='first')]
    if verbose:
        print("\nNumber of duplicate rows removed=%d" % (nrows_before - subdf.shape[0]))

    # Average RMSE and volDiff by model (over multiple years). Only the two
    # statistics are selected before averaging: taking the mean of every
    # column would also try to average the non-numeric DRAINAGEID column.
    stats_df = subdf.groupby(['model'])[
        ['Monthly_rmse_km3', 'Annual_voldiff_pcent']].mean()

    # Now, normalize the two variables so they range from 0.0 to 1.0
    # Note that Annual voldiff is signed, and we are looking for
    # voldiff close to zero (on either side of zero).
    # This should map 0. voldiff to 0. and max(|min_vol_diff|,|max_vol_diff|) to 1.0
    # and min_rmse to 0. and max_rmse to 1.0
    stats_df['Abs_voldiff'] = np.abs(stats_df['Annual_voldiff_pcent'])
    biggest_vol_diff = np.max(stats_df['Abs_voldiff'])

    min_rmse = np.min(stats_df['Monthly_rmse_km3'])
    max_rmse = np.max(stats_df['Monthly_rmse_km3'])

    stats_df['z_Vol_Diff'] = _normalize_by(stats_df['Abs_voldiff'],
                                           biggest_vol_diff)
    stats_df['z_RMSE'] = _normalize_by(stats_df['Monthly_rmse_km3'] - min_rmse,
                                       max_rmse - min_rmse)

    # Combined statistic (z_vol_diff + z_rmse); its minimum is the best model
    stats_df['z'] = stats_df['z_Vol_Diff'] + stats_df['z_RMSE']

    if verbose:
        print("\nStatistics ranges in this file:")
        print(stats_df.describe().loc[['max', 'min'],
                                      ['Monthly_rmse_km3', 'Annual_voldiff_pcent', 'z_Vol_Diff', 'z_RMSE']])

    # Best (smallest z) model first
    stats_df = stats_df.sort_values(by=['z'], ascending=True)

    # Summarize results
    if verbose:
        print("\nDDF ranges included in this file:")
        print(subdf.describe().loc[['max', 'min'], ['min_snow_ddf', 'max_snow_ddf', 'min_ice_ddf', 'max_ice_ddf']])

        # Dump the stats for the model that minimizes z
        uniq_models = set(stats_df.index)
        print()
        print(stats_df.iloc[0])
        print("\nNumber of models considered: %d" % len(uniq_models))
        print("\nBest model is %s" % stats_df.index[0])

    return stats_df

In [None]:
# Rank the models in the selected file; verbose=True also prints the summary
stats_df = find_best_model(filename, verbose=True)

In [None]:
# Overlay the combined score and its two normalized components for the
# first 400 rows of stats_df on a single set of axes
fig, ax = plt.subplots(1)
for column in ('z', 'z_Vol_Diff', 'z_RMSE'):
    stats_df[column][:400].plot(ax=ax)
ax.legend(loc='best')
ax.set_title('Best calibration stats')