In [None]:
import glob
import os
from zipfile import BadZipFile

import numpy as np
import pandas as pd

from pysida.lib import DAY_SECONDS

from music_nextsim_tuning import FeatureMerger

In [None]:
# Set up the feature merger and the settings that the column names below
# are derived from.
fm = FeatureMerger()
fm.size_lims = np.array([[lower, lower + 5] for lower in range(0, 25, 5)])
fm.edges_vec = list(range(2, 10))
fm.d_vec = [1, 2, 4, 8]
fm.propnames = ['dissimilarity', 'homogeneity', 'ASM', 'energy', 'correlation', 'contrast']

# Column names of the feature table, in order:
#   fixed div/cnv/she names, one a50_/a90_ name per row of fm.size_lims
#   (lower limit, zero padded), one name per (property, distance) pair
#   (first three letters of the property + distance), then the fixed
#   mom_* and lkf_* names.
feat_col_names = [
    'div_50', 'div_90', 'cnv_50', 'cnv_90', 'she_50', 'she_90',
    *(f'a50_{lims[0]:02}' for lims in fm.size_lims),
    *(f'a90_{lims[0]:02}' for lims in fm.size_lims),
    *(f'{name[:3]}_{dist:02}' for name in fm.propnames for dist in fm.d_vec),
    'mom_1o', 'mom_1s', 'mom_2o', 'mom_2s', 'mom_3o', 'mom_3s',
    'lkf_an', 'lkf_ln', 'lkf_no',
]
print(len(feat_col_names), feat_col_names)

In [None]:
# EXP_01
# Member '*pairs.npz' files are globbed from `idir`; the feature and label
# pickles are written to `odir` (the same directory for this experiment).
idir = odir = './music_matrix/cfg01_m20'
exp_name = 'tru_cfg_01'
# Passed to fm.get_param_vals and reused as the label column names.
param_names = ['compression_factor', 'C_lab', 'nu0', 'tan_phi']
# Passed to fm.load_data for every member file.
skip = ['mat09']

In [None]:
# READ ALL FEATURES FROM ALL MEMBERS
pfiles = sorted(glob.glob(f'{idir}/*pairs.npz'))
if not pfiles:
    # Fail early with a clear message instead of an IndexError on pfiles[0].
    raise FileNotFoundError(f'No *pairs.npz files found in {idir}')
print(len(pfiles), pfiles[0], pfiles[-1])

# Per-member results, keyed by the member id parsed from the file name.
features_n = {}
dates_n = {}
for pfile in pfiles:
    try:
        pairs, defor, aniso, props, momes, lkfs, dates = fm.load_data(pfile, skip)
    except (ValueError, BadZipFile) as e:
        # A member that cannot be loaded is reported and left out; the
        # remaining members are still processed.
        print(e)
        print(pfile, 'is not processed')
    else:
        features = fm.read_features(pairs, defor, aniso, props, momes, lkfs, dates)
        features, dates = fm.get_valid_features_dates(features)
        # Member id = file-name part before the first '_' with 'mat' removed.
        # basename() is used so the result does not depend on the path
        # separator returned by glob on the current platform.
        member_id = os.path.basename(pfile).split('_')[0].replace('mat', '')
        features_n[member_id] = features
        dates_n[member_id] = dates

In [None]:
# GENERATE and SAVE TRAINING DATA FROM NEXTSIM
param_vals = fm.get_param_vals(exp_name, param_names)
training_features, training_labels = fm.merge_features_labels(
    param_vals,
    features_n,
    dates_n,
    param_names,
)

# The feature table has the feature columns followed by a 'date' column;
# the label table has one column per parameter name.
trn_f_df = pd.DataFrame(training_features, columns=[*feat_col_names, 'date'])
trn_l_df = pd.DataFrame(training_labels, columns=param_names)

# Store both tables as pickles in the output directory.
trn_f_df.to_pickle(f'{odir}/ftrs.pickle')
trn_l_df.to_pickle(f'{odir}/lbls.pickle')

In [None]:
# TEST READING
# Load the two pickles written to `odir` back into data frames.
# Optional post-processing that is currently switched off:
#   features: .drop(columns=['date']).astype(float)
#   labels:   .astype(float)
ftrs_path = f'{odir}/ftrs.pickle'
lbls_path = f'{odir}/lbls.pickle'
inp_ftrs = pd.read_pickle(ftrs_path)
inp_lbls = pd.read_pickle(lbls_path)

In [None]:
# READ AND SAVE RGPS DATA
pfile = './music_matrix/rgps/w98_may_pairs.npz'
# The output pickle sits next to the input: '..._pairs.npz' -> '..._ftrs.pickle'.
ofile = pfile.replace('pairs.npz', 'ftrs.pickle')
# Kept under its own name so the experiment-level `skip` list used when
# reading the ensemble members is not overwritten by running this cell.
rgps_skip = 'skip'
pairs, defor, aniso, props, momes, lkfs, dates = fm.load_data(pfile, rgps_skip, skip_lkfs=True)
features = fm.read_features(pairs, defor, aniso, props, momes, lkfs, dates)
features, dates = fm.get_valid_features_dates(features)
# Append the dates as one extra column to the right of the features so the
# table matches the feature column names plus 'date'.
rgps_df = pd.DataFrame(np.hstack([features, np.array(dates)[None].T]), columns=feat_col_names + ['date'])
print(ofile)
pd.to_pickle(rgps_df, ofile)

In [None]:
# Read back the RGPS feature pickle written to `ofile` by the cell above, so
# this check does not rely on a file from a different run being on disk.
# Optional post-processing that is currently switched off follows the call.
inp_rgps = pd.read_pickle(ofile)#.drop(columns=['date']).astype(float)
