In [None]:
import os
import glob
from datetime import datetime
from zipfile import BadZipFile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from pysida.lib import DAY_SECONDS

In [None]:
# Half-open size bins [lower, upper) used to group anisotropy values by
# element size: five consecutive bins of width 5 covering 0 to 25.
size_lims = np.array([[lower, lower + 5] for lower in range(0, 25, 5)])

# Edge counts whose 'ani|<edges>' / 'siz|<edges>' entries are read from the
# anisotropy data.
edges_vec = list(range(2, 10))

# Values combined with every texture property when the texture feature
# columns are named (presumably the distances of the texture analysis --
# TODO confirm).
d_vec = [1, 2, 4, 8]

# Texture property names; their first three letters prefix the texture
# feature columns.
propnames = [
    'dissimilarity', 'homogeneity', 'ASM',
    'energy', 'correlation', 'contrast',
]


def get_input_files(pfile):
    """Return the six input files of one member after checking they exist.

    The companion files are found by swapping the trailing ``_pairs.npz`` of
    ``pfile`` for ``_defor.npz``, ``_aniso.npz``, ``_texture.npz``,
    ``_scale.npz`` and ``_lkf_stats.npz``.

    Parameters
    ----------
    pfile : str
        Path of the pairs file; it must end with ``_pairs.npz``.

    Returns
    -------
    list of str
        Paths in the order pairs, defor, aniso, texture, scale, lkf_stats.

    Raises
    ------
    ValueError
        If ``pfile`` does not end with ``_pairs.npz`` or if one of the six
        files does not exist. The message names the offending file.
    """
    print(pfile)
    pairs_suffix = '_pairs.npz'
    # Without the suffix every derived name would equal pfile itself, so the
    # problem is reported here instead of failing later while reading.
    if not pfile.endswith(pairs_suffix):
        raise ValueError(f'{pfile} does not end with {pairs_suffix}')
    stem = pfile[:-len(pairs_suffix)]

    suffixes = [
        pairs_suffix,
        '_defor.npz',
        '_aniso.npz',
        '_texture.npz',
        '_scale.npz',
        '_lkf_stats.npz',
    ]
    input_files = [stem + suffix for suffix in suffixes]
    for input_file in input_files:
        if not os.path.exists(input_file):
            # ValueError is kept because the reading loop catches it to skip
            # incomplete members; the message tells which file is missing.
            raise ValueError(f'Missing input file: {input_file}')
    return input_files

def load_data(pfile, skip):
    """Load all inputs of one member from its ``*_pairs.npz`` file family.

    Parameters
    ----------
    pfile : str
        Path of the pairs file. The part of its base name before the first
        underscore identifies the member.
    skip : list of str, str or None
        Member names that must not be loaded. A single string is treated as
        one name (not as a set of substrings); None means nothing is skipped.

    Returns
    -------
    tuple
        ``(pairs, defor, aniso, props, momes, lkfs, dates)`` where ``lkfs`` is
        a DataFrame built from the ``lkf_stats`` dictionary and indexed by its
        ``'dates'`` entry, and ``dates`` is a list read from the scale file.

    Raises
    ------
    ValueError
        If the member is listed in ``skip`` (or, via ``get_input_files``, if
        an input file is missing).
    """
    member = os.path.basename(pfile).split('_')[0]
    if skip is None:
        skip_names = []
    elif isinstance(skip, str):
        # `member in "some string"` would be a substring test, which skips
        # unrelated members by accident.
        skip_names = [skip]
    else:
        skip_names = skip
    if member in skip_names:
        raise ValueError(f'Skip {pfile}: member {member!r} is in the skip list')

    pfile, deforfile, anisofile, propfile, scalingfile, lkfsfile = get_input_files(pfile)
    # NOTE(security): allow_pickle=True unpickles arbitrary Python objects;
    # only load files that come from a trusted source.
    with np.load(pfile, allow_pickle=True) as f:
        pairs = f['pairs']
    with np.load(deforfile, allow_pickle=True) as f:
        defor = f['defor']
    with np.load(anisofile, allow_pickle=True) as f:
        aniso = f['aniso']
    with np.load(propfile, allow_pickle=True) as f:
        props = f['props']
    with np.load(scalingfile, allow_pickle=True) as f:
        momes = f['mmm']
        dates = list(f['dates'])
    with np.load(lkfsfile, allow_pickle=True) as f:
        # 'lkf_stats' is stored as a 0-d object array wrapping a dictionary
        lkfs = f['lkf_stats'].item()
        lkfs = pd.DataFrame(lkfs, index=lkfs['dates'])
    return pairs, defor, aniso, props, momes, lkfs, dates

def get_defor_stats(pairs, defor, pair_indeces):
    """Median and 90th percentile of log10 deformation over selected pairs.

    For every selected pair the ``e1`` and ``e2`` fields of its deformation
    are masked with the pair's ``g`` attribute and multiplied by
    ``DAY_SECONDS`` (presumably a conversion from per-second to per-day
    rates -- TODO confirm). Three samples are pooled over all selected pairs:
    log10 of positive ``e1``, log10 of minus the negative ``e1`` and log10 of
    positive ``e2``.

    Parameters
    ----------
    pairs, defor : sequences indexed by pair
        Pair objects (with ``g``) and deformation objects (with ``e1``,
        ``e2``).
    pair_indeces : iterable of int
        Indices of the pairs to pool.

    Returns
    -------
    numpy.ndarray
        Six values: the [50, 90] percentiles of the three samples, in the
        order listed above.
    """
    log_e1_positive = []
    log_e1_negative = []
    log_e2_positive = []
    for idx in pair_indeces:
        pair = pairs[idx]
        deformation = defor[idx]
        e1_scaled = deformation.e1[pair.g] * DAY_SECONDS
        e2_scaled = deformation.e2[pair.g] * DAY_SECONDS
        log_e1_positive.append(np.log10(e1_scaled[e1_scaled > 0]))
        log_e1_negative.append(np.log10(-e1_scaled[e1_scaled < 0]))
        log_e2_positive.append(np.log10(e2_scaled[e2_scaled > 0]))

    percentiles = [50, 90]
    return np.hstack([
        np.percentile(np.hstack(sample), percentiles)
        for sample in (log_e1_positive, log_e1_negative, log_e2_positive)
    ])

def get_aniso_stats(pairs, defor, aniso, pair_indeces):
    """Median and 90th percentile of anisotropy in every size bin.

    For each selected pair and each edge count in ``edges_vec`` the
    ``'ani|<edges>'`` values that are finite and below 1 are pooled together
    with the matching ``'siz|<edges>'`` values. Sizes are transformed with
    ``sqrt(siz / 2) / 1000`` and only samples whose transformed size exceeds
    ``2 * edges - 1`` are kept. The remaining anisotropy values are grouped
    into the half-open bins of ``size_lims``.

    Parameters
    ----------
    pairs, defor
        Unused; kept so that existing calls keep working.
    aniso : sequence indexed by pair
        Mapping-like anisotropy results per pair, or None / empty when a pair
        has none.
    pair_indeces : iterable of int
        Indices of the pairs to pool.

    Returns
    -------
    numpy.ndarray
        ``2 * len(size_lims)`` values: the medians of all bins followed by the
        90th percentiles of all bins. Bins without samples hold NaN; if no
        selected pair has any anisotropy data every value is NaN (instead of
        failing on an empty concatenation).
    """
    ani_chunks = []
    siz_chunks = []
    edg_chunks = []

    for i in pair_indeces:
        a = aniso[i]
        if a is None or len(a) == 0:
            continue
        for edges in edges_vec:
            ani_key = f'ani|{edges}'
            if ani_key not in a:
                continue
            ani = a[ani_key]
            keep = np.isfinite(ani) & (ani < 1)
            ani_chunks.append(ani[keep])
            siz_chunks.append(a[f'siz|{edges}'][keep])
            # remember which edge count every retained sample belongs to
            edg_chunks.append(np.full(np.count_nonzero(keep), float(edges)))

    n_bins = len(size_lims)
    if not ani_chunks:
        # Nothing to pool: NaN statistics let the caller drop this date
        # together with the other non-finite feature vectors.
        return np.full(2 * n_bins, np.nan)

    ani_all = np.hstack(ani_chunks)
    siz_all = ((np.hstack(siz_chunks) / 2) ** 0.5) / 1000
    edg_all = np.hstack(edg_chunks)
    # ani_all is already finite and < 1; only the size criterion remains
    keep = siz_all > (2 * edg_all - 1)
    ani_all = ani_all[keep]
    siz_all = siz_all[keep]

    ani_p50 = []
    ani_p90 = []
    for lower, upper in size_lims:
        in_bin = (siz_all >= lower) & (siz_all < upper)
        if not in_bin.any():
            ani_p50.append(np.nan)
            ani_p90.append(np.nan)
        else:
            ani_p50.append(np.median(ani_all[in_bin]))
            ani_p90.append(np.percentile(ani_all[in_bin], 90))
    return np.hstack([ani_p50, ani_p90])

def get_texture_stats(props, pair_indeces):
    """Average the texture properties of the selected pairs.

    ``props`` is indexed along its first axis with ``pair_indeces``; the
    selection is averaged over that axis and returned as a flat array.
    """
    selected = props[pair_indeces]
    averaged = selected.mean(axis=0)
    return averaged.flatten()

def get_lkf_stats(lkfs, dst_date):
    """Return the ``angles``, ``lengths`` and ``counts`` entries of one date.

    ``lkfs`` is looked up by label with ``dst_date``; the three values are
    returned as a tuple in that order.
    """
    row = lkfs.loc[dst_date]
    return tuple(row[column] for column in ('angles', 'lengths', 'counts'))

def read_features(pairs, defor, aniso, props, momes, lkfs, dates):
    """Build one feature vector per date of ``dates`` that has pairs.

    Every pair is assigned to the calendar day of its ``d0`` attribute. For
    each entry of ``dates`` that matches at least one pair and whose ``momes``
    entry is not None, the feature vector is the concatenation of the
    deformation statistics, the anisotropy statistics, the texture
    statistics, the flattened ``'c'`` entry of ``momes`` and the LKF values
    sampled from ``lkfs`` (reindexed to ``dates`` and linearly interpolated).

    Returns
    -------
    dict
        Maps each processed date to its 1-D feature vector.
    """
    # calendar day of every pair; falsy (missing) pairs give None
    pair_days = []
    for pair in pairs:
        if pair:
            start = pair.d0
            pair_days.append(datetime(start.year, start.month, start.day))
        else:
            pair_days.append(None)

    # position of every pair's day inside `dates`, -1 when it is not listed
    day_positions = np.array([
        dates.index(day) if day in dates else -1
        for day in pair_days
    ])
    # distinct positions that are really present in `dates`
    used_positions = np.unique(day_positions)
    used_positions = used_positions[used_positions != -1]

    # LKF table aligned with `dates` so that it can be sampled per date
    lkfs_on_dates = lkfs.reindex(dates).interpolate(method='linear')

    features = {}
    for position in used_positions:
        moments = momes[position]
        if moments is None:
            continue
        dst_date = dates[position]
        members = np.where(day_positions == position)[0]
        parts = [
            get_defor_stats(pairs, defor, members),
            get_aniso_stats(pairs, defor, aniso, members),
            get_texture_stats(props, members),
            moments['c'].flatten(),
            get_lkf_stats(lkfs_on_dates, dst_date),
        ]
        features[dst_date] = np.hstack(parts)

    return features

def get_valid_features_dates(features):
    """Keep only the feature vectors that contain no NaN or infinity.

    Parameters
    ----------
    features : dict
        Maps a date to a 1-D feature vector; all vectors are stacked as rows.

    Returns
    -------
    valid_features : numpy.ndarray
        Rows whose sum is finite.
    valid_dates : list
        Keys of ``features`` belonging to those rows, in the same order.
    """
    all_dates = np.array(list(features.keys()))
    stacked = np.vstack([features[key] for key in features])
    # a single non-finite element makes the row sum non-finite
    row_is_finite = np.isfinite(stacked.sum(axis=1))
    valid_rows = np.flatnonzero(row_is_finite)
    return stacked[valid_rows], list(all_dates[valid_rows])

def get_param_vals(exp_name, param_names):
    """Read selected parameter values from the config files of an experiment.

    The files ``run_experiment/<exp_name>/sa10free_mat*cfg`` (relative to the
    current working directory) are parsed line by line. A line is used when
    the text before its first ``=`` equals one of ``param_names`` after
    surrounding whitespace is removed, so both ``nu0=1`` and ``nu0 = 1`` are
    recognised. Lines without ``=`` are ignored.

    Parameters
    ----------
    exp_name : str
        Name of the experiment directory below ``run_experiment``.
    param_names : list of str
        Names of the parameters to extract.

    Returns
    -------
    dict
        Maps the member number (file name part after the first underscore
        with ``mat`` removed, e.g. ``'01'``) to a ``{name: float}`` dictionary
        of the parameters found in that file.

    Raises
    ------
    ValueError
        If the value of a requested parameter cannot be converted to float.
    """
    param_vals = {}
    tru_cfg_files = sorted(glob.glob(f'run_experiment/{exp_name}/sa10free_mat*cfg'))
    for tru_cfg_file in tru_cfg_files:
        file_name = os.path.basename(tru_cfg_file)
        exp_num = file_name.split('.')[0].split('_')[1].replace('mat', '')
        found = {}
        with open(tru_cfg_file) as f:
            for line in f:
                parts = line.split('=')
                if len(parts) < 2:
                    # section headers, comments, blank lines
                    continue
                key = parts[0].strip()
                if key in param_names:
                    found[key] = float(parts[1].strip())
        param_vals[exp_num] = found
    return param_vals

def merge_features_labels(param_vals, features_n, dates_n, param_names=None):
    """Stack the features of all members and repeat their labels per row.

    Parameters
    ----------
    param_vals : dict
        Maps a member number to a ``{parameter name: value}`` dictionary.
    features_n : dict
        Maps a member number to a 2-D array with one feature vector per row.
    dates_n : dict
        Maps a member number to the list of dates of those rows.
    param_names : list of str, optional
        Order of the label columns. When omitted, the notebook-level
        ``param_names`` variable is used, as before this argument existed.

    Returns
    -------
    training_features : numpy.ndarray
        Feature rows of all members that appear in both ``param_vals`` and
        ``features_n``, with the date appended as last column.
    training_labels : numpy.ndarray
        One row of parameter values per feature row.

    Raises
    ------
    ValueError
        If no member has both parameter values and features.
    KeyError
        If a member with features lacks one of the requested parameters.
    """
    if param_names is None:
        # Backward compatibility with callers that rely on the global name
        try:
            param_names = globals()['param_names']
        except KeyError:
            raise NameError("name 'param_names' is not defined") from None

    training_features = []
    training_labels = []

    for exp_num, exp_params in param_vals.items():
        if exp_num not in features_n:
            continue
        param_vec = [exp_params[param_name] for param_name in param_names]
        feature_vecs = features_n[exp_num]
        date_column = np.array(dates_n[exp_num])[None].T
        training_features.append(np.hstack([feature_vecs, date_column]))
        # np.tile keeps the (n_rows, n_params) shape even for n_rows == 0,
        # so a member without valid rows no longer breaks the stacking.
        training_labels.append(np.tile(param_vec, (len(feature_vecs), 1)))

    if not training_features:
        raise ValueError('No experiment has both parameter values and features')

    training_features = np.vstack(training_features)
    training_labels = np.vstack(training_labels)
    return training_features, training_labels

In [None]:
# EXP_01: input/output directories, experiment name, parameters used as
# labels and members excluded from reading
idir = '../../music_matrix/cfg01_m20'
odir = '../../music_matrix/cfg01_m20'
exp_name = 'tru_cfg_01'
param_names = ['compression_factor', 'C_lab', 'nu0', 'tan_phi']
skip = ['mat09']

# Column names of one feature vector, in the order in which read_features
# concatenates the statistics: deformation, anisotropy (median then 90th
# percentile per size bin), texture (property x d_vec value), scaling and LKF.
feat_col_names = [
    'div_50', 'div_90', 'cnv_50', 'cnv_90', 'she_50', 'she_90',
    *[f'a50_{sl[0]:02}' for sl in size_lims],
    *[f'a90_{sl[0]:02}' for sl in size_lims],
    *[f'{propname[:3]}_{d:02}' for propname in propnames for d in d_vec],
    'mom_1o', 'mom_1s', 'mom_2o', 'mom_2s', 'mom_3o', 'mom_3s',
    'lkf_an', 'lkf_ln', 'lkf_no',
]

print(len(feat_col_names), feat_col_names)

In [None]:
# READ ALL FEATURES FROM ALL MEMBERS
# The pattern carries the underscore that get_input_files relies on when it
# derives the companion file names.
pfiles = sorted(glob.glob(f'{idir}/*_pairs.npz'))
if not pfiles:
    # fail with a clear message instead of an IndexError on pfiles[0]
    raise FileNotFoundError(f'No *_pairs.npz files found in {idir}')
print(len(pfiles), pfiles[0], pfiles[-1])

# valid feature rows and their dates, keyed by member number (e.g. '01')
features_n = {}
dates_n = {}
for pfile in pfiles:
    try:
        pairs, defor, aniso, props, momes, lkfs, dates = load_data(pfile, skip)
    except (ValueError, BadZipFile) as e:
        # skipped member, missing input file or corrupted archive
        print(e)
        print(pfile, 'is not processed')
    else:
        features = read_features(pairs, defor, aniso, props, momes, lkfs, dates)
        features, dates = get_valid_features_dates(features)
        member_id = pfile.split('/')[-1].split('_')[0].replace('mat','')
        features_n[member_id] = features
        dates_n[member_id] = dates

In [None]:
# GENERATE and SAVE TRAINING DATA FROM NEXTSIM
# Labels come from the experiment config files; features and labels are
# stacked row by row over all members that were read.
param_vals = get_param_vals(exp_name, param_names)
training_features, training_labels = merge_features_labels(
    param_vals,
    features_n,
    dates_n,
)

# The feature table carries the date of every row as an extra last column.
trn_f_df = pd.DataFrame(training_features, columns=feat_col_names + ['date'])
trn_l_df = pd.DataFrame(training_labels, columns=param_names)

trn_f_df.to_pickle(f'{odir}/ftrs.pickle')
trn_l_df.to_pickle(f'{odir}/lbls.pickle')

In [None]:
# READ AND SAVE RGPS DATA
pfile = '../../rgps/csv/w07_may_pairs.npz'
# Nothing has to be skipped for the single RGPS file. An empty list replaces
# the former string 'skip': `in` on a string is a substring test, so a file
# prefix such as 'k' or '' would have been skipped by accident.
# NOTE: this rebinds the notebook-level `skip`; re-run the configuration cell
# before reading the ensemble members again.
skip = []
pairs, defor, aniso, props, momes, lkfs, dates = load_data(pfile, skip)
features = read_features(pairs, defor, aniso, props, momes, lkfs, dates)
features, dates = get_valid_features_dates(features)
# same layout as the training features: feature columns followed by the date
rgps_df = pd.DataFrame(np.hstack([features, np.array(dates)[None].T]), columns=feat_col_names + ['date'])
pd.to_pickle(rgps_df, f'{odir}/rgps.pickle')

In [18]:
# Reload the saved tables exactly as they were written. The feature tables
# (ftrs, rgps) still contain the 'date' column; drop it and cast to float
# before numeric use, e.g. `.drop(columns=['date']).astype(float)`.
inp_ftrs, inp_lbls, inp_rgps = (
    pd.read_pickle(f'{odir}/ftrs.pickle'),
    pd.read_pickle(f'{odir}/lbls.pickle'),
    pd.read_pickle(f'{odir}/rgps.pickle'),
)
