In [None]:
from utils import *
from augmentation import *

In [2]:
def features_viterbi(v):
    """
    Build the feature triple for a single Viterbi transformation.

    Returns a tuple of three items, in this order:
      1. the row sums of `v` (`v.sum(axis=1)`),
      2. the output of `extract_lines(v)` (hough features),
      3. the output of `viterbi2lbp(v)` (lbp features).
    """
    row_sums = v.sum(axis=1)
    hough = extract_lines(v)
    lbp = viterbi2lbp(v)
    return (row_sums, hough, lbp)

def features(spec):
    """
    Extract the features of every transformation produced by `viterbi_2w(spec)`.

    Returns a list with one `features_viterbi` tuple per transformation,
    in the order `viterbi_2w` yields them.
    """
    extracted = []
    for transformation in viterbi_2w(spec):
        extracted.append(features_viterbi(transformation))
    return extracted

def augmented_features(record, df):
    """
    Extract features from a normalized spectrogram and from 9 augmented copies of it.

    `record` is accepted but not used by this function.
    Returns a list of 10 `features(...)` results: the first comes from the
    un-augmented spectrogram, the remaining 9 from `augment(...)` applied to
    fresh copies (10 matches the `range(10)` augmentation index in `raw2df`).
    """
    normalized = spectrogram_normalization(df)
    # Every call works on its own copy of the normalized spectrogram.
    variants = [features(normalized.copy())]
    for _ in range(10 - 1):
        variants.append(features(augment(normalized.copy())))
    return variants

def sums(df):
    """
    Compute the two sum features for every row of `df`.

    Each row of `df.values` is passed to `peaks(row, 5, 1)` and
    `areas(row, 5, 1)`; the results become columns 'p1' (peak ratio) and
    'p2' (area ratio) of the returned dataframe, which keeps `df`'s index.
    """
    feature_rows = []
    for row_sum in df.values:
        p1 = peaks(row_sum, 5, 1)
        p2 = areas(row_sum, 5, 1)
        feature_rows.append([p1, p2])
    return pd.DataFrame(feature_rows, index=df.index, columns=['p1', 'p2'])

def raw2df(data, prefix, train:bool):
    """
    Turn the utility-returned objects into dataframes for ONE way only
    (Viterbi transformation on the forward or on the backward spectrogram).

    data: sequence of items where item[0] is the record id and item[1] is the
          (row sums, hough, lbp) feature triple
    prefix: string prepended (followed by '_') to every column name
    train: when True, rows are additionally indexed by an augmentation
           counter 0..9, giving a ('record', 'augm_idx') MultiIndex

    Returns (df, df_lbp):
      df: dataframe containing sum and hough features
      df_lbp: dataframe containing lbp features
    """
    def with_prefix(label):
        return f'{prefix}_{label}'

    record_ids = []
    row_sums = []
    hough_rows = []
    lbp_rows = []
    for entry in data:
        triple = entry[1]
        record_ids.append(entry[0])
        row_sums.append(triple[0])
        hough_rows.append(triple[1])
        lbp_rows.append(triple[2])

    sum_features = sums(pd.DataFrame(row_sums, index=record_ids)).rename(with_prefix, axis=1)
    hough_features = pd.DataFrame(hough_rows, index=record_ids).rename(with_prefix, axis=1)
    # A tiny constant is added to every lbp value (presumably to avoid exact zeros -- TODO confirm).
    df_lbp = (pd.DataFrame(lbp_rows, index=record_ids) + (1e-9)).rename(with_prefix, axis=1)
    df = pd.concat((sum_features, hough_features), axis=1)

    if not train:
        return df, df_lbp

    for frame in (df, df_lbp):
        # Rows are expected in blocks of 10 per record; the counter 0..9 is repeated per block.
        frame['augm_idx'] = list(range(10)) * (frame.shape[0] // 10)
        frame.set_index([frame.index, 'augm_idx'], inplace=True)
        frame.index.set_names(['record', 'augm_idx'], inplace=True)
    return df, df_lbp

def features_mixed(df_fw, df_bw, data_fw, data_bw):
    """
    Compute mixed features from both ways transformations.

    df_fw, df_bw: per-way feature dataframes holding the 'fw_rho' / 'bw_rho' columns
    data_fw, data_bw: per-way sequences whose items carry the row sums at item[1][0]

    Returns a dataframe indexed like `df_fw` with the columns
      err_rho:   |fw_rho - bw_rho|
      sum_corr:  dot product of the unit-normalized, negative-clipped row sums
      sum_p1:    peaks(...) of the element-wise product of the clipped row sums
      sum_p2:    areas(...) of the same product
      peak_dist: |argmax(forward row sums) - argmax(backward row sums)|

    The row sums stored in `data_fw` / `data_bw` are NOT modified: clipping is
    done on copies. Earlier versions clipped them in place, which silently
    altered the caller's data.
    """
    sum_corr = []
    sum_p1 = []
    sum_p2 = []
    peak_dist = []
    for fw_item, bw_item in zip(data_fw, data_bw):
        fw_raw = fw_item[1][0]
        bw_raw = bw_item[1][0]
        # The peak distance is measured on the raw (unclipped) row sums.
        peak_dist.append(np.abs(np.argmax(fw_raw) - np.argmax(bw_raw)))

        # Clip negatives to zero on copies so the inputs stay untouched.
        fw_sums = fw_raw.copy()
        bw_sums = bw_raw.copy()
        fw_sums[fw_sums < 0] = 0
        bw_sums[bw_sums < 0] = 0

        product = fw_sums * bw_sums
        sum_p1.append(peaks(product))
        sum_p2.append(areas(product))
        # The correlation uses the clipped sums, which keeps the values this
        # function has always produced (it used to read the in-place clipped arrays).
        sum_corr.append(
            np.dot(fw_sums / np.linalg.norm(fw_sums), bw_sums / np.linalg.norm(bw_sums))
        )

    data = [
        np.abs(df_fw['fw_rho'] - df_bw['bw_rho']),
        sum_corr,
        sum_p1,
        sum_p2,
        peak_dist,
    ]
    columns = [
        'err_rho',
        'sum_corr',
        'sum_p1',
        'sum_p2',
        'peak_dist',
    ]
    df = pd.DataFrame(np.array(data).T, index=df_fw.index, columns=columns)
    return df

def read_dataframes(preprocess, train:bool):
    """
    Read the records through `readcgws` and assemble the feature dataframes.

    preprocess: callable forwarded to `readcgws`
    train: forwarded to `readcgws`; when True every record carries a list of
           (forward, backward) feature pairs (one per augmentation), otherwise
           a single (forward, backward) pair

    Side effects: rebinds the module-level names `data`, `data_fw`, `data_bw`
    and reads the module-level `records`.

    Returns (df, df_lbp, targets) when `train` is True, else (df, df_lbp).
    """
    global data, data_fw, data_bw

    forward, backward = 0, 1
    data = readcgws(records=records, preprocess=preprocess, train=train)
    if train:
        # Flatten: one entry per (record, augmentation) pair.
        data_fw = [(rec[0], pair[forward], rec[2]) for rec in data for pair in rec[1]]
        data_bw = [(rec[0], pair[backward], rec[2]) for rec in data for pair in rec[1]]
    else:
        data_fw = [(rec[0], rec[1][forward], rec[2]) for rec in data]
        data_bw = [(rec[0], rec[1][backward], rec[2]) for rec in data]

    df_fw, df_lbp_fw = raw2df(data_fw, 'fw', train)
    df_bw, df_lbp_bw = raw2df(data_bw, 'bw', train)
    df_mix = features_mixed(df_fw, df_bw, data_fw, data_bw)

    df = pd.concat((df_fw, df_bw, df_mix), axis=1)
    df = df.drop(columns=['fw_theta', 'bw_theta'])
    df_lbp = pd.concat((df_lbp_fw, df_lbp_bw), axis=1)

    if not train:
        return df, df_lbp

    # One target per record (the last element of each raw entry), indexed by record id.
    target_values = np.array([rec[-1] for rec in data])
    targets = pd.DataFrame(
        target_values,
        index=df.index.unique(level='record'),
        columns=['target'],
    )
    return df, df_lbp, targets

def get_dfs(train:bool) -> tuple:
    """
    Obtain the feature dataframes through `read_cache_dfs`.

    The same CSV file names are passed as both `infiles` and `outfiles`;
    `read_dataframes` is handed over as the `read_function` callback.

    train=True  -> files df_train.csv, df_lbp_train.csv, targets_train.csv,
                   index_col=[0,1], preprocess=augmented_features
    train=False -> files df_test.csv, df_lbp_test.csv,
                   index_col=0, preprocess=features
    """
    if train:
        stems = ['df', 'df_lbp', 'targets']
        suffix = 'train'
        # NOTE(review): `targets` is built with a single-level 'record' index in
        # read_dataframes but shares index_col=[0,1] here -- confirm read_cache_dfs copes.
        index_col = [0, 1]
        preprocess = augmented_features
    else:
        stems = ['df', 'df_lbp']
        suffix = 'test'
        index_col = 0
        preprocess = features

    def compute():
        return read_dataframes(preprocess=preprocess, train=train)

    return read_cache_dfs(
        infiles=[f'{stem}_{suffix}.csv' for stem in stems],
        read_function=compute,
        index_col=index_col,
        outfiles=[f'{stem}_{suffix}.csv' for stem in stems]
    )

Read the records, augment them, extract their features, and save the preprocessed data

In [None]:
# Load the label table, then keep only the ids of records whose target is not -1.
records = read_labels()
records = records.loc[records['target'] != -1].index.values
# Build the training dataframes via get_dfs (read and save).
df, df_lbp, targets = get_dfs(train=True)