
The goal of this notebook is to:
* generate HTM traces
* load the traces
* sort the sequences by label and compute the pairwise distances between their embeddings
* plot the distance matrices and the 2D t-SNE projections of the embeddings

In [1]:
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import matplotlib.patches as mpatches
import seaborn as sns

%matplotlib inline

In [2]:
# params: widths (number of bits) of the dense SDRs rebuilt from the trace files.
# Width used when decoding the 'spActiveColumns' trace column.
SP_OUTPUT_WIDTH = 2048
# Width used when decoding the 'tmPredictedActiveCells' trace column
# (65536 = 2048 * 32, presumably 32 cells per column -- TODO confirm).
TM_OUTPUT_WIDTH = 65536

In [3]:
# Helper functions
def sdr_converter(sdr_width):
    """Build a converter that expands sparse SDR indices into dense SDRs.

    :param sdr_width: (int) number of bits in each dense SDR.
    :return: (callable) function taking the JSON string stored in a trace
             cell and returning the matching 2D array of dense SDRs.
    """

    def convert_sdr(patternNZ_strings):
        """Decode one trace cell into a (nb_patterns, sdr_width) float array.

        :param patternNZ_strings: (str) JSON-encoded list of lists; each inner
                                  list holds the indices of the bits set to 1.
        :return: (np.array) one row per inner list, with 1.0 at the listed
                 indices and 0.0 everywhere else.
        """
        patternNZs = json.loads(patternNZ_strings)
        # Allocate the dense matrix once instead of turning every SDR into a
        # Python list and converting the list of lists back into an array.
        # This also gives an empty trace the consistent shape (0, sdr_width).
        sdrs = np.zeros((len(patternNZs), sdr_width))
        for sdr, patternNZ in zip(sdrs, patternNZs):
            # `sdr` is a view on one row of `sdrs`, so this writes in place.
            sdr[patternNZ] = 1
        return sdrs
    return convert_sdr


def load_df(file_path):
    """Read a trace CSV file, decoding its SDR columns into dense arrays.

    :param file_path: (str) path of the trace file to read.
    :return: (pd.DataFrame) the traces; the cells of the 'spActiveColumns'
             and 'tmPredictedActiveCells' columns are decoded with
             ``sdr_converter`` using SP_OUTPUT_WIDTH and TM_OUTPUT_WIDTH.
    """
    column_converters = {
        'spActiveColumns': sdr_converter(SP_OUTPUT_WIDTH),
        'tmPredictedActiveCells': sdr_converter(TM_OUTPUT_WIDTH),
    }
    return pd.read_csv(file_path, converters=column_converters)


def sequence_embedding(sdrs_sequence, aggregation):
    """Collapse a sequence of SDRs into a single embedding vector.

    :param sdrs_sequence: sequence of SDRs (one SDR per step).
    :param aggregation: (str) 'or' and 'and' combine the SDRs element-wise and
                        return an integer array; 'mean' averages them over the
                        first axis.
    :return: (np.array) the aggregated embedding.
    :raises ValueError: if ``aggregation`` is not one of the names above.
    """
    if aggregation == 'mean':
        return np.mean(sdrs_sequence, axis=0)

    if aggregation == 'or':
        combine = np.logical_or
    elif aggregation == 'and':
        combine = np.logical_and
    else:
        raise ValueError('Wrong SDR aggregation name.')

    # Seed the accumulator with the first SDR, then fold in every SDR
    # (the first one included, which leaves the result unchanged).
    accumulated = sdrs_sequence[0]
    for current in sdrs_sequence:
        accumulated = combine(accumulated, current)
    return accumulated.astype(int)

        
def chunked_sequence_embedding(sdrs_sequence, aggregation, nb_chunks=1):
    """Embed a sequence of SDRs as the concatenation of per-chunk embeddings.

    The sequence is cut into ``nb_chunks`` consecutive chunks of equal size,
    each chunk is aggregated with ``sequence_embedding`` and the results are
    flattened into a single 1D vector. When the sequence length is not a
    multiple of ``nb_chunks``, the trailing remainder is ignored.

    :param sdrs_sequence: sequence of SDRs (must support ``len`` and slicing).
    :param aggregation: (str) aggregation name, passed on to
                        ``sequence_embedding``.
    :param nb_chunks: (int) number of chunks to cut the sequence into.
    :return: (np.array) flattened chunk embeddings.
    :raises ValueError: if ``nb_chunks`` is smaller than 1, or if the sequence
                        holds fewer than ``nb_chunks`` SDRs.
    """
    if nb_chunks < 1:
        raise ValueError('nb_chunks must be >= 1 (got %s).' % nb_chunks)

    # Floor division keeps the chunk size an integer on Python 2 and 3 alike;
    # a float here would make the slices below fail.
    chunk_size = len(sdrs_sequence) // nb_chunks
    if chunk_size == 0:
        # Empty chunks would otherwise turn into NaN embeddings ('mean') or an
        # unrelated IndexError ('or' / 'and').
        raise ValueError('Cannot split a sequence of %s SDRs into %s chunks.'
                         % (len(sdrs_sequence), nb_chunks))

    chunk_embeddings = [
        sequence_embedding(sdrs_sequence[i * chunk_size:(i + 1) * chunk_size],
                           aggregation)
        for i in range(nb_chunks)]
    return np.array(chunk_embeddings).flatten()


def sequence_embeddings(sdrs_sequences, aggregation, nb_chunks):
    """Compute the chunked embedding of every sequence.

    :param sdrs_sequences: iterable of SDR sequences.
    :param aggregation: (str) aggregation name, forwarded to
                        ``chunked_sequence_embedding``.
    :param nb_chunks: (int) number of chunks per sequence, forwarded as well.
    :return: (list) one embedding per input sequence, in input order.
    """
    return [chunked_sequence_embedding(one_sequence, aggregation, nb_chunks)
            for one_sequence in sdrs_sequences]


def euclidian_distance(x1, x2):
    """Return the L2 norm of the difference between ``x2`` and ``x1``.

    Both arguments must support subtraction with each other; the norm is
    computed by ``np.linalg.norm`` with its default settings.
    """
    difference = x2 - x1
    return np.linalg.norm(difference)


def distance_mats(col_embeddings, cell_embeddings, cell_w=1.0, col_w=1.0, distance=euclidian_distance):
    """Compute the pairwise distance matrices between sequence embeddings.

    :param col_embeddings: sequence of column embeddings, one per sequence.
    :param cell_embeddings: sequence of cell embeddings, one per sequence
                            (same length as ``col_embeddings``).
    :param cell_w: (float) weight of the cell distance in the combined matrix.
    :param col_w: (float) weight of the column distance in the combined matrix.
    :param distance: (callable) distance between two embeddings.
    :return: (tuple) ``(col_mat, cell_mat, combined_mat)``: three symmetric
             square float64 matrices; the combined one is the weighted average
             of the other two.
    """
    assert len(col_embeddings) == len(cell_embeddings)
    size = len(col_embeddings)
    shape = (size, size)
    col_mat = np.zeros(shape, dtype=np.float64)
    cell_mat = np.zeros(shape, dtype=np.float64)
    combined_mat = np.zeros(shape, dtype=np.float64)

    total_w = col_w + cell_w
    for row in range(size):
        # Only the upper triangle (diagonal included) is computed; every value
        # is mirrored into the lower triangle.
        for col in range(row, size):
            d_col = distance(col_embeddings[row], col_embeddings[col])
            d_cell = distance(cell_embeddings[row], cell_embeddings[col])
            d_combined = (col_w * d_col + cell_w * d_cell) / total_w

            col_mat[row, col] = col_mat[col, row] = d_col
            cell_mat[row, col] = cell_mat[col, row] = d_cell
            combined_mat[row, col] = combined_mat[col, row] = d_combined
    return col_mat, cell_mat, combined_mat


def project_embeddings(embeddings, embedding_name, random_state=None):
    """Project sequence embeddings with t-SNE.

    :param embeddings: array-like holding one embedding per row.
    :param embedding_name: (str) not used by this function; kept so existing
                           callers keep working.
    :param random_state: seed forwarded to ``TSNE``. Pass an integer to get
                         the same projection on every run; the default
                         (``None``) keeps the previous, unseeded behaviour.
    :return: the projected embeddings returned by ``TSNE.fit_transform``.
    """
    # NOTE(review): recent scikit-learn releases rename `n_iter` to
    # `max_iter` -- confirm against the installed version before upgrading.
    tsne = TSNE(metric='euclidean', n_iter=1000, init='pca',
                random_state=random_state)
    return tsne.fit_transform(embeddings)

In [None]:
import os; print 'Current working directory:', os.getcwd()
%run network_runner_sequences.py --config configs/debug.yml
%run network_runner_sequences.py --config configs/body_acc_x.yml
%run network_runner_sequences.py --config configs/synthetic_control.yml
%run network_runner_sequences.py --config configs/test1.yml

INFO:NetworkRunner:Input file: body_acc_x_TRAIN
INFO:NetworkRunner:Traces saved in: traces_debug/
INFO:NetworkRunner:Input file: body_acc_x_TEST
INFO:NetworkRunner:Traces saved in: traces_debug/


Current working directory: /Users/mleborgne/_git/nupic.research/projects/capybara/htm


INFO:NetworkRunner:Input file: body_acc_x_TRAIN
INFO:NetworkRunner:Wrote to file (label=4, sequenceNumber=10)
INFO:NetworkRunner:Elapsed time: 0.95s
INFO:NetworkRunner:Wrote to file (label=4, sequenceNumber=20)
INFO:NetworkRunner:Elapsed time: 1.99s
INFO:NetworkRunner:Wrote to file (label=4, sequenceNumber=30)
INFO:NetworkRunner:Elapsed time: 3.41s
INFO:NetworkRunner:Wrote to file (label=3, sequenceNumber=40)
INFO:NetworkRunner:Elapsed time: 4.86s
INFO:NetworkRunner:Wrote to file (label=3, sequenceNumber=50)
INFO:NetworkRunner:Elapsed time: 6.62s
INFO:NetworkRunner:Wrote to file (label=3, sequenceNumber=60)
INFO:NetworkRunner:Elapsed time: 8.34s
INFO:NetworkRunner:Wrote to file (label=5, sequenceNumber=70)
INFO:NetworkRunner:Elapsed time: 9.99s
INFO:NetworkRunner:Wrote to file (label=5, sequenceNumber=80)
INFO:NetworkRunner:Elapsed time: 11.68s
INFO:NetworkRunner:Wrote to file (label=5, sequenceNumber=90)
INFO:NetworkRunner:Elapsed time: 13.20s
INFO:NetworkRunner:Wrote to file (label=0

In [None]:
# Loaded trace DataFrames, keyed by '<dataset>_<phase>' (filled in by the cells below).
dfs = {}

In [None]:
# Load the Test1 traces for the selected phase.
# `phase` is a notebook global that the analysis cells further down read as well.
phase = 'TRAIN'
dfs['Test1_%s' %phase] = load_df('traces/trace_Test1_%s' %phase)

In [None]:
# Load the synthetic_control traces for the selected phase.
# `phase` is a notebook global that the analysis cells further down read as well.
phase = 'TRAIN'
dfs['synthetic_control_%s' %phase] = load_df('traces/trace_synthetic_control_%s' %phase)

In [None]:
# phase = 'TRAIN'
# dfs['body_acc_x_%s' %phase] = load_df('traces/trace_body_acc_x_%s' %phase)

In [48]:
# phase = 'TEST'
# dfs['Test1_%s' %phase] = load_df('traces/trace_Test1_%s' %phase)

In [None]:
# phase = 'TEST'
# dfs['synthetic_control_%s' %phase] = load_df('traces/trace_synthetic_control_%s' %phase)

In [47]:
# phase = 'TEST'
# dfs['body_acc_x_%s' %phase] = load_df('traces/trace_body_acc_x_%s' %phase)

In [None]:
def plot_mat(embeddings_mat, embedding_name, dataset_name, aggregation):
    """Draw a distance matrix as a heatmap, with a colorbar, on a new figure.

    :param embeddings_mat: 2D matrix to display.
    :param embedding_name: (str) embedding name shown on the first title line.
    :param dataset_name: (str) dataset name shown on the 'Data' title line.
    :param aggregation: (str) aggregation name shown on the last title line.
    """
    plt.figure(figsize=(5,4))
    mesh = plt.pcolor(embeddings_mat, cmap=plt.cm.Blues)
    plt.colorbar(mesh)

    # Both axes index the sequences.
    axis_label = 'Sequence #'
    plt.xlabel(axis_label)
    plt.ylabel(axis_label)

    title_template = ('%s sequence embeddings\n'
                      'Data: %s\n'
                      'Aggregation: %s')
    plt.title(title_template % (embedding_name, dataset_name, aggregation))


def plot_projections(embeddings_proj, labels, embedding_name):
    """Scatter-plot 2D projections, one color per class label, with a legend.

    :param embeddings_proj: 2D array; columns 0 and 1 are used as the x and y
                            coordinates of the points.
    :param labels: sequence of class labels, expected to hold one label per
                   row of ``embeddings_proj``, in the same order.
    :param embedding_name: (str) embedding name shown in the figure title.
    """
    # Colors
    # Sort the classes: iterating over a bare set gives an arbitrary order, so
    # the class -> color assignment and the legend order could change between
    # runs.
    unique_labels = sorted(set(labels))
    colors = sns.color_palette('colorblind', len(unique_labels))
    # Direct lookup instead of a list.index() scan for every single point.
    color_by_label = dict(zip(unique_labels, colors))

    # Plot projections
    plt.figure(figsize=(5,5))
    plt.title('%s sequence embeddings 2D projections' %embedding_name)
    plt.scatter(embeddings_proj[:,0], embeddings_proj[:,1],
                c=[color_by_label[label] for label in labels])

    # Add legend
    patches = [mpatches.Patch(color=color_by_label[label], label='Class %s' % label)
               for label in unique_labels]
    plt.legend(handles=patches, loc=2, bbox_to_anchor=(1.05, 1)) # plot the legend to the right


def analyze_traces(df, name, aggregations=['mean'], nb_chunks=10):
    """Embed, compare and plot the SP and TM traces of a dataset.

    For every aggregation, the sequences are embedded, their pairwise distance
    matrices are computed, and the embeddings are projected with t-SNE. Two
    projection plots (SP, TM) and three distance-matrix plots (SP, TM,
    Combined) are drawn per aggregation.

    :param df: (pd.DataFrame) traces with 'label', 'spActiveColumns' and
               'tmPredictedActiveCells' columns.
    :param name: (str) dataset name shown in the distance-matrix titles.
    :param aggregations: (list of str) aggregation names to analyze; the
                         default list is only read, never modified.
    :param nb_chunks: (int) number of chunks used to embed each sequence.
    """
    # Sort by label, it's easier to read this way. The sort does not depend on
    # the aggregation, so it is done once.
    sorted_df = df.sort_values('label')
    # The labels must come from the SAME sorted frame as the sequences:
    # reading them from the unsorted `df` pairs each projected point with
    # another sequence's label and colors the t-SNE plots incorrectly.
    labels = sorted_df.label.values
    sp_sdrs_sequences = sorted_df.spActiveColumns.values
    tm_sdrs_sequences = sorted_df.tmPredictedActiveCells.values

    for aggregation in aggregations:

        # Embeddings
        sp_embeddings = sequence_embeddings(sp_sdrs_sequences, aggregation, nb_chunks)
        tm_embeddings = sequence_embeddings(tm_sdrs_sequences, aggregation, nb_chunks)
        assert len(sp_embeddings) == len(tm_embeddings)

        # Distance matrices
        sp_mat, tm_mat, combined_mat = distance_mats(sp_embeddings, tm_embeddings)

        # tSNE projections
        sp_embeddings_proj = project_embeddings(sp_embeddings, 'SP')
        tm_embeddings_proj = project_embeddings(tm_embeddings, 'TM')

        # Plot tSNE projections
        plot_projections(sp_embeddings_proj, labels, 'SP')
        plot_projections(tm_embeddings_proj, labels, 'TM')

        # Plot distance matrices
        plot_mat(sp_mat, 'SP', name, aggregation)
        plot_mat(tm_mat, 'TM', name, aggregation)
        plot_mat(combined_mat, 'Combined', name, aggregation)


In [None]:
# Analyze the Test1 traces loaded for the current `phase`, using 10 chunks per sequence.
name = 'Test1_%s' % phase
df = dfs[name]
analyze_traces(df, name, nb_chunks=10)

In [None]:
# Analyze the synthetic_control traces loaded for the current `phase`, using 20 chunks per sequence.
name = 'synthetic_control_%s' % phase
df = dfs[name]
analyze_traces(df, name, nb_chunks=20)

In [None]:
# Analyze the body_acc_x traces for the current `phase`, using 10 chunks per sequence.
name = 'body_acc_x_%s' % phase
# The cell loading the body_acc_x traces is commented out earlier in this
# notebook, so the key may be missing: skip with a message instead of failing
# a full "Run All" with a KeyError.
if name in dfs:
    df = dfs[name]
    analyze_traces(df, name, nb_chunks=10)
else:
    print('Skipping %s: traces not loaded (uncomment the matching load_df cell).' % name)