## Population decoder analysis


In [None]:
# Put ../scripts at the front of the module search path.
# NOTE(review): presumably where the local `graphics_defaults` and `core` modules imported below live - confirm.
import sys
sys.path.insert(0, "../scripts")

In [None]:
# Set the registry URL in the environment before `nbank` is imported.
# NOTE(review): presumably this is what `nbank.default_registry` is derived from - confirm.
%env NBANK_REGISTRY https://gracula.psyc.virginia.edu/neurobank

In [None]:
import json
import logging
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dlab import pprox, nbank, spikes, plotting
import ewave
from scipy.linalg import hankel, toeplitz
import samplerate
from joblib import Memory
from appdirs import user_cache_dir
from tqdm import tqdm

import graphics_defaults
from core import df_extent

In [None]:
# On-disk joblib memoization, used below to cache the gammatone spectrogram calculations.
_cache_dir = user_cache_dir(appname="preconstruct", appauthor="melizalab")
_mem = Memory(location=_cache_dir, verbose=0)
# show where the cache lives
_cache_dir

In [None]:
# --- stimulus preprocessing ---
desired_sampling_rate = 20000  # Hz; stimuli are resampled to this rate
desired_time_step = 0.0025  # s; requested hop between spectrogram frames
# keyword arguments passed straight through to gtgram()
spectrogram_params = dict(
    window_time=0.005,
    channels=40,
    f_min=1000,
    f_max=8500,
)
# offset added to the spectrogram before taking log10
spectrogram_compression = 0.01

# --- decoder ---
decoder_window = (0.0, 0.2)  # s; range of lags used for the delay embedding
n_basis = 20  # number of raised-cosine basis vectors
linearity_factor = 20  # passed to make_cosine_basis (larger -> more linear spacing)

In [None]:
def compare_spectrograms_cor(actual: np.ndarray, predicted: np.ndarray) -> float:
    """Pearson correlation between two spectrograms, pooled over all time/frequency bins.

    Both arrays are flattened in row-major order and correlated as two long
    vectors, so they must contain the same number of elements.
    """
    actual_vec = actual.ravel()
    predicted_vec = predicted.ravel()
    # corrcoef returns the 2x2 correlation matrix; the off-diagonal is the value we want
    return np.corrcoef(actual_vec, predicted_vec)[0, 1]

## Load neural responses

Choose your own adventure here - run either the first cell to analyze all the CR or PR units, or the second cell to analyze all the units from a single site. For testing, recommend using the second cell and picking one of the sites. For Figure 6, we generate the population raster using the first cell with `site_name` set to `cr_units`, and the decoded stimuli are generated with `site_name` as `cr_units` or `pr_units`.

In [None]:
# units from a file: one unit name per line in ../build/<site_name>.txt
pprox_dir = None
site_name = "cr_units"
unit_file = Path(f"../build/{site_name}.txt")
# Read inside a context manager so the file handle is closed, and skip blank
# lines so a trailing newline does not produce an empty unit name.
with open(unit_file, "rt") as fp:
    unit_names = [name for name in (line.strip() for line in fp) if name]

In [None]:
# load all the units for a site
# Other sites that have been tried (swap the assignment below to use one):
#   "C42_4_1"
#   "R227_3_1"  - not very invariant
#   "C41_5_1"   - okay performance with few units
#   "C45_4_1"   - poor performance with a lot of units
#   "C104_4_1"  - good performance with few units
site_name = "C29_1_1"   # good performance with a lot of units
pprox_dir = None
site_records = nbank.search(nbank.default_registry, name=site_name, dtype="spikes-pprox")
unit_names = [record["name"] for record in site_records]

In [None]:
# Collect the trials from every unit's pprox file, tagging each trial with the unit it came from.
all_trials = []
unit_resources = nbank.find_resources(*unit_names, alt_base=pprox_dir)
for unit_name, pprox_file in tqdm(unit_resources, total=len(unit_names)):
    # reading the file raises here if the resource was not found
    pprox_data = json.loads(pprox_file.read_text())
    # NOTE(review): no filtering by stimulus condition happens at this point; every trial is kept
    for trial in pprox_data["pprox"]:
        all_trials.append(trial | {"unit": unit_name})

### Split up the trials by motif

Stimuli comprise sequences of 10 motifs with order counterbalanced using a latin square to average out order effects and ensure each motif is presented against a different segment of the background. These cells split the long responses into their component motifs to generate a big pandas dataframe with one row per unit/motif/trial.

In [None]:
class MotifSplitter:
    """Callable lookup from a stimulus name to a table describing its motifs.

    On construction, the metadata for each requested stimulus is fetched from
    the registry and stored as a DataFrame, with the "foreground" field split
    on "-" into a list. Instances are passed to `pprox.split_trial`, which
    calls them with a stimulus name.
    """

    def __init__(self, resource_ids):
        """Fetch and store metadata for each stimulus name in `resource_ids`."""
        self.stim_info = {}
        # Use the constructor argument rather than the notebook-level `stim_names`,
        # so the splitter does not depend on hidden global state.
        for result in nbank.describe_many(nbank.default_registry, *resource_ids):
            metadata = result["metadata"]
            metadata["foreground"] = metadata["foreground"].split("-")
            self.stim_info[result["name"]] = pd.DataFrame(metadata)

    def __call__(self, resource_id: str) -> pd.DataFrame:
        """Return the stored metadata table for `resource_id` (KeyError if unknown)."""
        return self.stim_info[resource_id]


stim_names = {trial["stimulus"]["name"] for trial in all_trials}
splitter = MotifSplitter(stim_names)

In [None]:
# Split every long trial into its component motifs, then stack the pieces into
# one frame indexed by (background level, unit, motif).
motif_rows = []
for trial in tqdm(all_trials):
    per_motif = pprox.split_trial(trial, splitter)
    per_motif["unit"] = trial["unit"]
    motif_rows.append(per_motif)
recording = pd.concat(motif_rows)
recording = recording.drop(columns=["foreground-dBFS", "background"])
# each foreground motif is treated as the "stimulus" from here on
recording = recording.rename(columns={"foreground": "stimulus"})
recording = recording.set_index(["background-dBFS", "unit", "stimulus"]).sort_index()

## Load stimuli

Load the waveforms and compute spectrograms for the stimuli.

In [None]:
# Read each motif's waveform and resample it to the common sampling rate.
stim_names = recording.index.get_level_values("stimulus").unique()
example_stim = stim_names[0]
stimuli = []
for stim_name, stim_path in nbank.find_resources(*stim_names):
    with ewave.open(stim_path, "r") as fp:
        samples = ewave.rescale(fp.read(), "f")
        resample_ratio = 1.0 * desired_sampling_rate / fp.sampling_rate
        resampled = samplerate.resample(samples, resample_ratio, "sinc_best")
        stimuli.append(
            {
                "stimulus": stim_name,
                "samples": resampled,
                "sample_rate": desired_sampling_rate,
            }
        )

stim_data = pd.DataFrame.from_records(stimuli).set_index("stimulus")

In [None]:
# compute gammatone spectrograms
from gammatone.gtgram import gtgram, gtgram_strides
from gammatone.filters import erb_space

def compute_spectrogram(row):
    """Compute a log-compressed gammatone spectrogram for one stimulus.

    row: a record with `samples` (the waveform array) and `sample_rate`.

    Returns a DataFrame with one row per spectrogram frame (index named
    "time") and one column per filterbank channel (columns named "frequency").
    """
    n_samples = row.samples.size
    duration = n_samples / row.sample_rate
    # ask gtgram_strides for the hop in whole samples so the time axis can use the effective step
    _, hop_samples, _ = gtgram_strides(
        row.sample_rate, spectrogram_params["window_time"], desired_time_step, n_samples
    )
    hop_time = hop_samples / row.sample_rate
    # this calculation is cached on disk by joblib
    cached_gtgram = _mem.cache(gtgram)
    spectrogram = cached_gtgram(row.samples, row.sample_rate, hop_time=desired_time_step, **spectrogram_params)
    _, nframes = spectrogram.shape
    # log compression, shifted so that zero power maps to zero
    compression_floor = np.log10(spectrogram_compression)
    spectrogram = np.log10(spectrogram + spectrogram_compression) - compression_floor
    frame_times = np.arange(0.0, duration, hop_time)[:nframes]
    # erb_space output is reversed here (NOTE(review): presumably to get ascending frequencies - confirm)
    center_freqs = erb_space(
        spectrogram_params["f_min"], spectrogram_params["f_max"], spectrogram_params["channels"]
    )[::-1]
    frame = pd.DataFrame(spectrogram.T, columns=center_freqs, index=frame_times)
    return frame.rename_axis(index="time", columns="frequency")

In [None]:
# One spectrogram per stimulus, stacked into a single frame indexed by (stimulus, time).
stims_processed = pd.concat(
    {name: compute_spectrogram(stim) for name, stim in stim_data.iterrows()},
    names=("stimulus", "time"),
)

#### Plot population responses

This is used to generate Figure 6A

In [None]:
# can color-code by cell type, not used
spike_type_colors = {"wide": "#70549B", "narrow": "#FF7F0E"}
feature_file = Path("..") / "build" / "mean_spike_features.csv"
features = pd.read_csv(feature_file)[["unit", "spike"]]
features["site"] = features.unit.apply(lambda s: "_".join(s.split("_")[:-1]))
features.set_index("unit", inplace=True)

In [None]:
# Peek at one processed spectrogram. The original cell displayed `spectrogram`,
# which is not defined until a later plotting cell and so raised NameError on a
# fresh kernel.
stims_processed.loc[example_stim].head()

In [None]:
n_motifs = 2
# -100 dBFS background is the clean (no added noise) condition
clean_recording = recording.loc[-100]
selected_motifs = stim_names[:n_motifs]
unit_names = recording.index.get_level_values("unit").unique()
n_units = len(unit_names)
fig, axes = plt.subplots(nrows=2, ncols=n_motifs, sharex=True, sharey="row", height_ratios=(400, n_units), figsize=(1.75 * n_motifs, 1.8), dpi=450)
# one column of panels per motif: spectrogram on top, raster underneath
for col, motif in zip(axes.T, selected_motifs):
    # transpose so rows are frequency channels and columns are time bins
    spectrogram = stims_processed.loc[motif].T
    time_extent = df_extent(spectrogram)
    # plot against channel number on the y axis rather than frequency
    extent = (time_extent[0], time_extent[1], 0, spectrogram.index.size)
    col[0].imshow(spectrogram, extent=extent, aspect="auto", origin="lower")
    tick_idx = [0, 19, 39]
    # Label ticks from this spectrogram's own frequency axis. The original read
    # `Y_test.columns`, which is only defined much later in the notebook.
    col[0].set_yticks(tick_idx, [f"{spectrogram.index[i] / 1000:.0f}" for i in tick_idx], size=4)
    col[0].get_xaxis().set_visible(False)
    col[0].set_frame_on(False)
    # rows could be joined with `features` on "unit" to sort or color by site / spike type (not used)
    motif_trials = clean_recording.xs(motif, level="stimulus")
    for i, trial in enumerate(motif_trials.itertuples()):
        # a float here is NaN, i.e. there are no events for this row
        if isinstance(trial.events, float):
            continue
        col[1].plot(
            trial.events,
            [i] * trial.events.size,
            color="k",
            marker="|",
            markeredgewidth=0.1,
            markersize=0.15,
            linestyle="",
        )
    # leave room after stimulus offset for the decoder window
    col[1].set_xlim(0, time_extent[1] + decoder_window[1])
    col[1].tick_params(axis='both', which='major', labelsize=4)
    col[1].tick_params(axis='y', which='both', left=False, right=False, labelleft=False)
    for boundary in ("left", "right", "top"):
        col[1].spines[boundary].set_visible(False)

fig.subplots_adjust(hspace=0.1, wspace=0.1)

In [None]:
# write the population raster figure to ../figures, named after the current site_name
fig.savefig(f"../figures/{site_name}_population_responses.png")

## Pool and bin responses

We use trial-averaged firing rates - because we are pooling non-simultaneous recordings, there's not really anything we can learn from single-trial data, and it massively speeds things up to reduce the number of rows by a factor of 10.

In [None]:
def pool_spikes(x):
    """Concatenate the spike-time arrays in a Series into one array.

    Missing entries are dropped first. If the remaining values cannot be
    concatenated (for example, none are left), NaN is returned instead.
    """
    try:
        pooled = np.concatenate(x.dropna().values)
    except ValueError:
        pooled = np.nan
    return pooled

def bin_responses(trials):
    """Bin the pooled spikes of every unit for one stimulus on the spectrogram time grid.

    trials: one group from `groupby("stimulus")`, one row per unit, with columns
        `events` (pooled spike times), `trials` (count used as the divisor) and
        `interval_end`. The group's `name` is the stimulus.

    Returns a DataFrame indexed by bin start time ("time"), one column per unit,
    holding the spike count in each bin divided by the number of trials.
    """
    stim_bins = stims_processed.loc[trials.name].index.to_numpy()
    time_step = stim_bins[1] - stim_bins[0]
    # interval end is taken from the first row of the group
    interval_end = trials.interval_end.iloc[0]
    # continue the stimulus time grid past its last bin up to the end of the response interval
    extra_edges = np.arange(stim_bins[-1], interval_end + time_step, time_step)[1:]
    edges = np.concatenate([stim_bins, extra_edges])

    def mean_count(unit_row):
        counts, _ = np.histogram(unit_row.events, bins=edges)
        return counts / unit_row.trials

    rates = np.column_stack(trials.apply(mean_count, axis=1))
    time_index = pd.Index(edges[:-1], name="time")
    return pd.DataFrame(rates, index=time_index, columns=trials.index.get_level_values(0))
    

In [None]:
# Pool spikes across trials for each unit/stimulus, then bin them per stimulus.
pooled_clean = clean_recording.groupby(["unit", "stimulus"]).agg(
    events=("events", pool_spikes),
    trials=("events", len),
    interval_end=("interval_end", "max"),
)
clean_rate_data = pooled_clean.groupby("stimulus").apply(bin_responses)

In [None]:
# Spectrogram (top) and trial-averaged population response (bottom) for each selected motif.
fig, axes = plt.subplots(nrows=2, ncols=n_motifs, sharex=True, sharey="row", height_ratios=(0.2, 0.4), figsize=(12, 4))
for (stim_ax, resp_ax), motif in zip(axes.T, selected_motifs):
    spectrogram = stims_processed.loc[motif].T
    stim_ax.imshow(spectrogram, extent=df_extent(spectrogram), aspect="auto", origin="lower")
    stim_ax.set_yticks([1000, 4000, 8000], ["1", "4", "8"])
    stim_ax.get_xaxis().set_visible(False)
    # rows are units, columns are time bins
    neurogram = clean_rate_data.loc[motif].T
    resp_ax.imshow(
        neurogram,
        extent=df_extent(neurogram),
        origin="lower",
        aspect="auto",
        interpolation="nearest",
        cmap="Grays",
        vmin=0,
        vmax=1,
    )
fig.subplots_adjust(hspace=0.1, wspace=0.1)

## Delay embedding

For a decoding model, the responses need to be delay-embedded so that the model can represent the relationship between the stimulus and the neural activity over a window of time around each point in the stimulus. A purely causal model only has positive lags. We use a raised-cosine basis set with bandwidths that increase with temporal distance, so that we have more precision for spikes that come immediately after the stimulus and less for spikes further out in the future.

In [None]:
def make_cosine_basis(n_tau: int, n_basis: int, linearity_factor: float=10) -> np.ndarray:
    """Make a nonlinearly stretched basis consisting of raised cosines

    n_tau:  number of time points
    n_basis:     number of basis vectors (must be at least 2)
    linearity_factor:   offset for nonlinear stretching of x axis (larger values -> more linear spacing)

    Returns an array of shape (n_tau, n_basis); each column is one basis
    vector, scaled to unit Euclidean norm.

    Raises ValueError if n_basis < 2: the peak spacing is computed by dividing
    by (n_basis - 1), and a single basis vector would otherwise silently
    produce a basis full of NaN.
    """
    if n_basis < 2:
        raise ValueError(f"n_basis must be at least 2 (got {n_basis})")
    _min_offset = 1e-20
    # peaks are evenly spaced on a log-transformed time axis
    first_peak = np.log(linearity_factor + _min_offset)
    last_peak = np.log(n_tau * (1 - 1.5 / n_basis) + linearity_factor + _min_offset)
    peak_centers = np.linspace(first_peak, last_peak, n_basis)
    peak_spacing = (last_peak - first_peak) / (n_basis - 1)
    log_domain = np.log(np.arange(n_tau) + linearity_factor + _min_offset)
    basis = []
    for center in peak_centers:
        # clipping to [-pi, pi] makes each bump exactly zero outside its support
        cos_input = np.clip((log_domain - center) * np.pi / peak_spacing / 2, -np.pi, np.pi)
        cos_basis = (np.cos(cos_input) + 1) / 2
        basis.append(cos_basis / np.linalg.norm(cos_basis))
    # TODO: return dataframe with labeled axes
    return np.column_stack(basis)

#plt.imshow(make_cosine_basis(60, 20, linearity_factor));

In [None]:
def delay_embed_trial(resp):
    """Delay-embed the binned responses to one stimulus and project them onto the cosine basis.

    resp: one group from `groupby("stimulus")` on the binned rate data; its
        `name` is the stimulus and its outer index level is dropped before use.

    Returns a DataFrame with one row per stimulus time bin and one block of
    basis-projected columns per unit in `unit_names` (column levels are named
    "unit" and "lag").
    """
    stim_bins = stims_processed.loc[resp.name].index
    rates = resp.droplevel(0)
    time_step = stim_bins[1] - stim_bins[0]
    lag_range = pd.Index(np.arange(decoder_window[0], decoder_window[1], time_step), name="lag")
    # this should be the same for all stims but it's easier to calculate here
    basis_matrix = make_cosine_basis(lag_range.size, n_basis, linearity_factor)
    first_bin, last_bin = stim_bins[0], stim_bins[-1]

    def delay_embed_unit(unit):
        # first column of the Hankel matrix: the response during the stimulus
        during_stim = unit.loc[first_bin - decoder_window[0]:last_bin]
        # last row: the response from the final stimulus bin out to the end of the decoder window
        after_stim = unit.loc[last_bin:last_bin + decoder_window[1]].iloc[:lag_range.size]
        lagged = hankel(during_stim, after_stim)
        # project the raw lags onto the basis (the unprojected form would use lag_range as columns)
        return pd.DataFrame(lagged.dot(basis_matrix), index=during_stim.index)

    embedded = {}
    for unit_name in unit_names:
        embedded[unit_name] = delay_embed_unit(rates[unit_name])
    return pd.concat(embedded, axis=1, names=("unit", "lag"))

In [None]:
embedded_by_stim = clean_rate_data.groupby("stimulus").apply(delay_embed_trial)
# this is really important to ensure that all rows match in the two dataframes:
# the spectrograms are reindexed onto exactly the rows of the embedded responses
clean_rates_embedded, clean_stims_processed = embedded_by_stim.align(stims_processed, join="left", axis=0)

In [None]:
# this assertion should be true if the stimuli were not repeated
assert clean_rates_embedded.shape[0] == clean_stims_processed.shape[0], "dimensions of data don't match"
assert all(clean_rates_embedded.index == clean_stims_processed.index), "indices of data don't match"

#### Sanity check

In [None]:
# Same layout as the neurogram figure, but showing the delay-embedded responses.
fig, axes = plt.subplots(nrows=2, ncols=n_motifs, sharex=True, sharey="row", height_ratios=(0.2, 0.4), figsize=(12, 4))
for (stim_ax, resp_ax), motif in zip(axes.T, selected_motifs):
    spectrogram = stims_processed.loc[motif].T
    stim_ax.imshow(spectrogram, extent=df_extent(spectrogram), aspect="auto", origin="lower")
    stim_ax.set_yticks([1000, 4000, 8000], ["1", "4", "8"])
    stim_ax.get_xaxis().set_visible(False)
    neurogram = clean_rates_embedded.loc[motif].T
    # color limits are left to autoscale here
    resp_ax.imshow(neurogram, extent=df_extent(neurogram), origin="lower", aspect="auto")
fig.subplots_adjust(hspace=0.1, wspace=0.1)

## Fit decoder model

There are a variety of models that can be used for decoding; linear regression is by far the simplest. We need some regularization because k > n, so we use ridge regression.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import ShuffleSplit, KFold
from sklearn.model_selection import GridSearchCV

def compare_spectrograms_rmse(actual: np.ndarray, predicted: np.ndarray) -> float:
    """Root-mean-square error between two spectrograms of the same shape.

    The squared differences are averaged over every time/frequency bin before
    taking the square root. The previous version summed instead of averaging,
    so the result grew with the size of the spectrogram.
    """
    squared_error = (actual - predicted) ** 2
    return np.sqrt(squared_error.mean())

def split_by_stimulus(X, Y):
    """Generate leave-one-stimulus-out splits of two row-aligned frames.

    X, Y: frames whose outermost index level identifies the stimulus.

    Yields one (X_train, Y_train, X_test, Y_test) tuple per distinct stimulus,
    where the test frames hold only that stimulus and the train frames hold
    everything else.
    """
    held_out_names = X.index.get_level_values(0).unique()
    yield from (
        (X.drop(name), Y.drop(name), X.loc[name], Y.loc[name])
        for name in held_out_names
    )

In [None]:
# Cross-validated search over the ridge penalty on the clean-stimulus data.
alpha_candidates = np.logspace(-1, 7, 30)

# standardize the predictors, then fit a ridge regression with an intercept
ridge = Pipeline(steps=[("scaler", StandardScaler()), ("ridge", Ridge(fit_intercept=True))])
xval = GridSearchCV(
    estimator=ridge,
    param_grid={"ridge__alpha": alpha_candidates},
    cv=10,
    n_jobs=2,
)
print(f"  -  Fitting Model: X shape is {clean_rates_embedded.shape}; Y shape is {clean_stims_processed.shape}")
xval.fit(clean_rates_embedded.values, clean_stims_processed.values);

In [None]:
# Mean cross-validation test score (error bars: std across folds) against the
# natural log of each candidate alpha.
cv_results = pd.DataFrame(xval.cv_results_)
# plt.plot(alpha_candidates, cv_results.mean_test_score, 'o')
plt.errorbar(np.log(alpha_candidates), cv_results.mean_test_score, yerr=cv_results.std_test_score, fmt="o")

In [None]:
# Pull the winning penalty out of the grid search; best_alpha is reused when refitting below.
best_alpha_idx = xval.best_index_
best_alpha = xval.best_params_["ridge__alpha"]
print(f"Best alpha: {best_alpha}; best score: {xval.best_score_}")

### Test decoding

Using the best alpha, iterate through the stimuli, holding each out as test data while fitting the model to the remaining stimuli.

Then iterate through the noise levels to generate predictions for responses to clean and noisy stimuli.

In [None]:
# Hold out one stimulus as the test set and fit on all the others using the best alpha.
example_stim = stim_names[1]
X_test = clean_rates_embedded.loc[example_stim]
Y_test = clean_stims_processed.loc[example_stim]
X_train = clean_rates_embedded.drop(example_stim)
Y_train = clean_stims_processed.drop(example_stim)
# set_params updates `ridge` in place and returns it, so `fitted` is the same pipeline object
ridge.set_params(ridge__alpha=best_alpha)
fitted = ridge.fit(X_train.values, Y_train.values)
pred = fitted.predict(X_test)

In [None]:
# Actual spectrogram of the held-out stimulus (top) and its decoded reconstruction (bottom).
fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(1.6,1.2), dpi=300)
extent = df_extent(Y_test.T)
# keep the time limits but plot channel number on the y axis
channel_extent = (extent[0], extent[1], 0, Y_test.columns.size)
panels = (("Actual", Y_test.T), ("Decoded", pred.T))
for ax, (title, image) in zip(axes, panels):
    ax.imshow(image, extent=channel_extent, origin="lower", aspect="auto")
    ax.set_title(title)
tick_channels = (0, 19, 39)
for ax in axes:
    ax.tick_params(axis='y', which='both', left=False, right=False, labelleft=True)
    ax.set_yticks(list(tick_channels))
    # label the ticks with the channel frequency divided by 1000
    ax.set_yticklabels([f"{Y_test.columns[i] / 1000:.0f}" for i in tick_channels])
fig.subplots_adjust(hspace=0.6, wspace=0.1)

In [None]:
# write the actual/decoded comparison to ../figures, named after the site and the held-out stimulus
fig.savefig(f"../figures/{site_name}_{example_stim}_decoded.pdf")

In [None]:
# display the image extent computed from Y_test.T; it is reused by the noise-invariance plots below
extent

In [None]:
# score of the fitted pipeline on the held-out stimulus
fitted.score(X_test, Y_test)

### Noise invariance

To test for noise invariance, we use the fitted model to decode the responses to motifs embedded in colony noise, then compare to the prediction generated from the response to the clean stimulus. In essence, we're first projecting the response into the 40-dimensional stimulus space and then assessing how much it changes when we add noise. We haven't used the noisy-stimulus data yet, so it also has to be preprocessed.

In [None]:
# every background level except -100 (the level selected as the clean recording earlier)
noise_levels = recording.index.get_level_values("background-dBFS").unique().drop(-100)
# pick the last level and pull out the example stimulus's trials at that level
noise_level = noise_levels[-1]
# drop_level=False keeps "stimulus" in the index so the later groupby on it still works
noise_recording = recording.loc[noise_level].xs(example_stim, level="stimulus", drop_level=False)

In [None]:
# Decode the example stimulus from responses at each noise level and compare each
# reconstruction with both the actual spectrogram and the clean-response prediction.
pred_invariance = []
fig, axes = plt.subplots(nrows=noise_levels.size + 1, ncols=1, sharex=True, figsize=(1.25,3.2), dpi=300)

# top panel: prediction from the clean response
clean_ax = axes[0]
clean_ax.imshow(pred.T, extent=extent, origin="lower", aspect="auto")
clean_ax.set_ylabel("70", rotation="horizontal")
clean_ax.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)

for noise_ax, noise_level in zip(axes[1:], noise_levels):
    # same preprocessing as for the clean data: pool, bin, delay-embed
    noise_recording = recording.loc[noise_level].xs(example_stim, level="stimulus", drop_level=False)
    pooled_noise = noise_recording.groupby(["unit", "stimulus"]).agg(
        events=("events", pool_spikes),
        trials=("events", len),
        interval_end=("interval_end", "max"),
    )
    noise_rate_data = pooled_noise.groupby("stimulus").apply(bin_responses)
    noise_rates_embedded = noise_rate_data.groupby("stimulus").apply(delay_embed_trial)
    # align will also pick out the matching stimulus
    noise_rates_embedded, noise_stims_processed = noise_rates_embedded.align(stims_processed, join='left', axis=0)
    # these checks should hold if the stimuli were not repeated
    assert noise_rates_embedded.shape[0] == noise_stims_processed.shape[0], "dimensions of data don't match"
    assert all(noise_rates_embedded.index == noise_stims_processed.index), "indices of data don't match"

    pred_noisy = fitted.predict(noise_rates_embedded)
    summary = {
        "background-dBFS": noise_level,
        "score_actual": fitted.score(noise_rates_embedded, Y_test),
        "cor_actual": compare_spectrograms_cor(Y_test.values, pred_noisy),
        "score_pred_clean": fitted.score(noise_rates_embedded, pred),
        "cor_pred_clean": compare_spectrograms_cor(pred, pred_noisy),
    }
    pred_invariance.append(summary)

    noise_ax.imshow(pred_noisy.T, extent=extent, origin="lower", aspect="auto")
    noise_ax.set_ylabel(f"{-30 - noise_level}", rotation="horizontal")
    noise_ax.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)

# only the bottom panel keeps its x axis and frame
for ax in axes[:-1]:
    ax.get_xaxis().set_visible(False)
    ax.set_frame_on(False)
fig.subplots_adjust(hspace=0.1, wspace=0.1)

In [None]:
# write the noise-invariance figure to ../figures, named after the site and the example stimulus
fig.savefig(f"../figures/{site_name}_{example_stim}_noise_invariance.pdf")

In [None]:
# one row per noise level, with the scores and correlations collected above
pd.DataFrame(pred_invariance)

In [None]:
# pred_noisy was produced by fitted.predict on this same input, so this scores the
# model's output against itself (uses the values left over from the last loop iteration)
fitted.score(noise_rates_embedded, pred_noisy)

In [None]:
# display the name of the stimulus used for the example figures above
example_stim