In [None]:
%matplotlib widget
import numpy as np
import matplotlib.pyplot as plt
import flammkuchen as fl
from pathlib import Path

In [None]:
import pandas as pd

In [None]:
from scipy.stats import zscore

In [None]:
# Imaging acquisition rate (frames per second) and the matching frame interval in seconds.
fps = 1.5
dt_imaging = 1 / fps
# NOTE(review): machine-specific absolute path; it must be edited to run the notebook elsewhere.
data_root = Path(r"C:\Users\vilim\analysis\lsmlsda_data\whole_brain")
# The loaded object is later unpacked as a 2-D array of shape (n_rois, n_t_imaging).
traces_path = data_root / "traces_better_deconvolved.h5"
traces = fl.load(str(traces_path))

In [None]:
# Standardise every row of `traces` along axis 1 (zero mean, unit variance per trace).
# Rows that come out containing NaN are removed in the following cells.
normalized_traces = zscore(traces, axis=1)

Clear out traces with NaNs

In [None]:
# Boolean row mask: True for traces that contain no NaN in any time bin.
has_nan = np.isnan(normalized_traces).any(axis=1)
sel = ~has_nan

In [None]:
# Keep only the NaN-free rows selected by `sel`.
# NOTE(review): this rebinds `normalized_traces`; re-running the cell without first
# re-running the z-scoring cell applies the old mask to an already-filtered array.
normalized_traces = normalized_traces[sel, :]

In [None]:
# Take the ROI count from the NaN-filtered matrix: that is the array which is
# plotted and analysed from here on, and it can have fewer rows than `traces`.
n_rois, n_t_imaging = normalized_traces.shape
# Time stamp (s) of every imaging frame, starting at 0 and spaced by dt_imaging.
t_imaging = np.arange(n_t_imaging) * dt_imaging

In [None]:
# ROI coordinates, restricted with the same `sel` mask that was applied to the traces
# so that row i of `coords` still belongs to row i of `normalized_traces`.
coords_file = data_root / "coords.h5"
coords = fl.load(str(coords_file))[sel, :]

## Plotting the traces

Normalize the data (so that each trace has a mean 0 and variance 1) and plot all traces together as a heatmap.

In [None]:
# Heatmap of all normalized traces: one row per ROI, one column per imaging frame.
# The row count is read from the plotted array itself so the axis range always
# matches what is drawn.
n_rows_shown = normalized_traces.shape[0]
fig, ax = plt.subplots()
heatmap = ax.imshow(
    normalized_traces,
    aspect="auto",
    vmin=-2,
    vmax=2,
    cmap="RdBu_r",
    # imshow draws row 0 at the top by default (origin="upper"); the extent is
    # (left, right, bottom, top), so bottom=n_rows_shown and top=0 keep the
    # "ROI #" tick labels aligned with the row indices.
    extent=[0, t_imaging[-1], n_rows_shown, 0],
)
ax.set_xlabel("Time [s]")
ax.set_ylabel("ROI #")
fig.colorbar(heatmap)

# Regression

In [None]:
from scipy import signal
from scipy.interpolate import interp1d

In [None]:
# In this part we will correlate the individual traces (original traces, not the ones
# averaged over trials) with sensory and motor regressors.
# To do so, first load the behavioural log and the stimulus log.
# Paths are converted with str(), consistent with the other fl.load calls in this notebook.
stimulus_log = fl.load(str(data_root / "stimulus_log.h5"))
behavior_log = fl.load(str(data_root / "behavior_log.h5"))

In [None]:
def upsample_double_decimate(t_orig, sig_orig, t_imaging, n_decimate=5):
    """Resample a signal onto the imaging time base with anti-alias filtering.

    The signal is linearly interpolated onto a regular grid that is
    ``n_decimate**2`` times denser than ``t_imaging`` and then decimated twice
    by ``n_decimate`` (IIR filter each time), which yields exactly one output
    sample per entry of ``t_imaging``.

    Parameters
    ----------
    t_orig : array-like
        Sample times of the original signal.
    sig_orig : array-like
        Signal values at ``t_orig``.
    t_imaging : array-like
        Regularly spaced target times; the spacing is taken from the first two
        entries, so at least two entries are required.
    n_decimate : int, optional
        Decimation factor of each of the two passes (total factor
        ``n_decimate**2``).

    Returns
    -------
    numpy.ndarray
        Filtered signal with ``len(t_imaging)`` samples.
    """
    dt_imaging = t_imaging[1] - t_imaging[0]
    n_up = n_decimate ** 2
    # Dense grid anchored at the first imaging time stamp, so that a time base
    # that does not start at 0 is resampled at the right instants.
    t_dense = t_imaging[0] + np.arange(len(t_imaging) * n_up) * dt_imaging / n_up

    # Anything outside the recorded range of the original signal is filled with 0.
    interpolated = interp1d(t_orig, sig_orig, bounds_error=False, fill_value=0)(t_dense)

    # Decimate in two passes: scipy recommends calling decimate several times
    # when the total downsampling factor is larger than 13
    # (see https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.decimate.html)
    first_pass = signal.decimate(interpolated, n_decimate, ftype="iir")
    return signal.decimate(first_pass, n_decimate, ftype="iir")

## Creating the regressors
### Motor regressor
The motor regressor we would like to have is a general measure of the fish's swimming power. Such a regressor can be based on the standard deviation (SD) of the tail angle during the experiment. 
The behaviour of the fish was recorded and saved in the file "behavior_log.h5". In this DataFrame you will see the different angles of the segments of the fish tail, as well as the variable "tail_sum". The motor regressor should be a moving SD of tail_sum. 

In [None]:
# Motor regressor: moving standard deviation ("vigor") of the summed tail angle.

# Length of the moving window, in the same time units as behavior_log.t.
vig_win = 0.05
# Sampling interval of the behaviour log, estimated from samples 100-200.
dt_beh = np.diff(behavior_log.t[100:200]).mean()
# Window length expressed in behaviour samples (truncated towards zero).
n_vig = int(vig_win / dt_beh)

tail_sum = behavior_log['tail_sum'].values

# min_periods=1 lets the window start at the first sample; the first value is
# still NaN because the SD of a single sample is undefined.
vigor = behavior_log.tail_sum.rolling(n_vig, min_periods=1).std()

In [None]:
# Vigor over the whole experiment, on the behaviour-log time base.
fig_vigor, ax_vigor = plt.subplots()
ax_vigor.plot(behavior_log.t, vigor)
ax_vigor.set_xlabel("time [s]")
ax_vigor.set_ylabel("vigor [a.u.]")

### Sensory regressors
Creating two regressors for the stimulus (stimulus speed).
From the stimulus_log, get the variable "gain_lag_cl1D_vel". This is the velocity of the moving gratings. We will use this trace to create two regressors - one for positive velocity and one for negative velocity. Use the interpolation method defined above (`upsample_double_decimate`).

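First, we resample the stimulation data so that it is equally spaced in time, at 25 times the imaging frame rate (`n_decimate**2` with the default `n_decimate=5`), and then decimate it down to the imaging frame rate (another method is the one demonstrated above for vigor)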

In [None]:
# Grating velocity resampled to one value per imaging frame.
velocity = upsample_double_decimate(
    t_orig=stimulus_log.t,
    sig_orig=stimulus_log.gain_lag_cl1D_vel,
    t_imaging=t_imaging,
)

By the coordinate system convention, **negative velocity is forward** for the fish

In [None]:
# Resampled stimulus velocity on the imaging time base.
fig_vel, ax_vel = plt.subplots()
ax_vel.plot(t_imaging, velocity)
ax_vel.set_xlabel("t [s]")
ax_vel.set_ylabel("v [mm/s]")

## Create the regressors

In [None]:
# Half-wave rectified velocity: one regressor per movement direction, each z-scored.
positive_vel = zscore(np.clip(velocity, 0, None))
negative_vel = zscore(np.clip(-velocity, 0, None))
# NaNs in the moving SD are replaced by 0 before resampling to the imaging frames.
# NOTE(review): `vigor` is rebound from the behaviour-rate series to the
# imaging-rate regressor here, so this cell is meant to run once after the
# cell that computes the moving SD.
vigor_no_nan = np.nan_to_num(vigor)
vigor = zscore(upsample_double_decimate(behavior_log.t.values, vigor_no_nan, t_imaging))

### Correlating the traces with the regressors
At this point you will correlate each calcium trace with the three regressors.

In [None]:
# Regressor matrix: one row per imaging frame, one column per regressor
# (positive velocity, negative velocity, vigor).
regressors = np.stack((positive_vel, negative_vel, vigor), axis=1)

In [None]:
# Both the traces and the regressors are z-scored, so the dot product divided by
# the number of frames is one correlation value per (ROI, regressor) pair.
correlations = normalized_traces.dot(regressors) / n_t_imaging
regression_results = pd.DataFrame(
    correlations,
    columns=["vel_pos", "vel_neg", "vigor"],
)

### Plot the best fitted neuron for each of the regressors

In [None]:
# Row index of the highest (NaN-ignoring) value in every column of the table.
best_indices = np.nanargmax(regression_results.to_numpy(), axis=0)

In [None]:
# One panel per regressor: the best-correlated ROI trace overlaid on its regressor.
fig, axes = plt.subplots(3, 1, sharex=True)
for i_reg, (ax, name) in enumerate(zip(axes, regression_results.columns)):
    # Legend label spelling corrected ("fluorescence").
    ax.plot(t_imaging, normalized_traces[best_indices[i_reg], :], label="fluorescence")
    ax.plot(t_imaging, regressors[:, i_reg], label="regressor")
    ax.set_ylabel(name)
axes[0].legend()
# The x axis is shared, so only the bottom panel carries the label.
axes[-1].set_xlabel("t [s]")

## Show regression coefficients in brain coordinates

In [None]:
# Attach the ROI coordinates to the correlation table.
# Only the three regressor columns are taken from the existing table, so
# re-running this cell does not keep appending duplicate z/y/x columns.
regression_results = pd.concat(
    [
        regression_results[["vel_pos", "vel_neg", "vigor"]],
        pd.DataFrame(coords, columns=["z", "y", "x"]),
    ],
    axis=1,
)

In [None]:
# One scatter map per regressor; zip stops after the three axes, so only the
# first three columns of the table are drawn.
fig, axes = plt.subplots(1, 3, sharex=True, sharey=True)
for ax, name in zip(axes, regression_results.columns):
    ax.scatter(
        regression_results["x"],
        regression_results["y"],
        c=regression_results[name],
        s=0.2,
        vmin=-1,
        vmax=1,
        cmap="RdBu_r",
    )
    ax.set_title(name)
    ax.set_aspect(1)

Forward velocity and vigor are strongly correlated, which is why their maps look almost the same. Doing a proper multi-linear regression with sparsity constraints might allow us to tease apart the two cases.

## Average trials

Create trial-averaged traces. Each trial is 180 seconds. This will show a cleaner stimulus-related response

In [None]:
# Number of stimulus repetitions in the recording.
n_trials = 9
# Duration of one trial in seconds.
trial_duration = 180.0

In [None]:
# Number of imaging frames in each trial.
n_t_trial = n_t_imaging // n_trials

# Drop trailing frames that do not fill a whole trial before reshaping; without
# this the reshape raises (or silently mixes rows) whenever the recording length
# is not an exact multiple of n_trials.
n_t_used = n_trials * n_t_trial
# Resulting axes: (ROI, trial, frame within trial).
grouped_traces = normalized_traces[:, :n_t_used].reshape(-1, n_trials, n_t_trial)

In [None]:
# Average over the trial axis (axis 1), leaving one mean response per ROI.
traces_per_trial = grouped_traces.mean(axis=1)

In [None]:
# Time axis of a single trial, starting at 0 and spaced by the imaging frame interval.
n_frames_trial = traces_per_trial.shape[1]
t_imaging_trial = np.arange(n_frames_trial) * dt_imaging

In [None]:
# Heatmap of the trial-averaged traces: one row per ROI, one column per frame of a trial.
# The row count is read from the plotted array itself so the axis range always
# matches what is drawn.
n_rows_shown = traces_per_trial.shape[0]
fig, ax = plt.subplots()
heatmap = ax.imshow(
    traces_per_trial,
    aspect="auto",
    vmin=-2,
    vmax=2,
    cmap="RdBu_r",
    # imshow draws row 0 at the top by default (origin="upper"); the extent is
    # (left, right, bottom, top), so bottom=n_rows_shown and top=0 keep the
    # "ROI #" tick labels aligned with the row indices.
    extent=[0, t_imaging_trial[-1], n_rows_shown, 0],
)
ax.set_xlabel("Time [s]")
ax.set_ylabel("ROI #")
fig.colorbar(heatmap)

# Dimensionality reduction and clustering

Extract the principal components of the trial-averaged response.

In [None]:
from sklearn.decomposition import PCA

Run the PCA

In [None]:
# Fit 100 principal components on the trial-averaged traces (rows = ROIs,
# columns = time bins). A fixed random_state keeps the result reproducible
# when scikit-learn selects its randomized SVD solver.
pca = PCA(n_components=100, random_state=0)
# `loadings` holds one row per ROI with its coefficient on every component.
loadings = pca.fit_transform(traces_per_trial)

## Plot the first 3 PCs

In [None]:
# Time courses of the first three principal components (one line per component).
fig_pcs, ax_pcs = plt.subplots()
first_three_pcs = pca.components_[:3, :]
ax_pcs.plot(first_three_pcs.T)

In [None]:
# Cumulative fraction of variance explained as components are added.
fig_var, ax_var = plt.subplots()
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
ax_var.plot(cumulative_variance)

Plot the variance explained by each component and try to establish how many components you need to explain everything that is not noise. Extra credit: do cross validated PCA (fit the PCs on average traces of some trials, and check how many components you need to explain other trials)

Can you interpret the principal components in terms of stimulus?

## PC trajectories

Plot the neural activity of the whole brain as a phase-space plot (extra credit: encode time or stimulus value in the color)

In [None]:
# Phase-space trajectory: PC 1 against PC 2 across one trial, with every time
# point coloured by the stimulus velocity of the first n_t_trial frames.
pc1, pc2 = pca.components_[0, :], pca.components_[1, :]
fig_traj, ax_traj = plt.subplots()
ax_traj.plot(pc1, pc2, color="gray", lw=0.5)
ax_traj.scatter(pc1, pc2, c=velocity[:len(t_imaging_trial)])
ax_traj.set_xlabel("PC 1")
ax_traj.set_ylabel("PC 2")


## Clustering

Use K means clustering to classify neurons by principal component loading (using all components that are not noise)

In [None]:
from sklearn.cluster import KMeans

Plot the neurons in the space of principal component loading coefficients (for PC1 and PC2) and color them by cluster

In [None]:
# Assign every ROI to one of three clusters based on its PC loadings.
# K-means starts from a random initialisation; fixing random_state makes the
# cluster labels (and therefore the colours in the plots below) reproducible.
clust_ids = KMeans(n_clusters=3, random_state=0).fit_predict(loadings)

In [None]:
# ROIs in the plane of the first two PC loadings, coloured by cluster.
# Column 1 is PC 2; the previous code plotted column 2 (PC 3) although the
# accompanying text asks for PC1 vs PC2.
plt.figure()
plt.scatter(loadings[:, 0], loadings[:, 1], c=clust_ids)
plt.xlabel("PC 1 loading")
plt.ylabel("PC 2 loading")

Are the clusters showing discrete response classes? What are the assumptions of K-Means, and does this dataset satisfy them?

## Clusters in anatomical space

(in the readme now there is a link to the coords file)

In [None]:
# Scale factors applied to the coordinate columns when plotting in anatomical space.
# NOTE(review): presumably the physical size per pixel (x, y) and per plane (z) — TODO confirm units.
dx = 0.6
dy = 0.6
dz = 7.0

In [None]:
# Three orthogonal views of the ROIs, coloured by cluster.
# `coords` columns are ordered (z, y, x), as used when the coordinates were added
# to the regression table, so each column is scaled with its own factor
# (the previous code applied dx to the y column and dy to the x column).
z_pos = coords[:, 0] * dz
y_pos = coords[:, 1] * dy
x_pos = coords[:, 2] * dx

fig, ax = plt.subplots(2, 2)
# Top-left: y (horizontal) against x (vertical).
ax[0, 0].scatter(y_pos, x_pos, s=0.1, c=clust_ids)
ax[0, 0].set_aspect(1)
# Tick labels are hidden because the panel below shows the same horizontal axis.
ax[0, 0].set_xticklabels([])

# Bottom-left: y (horizontal) against z (vertical).
ax[1, 0].scatter(y_pos, z_pos, s=0.1, c=clust_ids)
ax[1, 0].set_aspect(1)

# Top-right: z (horizontal) against x (vertical).
ax[0, 1].scatter(z_pos, x_pos, s=0.1, c=clust_ids)
ax[0, 1].set_aspect(1)
# Tick labels are hidden because the top-left panel shows the same vertical axis.
ax[0, 1].set_yticklabels([])
# The fourth panel is unused.
ax[1, 1].axis("off")

Now, color the cells according to principal component loading or cluster assignment

# Decode the velocity from the traces

Split the velocity and the traces into a training and a test set. Choose carefully so that most conditions are well represented

In [None]:
# Reproducible random assignment of whole trials to the test and training sets.
np.random.seed(1234)
all_trials = np.random.permutation(n_trials)
n_test = 2

# The first n_test shuffled trials are held out; the remaining ones are used for fitting.
trials_test = all_trials[:n_test]
trials_train = all_trials[n_test:]

# Traces: concatenate the selected trials along the time axis (axis 1).
traces_train = np.concatenate(
    [grouped_traces[:, trial, :] for trial in trials_train], axis=1
)
traces_test = np.concatenate(
    [grouped_traces[:, trial, :] for trial in trials_test], axis=1
)

# Velocity: cut out the same trials, in the same order, from the full-length trace.
vel_train = np.concatenate(
    [velocity[trial * n_t_trial:(trial + 1) * n_t_trial] for trial in trials_train],
    axis=0,
)
vel_test = np.concatenate(
    [velocity[trial * n_t_trial:(trial + 1) * n_t_trial] for trial in trials_test],
    axis=0,
)

Use methods from scikit-learn, starting with sklearn.linear_model.LinearRegression (or write your own linear regression!), use the fit and predict methods to decode velocity

In [None]:
from sklearn.linear_model import RidgeCV

E.g. for a linear model:
    $$v(t) = \sum_{i=1}^{n_\mathrm{neurons}} w_i \, a_i(t)$$

We have a lot of features (>8000!), so we need to prevent overfitting by penalizing large weights. Scikit-learn provides a cross-validated version of sum-of-square weights penalized linear regression, `RidgeCV`

In [None]:
# Ridge regression with the penalty strength chosen by cross-validation
# from a logarithmically spaced grid of candidates.
candidate_alphas = [1.0, 10.0, 100.0, 1000.0, 10000.0]
decoder = RidgeCV(alphas=candidate_alphas)

In [None]:
# Fit with time points as samples (rows) and ROIs as features (columns).
decoder.fit(X=traces_train.T, y=vel_train)

In [None]:
# Penalty strength that RidgeCV selected from the candidate `alphas`.
decoder.alpha_

In [None]:
# Decode the velocity of the held-out trials (time points as rows, ROIs as columns).
test_features = np.transpose(traces_test)
vel_test_pred = decoder.predict(test_features)

Plot the decoded velocity vs the real velocity, in time and as a scatter plot. Which regions of the stimulus space are decoded best?

In [None]:
# Decoded against true velocity for the held-out trials (x axis: frame index).
fig_dec, ax_dec = plt.subplots()
ax_dec.plot(vel_test_pred, label="prediction")
ax_dec.plot(vel_test, label="true velocity")
ax_dec.legend()

## Extra credit 
* try to determine how many cells you need to decode the velocity. Which cells are the most important ones, if there are such?
* do nonlinear decoding methods (e.g. neural networks, also available with the same interface in scikit-learn) improve the decoding?
* try to decode behavior