In [1]:
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import kachery as ka
import os
from utils import prepare_dataset_from_hash
import numpy as np
import spikesorters as ss

# The default temp directory may reach its storage capacity while the spike
# sorters are running, so point temporary files at a directory of your own.
tmp_path = Path('/home/mclancy/SpikeConfidence/.tmp/')
os.environ.update({
    env_name: str(tmp_path)
    for env_name in ("TMP", "TMPDIR", "TEMPDIR", "ML_TEMPORARY_DIRECTORY")
})
tmp_path.mkdir(parents=True, exist_ok=True)


# kachery storage directory; it has to exist before use, so it is created here.
kache_path = Path("/data/.kache")
os.environ["KACHERY_STORAGE_DIR"] = str(kache_path)
kache_path.mkdir(parents=True, exist_ok=True)

# Configure kachery to download data from the public database.
ka.set_config(fr='default_readonly')

base_dir = Path('/home/mclancy/SpikeConfidence/')
spike_sorter_dir = base_dir.joinpath('spikesorters')

# Tell spikesorters where the non-Python sorters are installed.
ss.Kilosort2_5Sorter.set_kilosort2_5_path(spike_sorter_dir.joinpath('Kilosort'))
ss.IronClustSorter.set_ironclust_path(spike_sorter_dir.joinpath('ironclust'))

# Cache currently holding the sortings and metrics for the recordings.
cache_path = Path('/data/.cache')

  _pull_git_repo(path=config_repo_path + '/repo')
  _pull_git_repo(path=config_repo_path + '/repo')


Setting KILOSORT2_5_PATH environment variable for subprocess calls to: /home/mclancy/SpikeConfidence/spikesorters/Kilosort
Setting IRONCLUST_PATH environment variable for subprocess calls to: /home/mclancy/SpikeConfidence/spikesorters/ironclust


In [7]:
# Sorters to run and metrics to calculate for the regressions below.
# (Recording and ground-truth paths are defined in a separate cell.)

sorter_names = [
    'mountainsort4',
    'herdingspikes',
    'spykingcircus',
    'ironclust',
    'tridesclous',
]

# Order matters: it is passed as-is to prepare_dataset_from_hash via metric_names.
metric_names = np.array([
    "num_spikes",
    "firing_rate",
    "presence_ratio",
    "isi_violation",
    "amplitude_cutoff",
    "snr",
    "max_drift",
    "cumulative_drift",
    "silhouette_score",
    "isolation_distance",
    "l_ratio",
    "nn_hit_rate",
    "nn_miss_rate",
    "d_prime",
])

In [3]:
# All three recordings live under the same sha1dir directory; each ground-truth
# file is the firings_true.mda stored inside its recording directory.
_static_siprobe_dir = 'sha1dir://615aa23efde8898aa89002613e20ad59dcde42f9.hybrid_janelia/static_siprobe'

static_siprobe1_recording_path = f'{_static_siprobe_dir}/rec_16c_1200s_11'
static_siprobe1_gt_path = f'{static_siprobe1_recording_path}/firings_true.mda'

static_siprobe2_recording_path = f'{_static_siprobe_dir}/rec_16c_1200s_21'
static_siprobe2_gt_path = f'{static_siprobe2_recording_path}/firings_true.mda'

static_siprobe3_recording_path = f'{_static_siprobe_dir}/rec_16c_1200s_31'
static_siprobe3_gt_path = f'{static_siprobe3_recording_path}/firings_true.mda'

# Parallel lists: gt_paths[i] is the ground truth for recording_paths[i].
recording_paths = [
    static_siprobe1_recording_path,
    static_siprobe2_recording_path,
    static_siprobe3_recording_path,
]
gt_paths = [
    static_siprobe1_gt_path,
    static_siprobe2_gt_path,
    static_siprobe3_gt_path,
]

Could not build regressor for tridesclous with data provided; Likely that there are no instances of false positives from this sorter for these data This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0.0
{'mountainsort4': 0.8823529411764706, 'herdingspikes': 0.9230769230769231, 'spykingcircus': 1.0, 'ironclust': 1.0, 'tridesclous': None}


In [11]:
# How do sorter specific regressions perform on unseen data from the same dataset they were trained on?
#
# Results are cached in results/single_sorter_accuracies.npy. If that file exists it is
# loaded and nothing is recomputed, so delete it after changing sorter_names,
# metric_names or the recording/ground-truth paths.

try:
    # The cache stores a dict, which np.save writes as a pickled 0-d object array;
    # hence allow_pickle=True and .item(). NOTE: unpickling can execute arbitrary
    # code, so only load cache files produced by this notebook.
    accuracies = np.load('results/single_sorter_accuracies.npy', allow_pickle=True).item()
except FileNotFoundError:
    # Create the output directory up front so that np.save at the end cannot fail
    # with FileNotFoundError after every model has already been fitted.
    Path('results').mkdir(parents=True, exist_ok=True)

    accuracies = {}
    for sorter_name in sorter_names:
        # Build the dataset from this sorter's results only.
        X, y = prepare_dataset_from_hash(recording_paths=recording_paths, gt_paths=gt_paths, metric_names=metric_names, sorter_names=[sorter_name], cache_path=cache_path)

        # Shuffled and split into train/test sets (fixed seed for reproducibility).
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

        # False positive classification via logistic regression
        model = make_pipeline(StandardScaler(), LogisticRegression())

        try:
            model.fit(X_train, y_train)
            model_accuracy = model.score(X_test, y_test)
            accuracies[sorter_name] = model_accuracy
        except ValueError as e:
            # Raised e.g. when the training labels contain a single class; record
            # None for this sorter and carry on with the remaining ones.
            accuracies[sorter_name] = None
            print(f"Could not build regressor for {sorter_name} with data provided; Likely that there are no instances of false positives from this sorter for these data", e)

    np.save('results/single_sorter_accuracies.npy', accuracies)

print(accuracies)



Could not build regressor for tridesclous with data provided; Likely that there are no instances of false positives from this sorter for these data This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0.0
{'mountainsort4': 0.8823529411764706, 'herdingspikes': 0.9230769230769231, 'spykingcircus': 1.0, 'ironclust': 1.0, 'tridesclous': None}


There are not enough false positives in each individual sorter's results to build a separate classifier for every sorter. We must either use more datasets or merge the per-sorter classifiers into one general classifier:

In [10]:
# Train one general classifier on the pooled results of all sorters.
recording_paths = [static_siprobe1_recording_path, static_siprobe2_recording_path, static_siprobe3_recording_path]
gt_paths = [static_siprobe1_gt_path, static_siprobe2_gt_path, static_siprobe3_gt_path]

# The result is cached in results/general_sorter_accuracy.npy. If that file exists it
# is loaded and nothing is recomputed, so delete it after changing the inputs.
try:
    # np.load returns a 0-d array for a saved scalar; .item() unwraps it so that
    # `accuracy` is a plain scalar rather than a 0-d array on the cached path.
    accuracy = np.load('results/general_sorter_accuracy.npy').item()
except FileNotFoundError:
    # Create the output directory up front so that np.save at the end cannot fail
    # with FileNotFoundError after the model has already been fitted.
    Path('results').mkdir(parents=True, exist_ok=True)

    X, y = prepare_dataset_from_hash(recording_paths=recording_paths, gt_paths=gt_paths, metric_names=metric_names, sorter_names=sorter_names, cache_path=cache_path)

    # Shuffled and split into train/test sets (fixed seed for reproducibility).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

    # False positive classification via logistic regression
    model = make_pipeline(StandardScaler(), LogisticRegression())
    model.fit(X_train, y_train)

    accuracy = model.score(X_test, y_test)
    np.save('results/general_sorter_accuracy.npy', accuracy)

print(accuracy)

0.88


  and should_run_async(code)
