# Hyperparameter searches

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import json
import pickle

import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from torch.nn import MSELoss
from torch.nn.functional import cross_entropy

from thesis_project.settings import EXPERIMENT1_DIR, EXPERIMENT2_DIR, RESULT_DIR
from thesis_project.data_loading import (
    construct_spikerates_filename,
    load_session_ids,
    load_spike_data,
)
from thesis_project.models.rnn_encoder_only import RNNEncoderOnly
from thesis_project.models.transformer_encoder_decoder import TransformerEncoderDecoder
from thesis_project.models.transformer_encoder_only import TransformerEncoderOnly
from thesis_project.parameter_optimization.nn_optimization import NNOptimization
from thesis_project.parameter_optimization.svm_optimization import SVMOptimization
from thesis_project.preprocessing.tokenization import SingleWordTokenizer
from thesis_project.training.metrics import get_classification_metrics
from thesis_project.training.metrics import get_regression_metrics
from thesis_project.models import OutputType
from thesis_project.training.metrics import get_sequence_classification_metrics
from thesis_project.preprocessing.label_preparation import prepare_spikerates_for_session


# Experiment 1

## Train / test split
Generate the train / test split to be used in experiments. \
We create 5 stratified folds per session but only train on one of them (selected via `current_fold_idx` below).

In [None]:
def create_train_test_split(
    k_folds: int = 5, shuffle: bool = True, random_state: int = None
) -> dict:
    """Create stratified k-fold train/test index splits for every session.

    For each session id returned by ``load_session_ids`` the word labels of the
    ``<session_id>_naming`` recording are loaded and split with
    ``StratifiedKFold``, stratifying on those labels.

    Args:
        k_folds: Number of folds created per session.
        shuffle: Forwarded to ``StratifiedKFold``.
        random_state: Forwarded to ``StratifiedKFold``.

    Returns:
        A dict mapping each session id to a list with ``k_folds`` entries; each
        entry is ``{"train_ids": [...], "test_ids": [...]}`` holding the index
        arrays converted to plain lists.
    """
    splits_by_session = {}

    for session_id in load_session_ids(data_dir=EXPERIMENT1_DIR):
        labels_words, _, _, _ = load_spike_data(
            f"{EXPERIMENT1_DIR}/{session_id}_naming"
        )

        stratifier = StratifiedKFold(
            n_splits=k_folds, shuffle=shuffle, random_state=random_state
        )

        # The splitter only needs X for its length; the labels passed as y
        # drive the stratification, so a zero array serves as a placeholder.
        placeholder = np.zeros(len(labels_words))

        folds = []
        for train_ids, test_ids in stratifier.split(placeholder, y=labels_words):
            folds.append(
                {"train_ids": train_ids.tolist(), "test_ids": test_ids.tolist()}
            )
        splits_by_session[session_id] = folds

    return splits_by_session

In [None]:
# Settings for generating a split. These values equal the defaults of
# create_train_test_split().
# NOTE(review): the commented-out regeneration call in this notebook does not
# pass them explicitly, so changing them here has no effect on it.
k_folds = 5
shuffle = True
random_state = None

# JSON file that stores the per-session folds.
train_test_path = f"{EXPERIMENT1_DIR}/train_test_split/train_test_split.json"

In [None]:
# # uncomment this code to create a new train_test_split

# train_test_split = create_train_test_split()

# with open(train_test_path, "w") as file:
#     file.write(json.dumps(train_test_split, indent=4))

In [None]:
# Read the stored train/test split back from its JSON file.

with open(train_test_path, "r") as file:
    train_test_split = json.load(file)

In [None]:
# Index of the stored fold that is used throughout this notebook.
current_fold_idx = 0

# For every session keep only the train/test ids of the selected fold.
current_fold_split = {
    sid: session_folds[current_fold_idx]
    for sid, session_folds in train_test_split.items()
}

In [None]:
# Sessions contained in the split, in the order they appear in the file.
session_ids = list(current_fold_split)

# Per session, the training ids of the selected fold.
limit_to_ids = {
    sid: fold_ids["train_ids"] for sid, fold_ids in current_fold_split.items()
}

In [None]:
# Combine the per-session splits into one index list so that all sessions can
# be trained on simultaneously. Each session's indices are shifted by the
# accumulated length of the spikerates of the sessions processed before it.
# Despite the name, only the "train_ids" of each session are collected.

combined_train_test_idx = []
current_len = 0

session_ids = list(current_fold_split)
for session_id in session_ids:
    current_train_idx = current_fold_split[session_id]["train_ids"]
    combined_train_test_idx += [current_len + idx for idx in current_train_idx]

    # The spikerates are loaded only for their length, which becomes the
    # offset of the next session.
    spikerates = prepare_spikerates_for_session(
        session_id=session_id,
        path=EXPERIMENT1_DIR,
        bin_size=50,
        blur_sd=None,
        experiment="experiment1",
    )
    current_len += len(spikerates)

## Classification

### SVM

In [None]:
# Random-search configuration for the SVM classifier (experiment 1).
svm_optimization_parameters = {
    "model_name": "svm",
    "task_name": "clf",
    "experiment": "experiment1",
    "session_ids": session_ids,
    "limit_to_ids": limit_to_ids,
    "preprocessing_methods": ["mean_sequence"],  # "flatten" is currently disabled
    "label_names": ["syncat_labels", "semcat_labels", "labels_words"],
    "binning_params": [
        {"bin_size": 50, "blur_sd": None},
        {"bin_size": 50, "blur_sd": 2},
        {"bin_size": 20, "blur_sd": None},
        {"bin_size": 20, "blur_sd": 2},
    ],
    "n_folds": 5,
    "n_repeats": 5,
    # Each hyperparameter maps a sampling kind to its arguments.
    # NOTE(review): "exp": [10, -1, 3] presumably means base 10 with exponents
    # between -1 and 3 -- confirm against SVMOptimization.
    "search_params": {
        "kernel": {"choice": ["linear", "rbf", "sigmoid"]},
        "C": {"exp": [10, -1, 3]},
        "gamma": {"choice": [0.1, 0.01, 0.001, 0.0001, "scale", "auto"]},
    },
    "optimization_type": "random",
    "data_dir": f"{EXPERIMENT1_DIR}",
    "output_dir": f"{RESULT_DIR}/final_results/hyperparameter_optimization/exp1/fold_1",
    "random_seed": None,
    "n_random_runs": 50,
    "metric_dict": get_classification_metrics(),
}

In [None]:
%%timeit -n 1 -r 1
# -n 1 -r 1: execute the cell body a single time and report how long it took.
# Build the SVM search from svm_optimization_parameters and run it.
svm_optimization = SVMOptimization(**svm_optimization_parameters)
svm_optimization.run(output_name="svm_clf")

### RNN

In [None]:
# Random-search configuration for the RNN classifier (experiment 1).
rnn_optimization_parameters = {
    "model_name": "rnn",
    "task_name": "clf",
    "session_ids": session_ids,
    "limit_to_ids": limit_to_ids,
    "preprocessing_methods": ["mean_sequence"],
    "label_names": ["syncat_labels", "semcat_labels", "labels_words"],
    "binning_params": [
        {"bin_size": 50, "blur_sd": None},
        {"bin_size": 50, "blur_sd": 2},
        {"bin_size": 20, "blur_sd": None},
        {"bin_size": 20, "blur_sd": 2},
    ],
    "n_folds": 5,
    "n_repeats": 1,
    # Each entry is a (sampling kind, arguments) pair.
    "search_params": {
        "learning_rate": ("exp", [-1, -5, 10]),
        "weight_decay": ("choice", [0, 1e-1, 1e-2, 1e-3]),
        "batch_size": ("fixed", 128),
    },
    "optimization_type": "random",
    "data_dir": EXPERIMENT1_DIR,
    "output_dir": f"{RESULT_DIR}/final_results/hyperparameter_optimization/exp1/fold_1",
    # Training settings that are not part of the search.
    "fixed_cv_params": {
        "loss_function": cross_entropy,
        "device_name": "cuda",
        "num_epochs": 100,
    },
    # Model hyperparameters, given in the same (kind, arguments) form.
    "model_params": {
        "device": ("fixed", "cuda"),
        "hidden_size": ("choice", [32, 64, 128, 256]),
        "dropout": ("uniform", (0.1, 0.8)),
        "n_layers": ("choice", [1, 2, 3]),
    },
    "random_seed": None,
    "n_random_runs": 20,
    "metric_dict": get_classification_metrics(),
}

In [None]:
# Build the search from rnn_optimization_parameters and run it; "rnn_clf" is
# passed to run() as the output name.
nn_optimization = NNOptimization(**rnn_optimization_parameters)
results = nn_optimization.run(output_name="rnn_clf")

### Transformer

In [None]:
# Random-search configuration for the transformer classifier (experiment 1).
transformer_optimization_parameters = {
    "model_name": "trf",
    "task_name": "clf",
    "session_ids": session_ids,
    "limit_to_ids": limit_to_ids,
    "preprocessing_methods": ["mean_sequence"],
    "label_names": ["syncat_labels", "semcat_labels", "labels_words"],
    "binning_params": [
        {"bin_size": 50, "blur_sd": None},
        {"bin_size": 50, "blur_sd": 2},
        {"bin_size": 20, "blur_sd": None},
        {"bin_size": 20, "blur_sd": 2},
    ],
    "n_folds": 5,
    "n_repeats": 1,
    # Each entry is a (sampling kind, arguments) pair.
    "search_params": {
        "learning_rate": ("exp", [-1, -5, 10]),
        "weight_decay": ("choice", [0, 1e-1, 1e-2, 1e-3]),
        "batch_size": ("fixed", 64),
    },
    "optimization_type": "random",
    "data_dir": EXPERIMENT1_DIR,
    "output_dir": f"{RESULT_DIR}/final_results/hyperparameter_optimization/exp1/fold_1",
    # Training settings that are not part of the search.
    "fixed_cv_params": {
        "loss_function": cross_entropy,
        "device_name": "cuda",
        "num_epochs": 100,
    },
    # Model hyperparameters, given in the same (kind, arguments) form.
    "model_params": {
        "device": ("fixed", "cuda"),
        "hidden_size": ("choice", [32, 64, 128, 256]),
        "dropout": ("uniform", (0.1, 0.8)),
        "n_layers": ("choice", [1, 2, 3]),
    },
    "random_seed": None,
    "n_random_runs": 10,
    "metric_dict": get_classification_metrics(),
}

In [None]:
# Build the search from transformer_optimization_parameters and run it;
# "trf_clf" is passed to run() as the output name.
nn_optimization = NNOptimization(**transformer_optimization_parameters)
results = nn_optimization.run(output_name="trf_clf")

## Regression

### SVR

In [None]:
def get_svr_optimization_params():
    """Return the random-search configuration for the SVR model (experiment 1).

    The dict is rebuilt on every call and reads the notebook-level
    ``session_ids`` and ``limit_to_ids`` at call time.

    Returns:
        Keyword arguments intended to be unpacked into ``SVMOptimization``.
    """
    # The "estimator__" prefix addresses parameters of a wrapped estimator.
    # NOTE(review): presumably the SVR sits inside a multi-output wrapper --
    # confirm against SVMOptimization.
    search_space = {
        "estimator__kernel": {"choice": ["linear", "rbf", "sigmoid"]},
        "estimator__C": {"exp": [10, -1, 3]},
        "estimator__gamma": {
            "choice": [0.1, 0.01, 0.001, 0.0001, "scale", "auto"]
        },
    }
    binning_grid = [
        {"bin_size": 50, "blur_sd": None},
        {"bin_size": 50, "blur_sd": 2},
        {"bin_size": 20, "blur_sd": None},
        {"bin_size": 20, "blur_sd": 2},
    ]

    return {
        "model_name": "svm",
        "task_name": "reg",
        "session_ids": session_ids,
        "limit_to_ids": limit_to_ids,
        "preprocessing_methods": ["mean_sequence"],
        "label_names": ["labels_words"],
        "binning_params": binning_grid,
        "n_folds": 5,
        "n_repeats": 1,
        "search_params": search_space,
        "optimization_type": "random",
        "data_dir": EXPERIMENT1_DIR,
        "output_dir": f"{RESULT_DIR}/final_results/hyperparameter_optimization/exp1/fold_1",
        "random_seed": None,
        "n_random_runs": 30,
        "output_type": OutputType.REGRESSION,
        "embedding": "glove-twitter-25",
        "metric_dict": get_regression_metrics(),
    }

In [None]:
# Build the SVR search from get_svr_optimization_params() and run it;
# "svm_reg" is passed to run() as the output name.
svr_optimization = SVMOptimization(**get_svr_optimization_params())
results = svr_optimization.run(output_name="svm_reg")

### RNN

In [None]:
# Random-search configuration for the RNN regression model (experiment 1).
rnn_regression_optimization_parameters = {
    "model_name": "rnn",
    "task_name": "reg",
    "session_ids": session_ids,
    "limit_to_ids": limit_to_ids,
    "preprocessing_methods": ["mean_sequence"],
    "label_names": ["labels_words"],
    "binning_params": [
        {"bin_size": 50, "blur_sd": None},
        {"bin_size": 50, "blur_sd": 2},
        {"bin_size": 20, "blur_sd": None},
        {"bin_size": 20, "blur_sd": 2},
    ],
    "n_folds": 5,
    "n_repeats": 1,
    # Each entry is a (sampling kind, arguments) pair.
    "search_params": {
        "learning_rate": ("exp", [-1, -5, 10]),
        "weight_decay": ("choice", [0, 1e-1, 1e-2, 1e-3]),
        "batch_size": ("fixed", 128),
    },
    "optimization_type": "random",
    "data_dir": EXPERIMENT1_DIR,
    "output_dir": f"{RESULT_DIR}/final_results/hyperparameter_optimization/exp1/fold_1",
    # Training settings that are not part of the search.
    "fixed_cv_params": {
        "loss_function": MSELoss(),
        "device_name": "cuda",
        "num_epochs": 100,
    },
    # Model hyperparameters, given in the same (kind, arguments) form.
    "model_params": {
        "device": ("fixed", "cuda"),
        "hidden_size": ("choice", [32, 64, 128, 256]),
        "dropout": ("uniform", (0.1, 0.8)),
        "n_layers": ("choice", [1, 2, 3]),
    },
    "random_seed": None,
    "output_type": OutputType.REGRESSION,
    "embedding": "glove-twitter-25",
    "n_random_runs": 50,
    "metric_dict": get_regression_metrics(),
}

In [None]:
# Build the search from rnn_regression_optimization_parameters and run it;
# "rnn_reg" is passed to run() as the output name.
nn_optimization = NNOptimization(**rnn_regression_optimization_parameters)
results = nn_optimization.run(output_name="rnn_reg")

### Transformer

In [None]:
# Random-search configuration for the transformer regression model
# (experiment 1).
transformer_regression_optimization_parameters = {
    "model_name": "trf",
    "task_name": "reg",
    "session_ids": session_ids,
    "limit_to_ids": limit_to_ids,
    "preprocessing_methods": ["mean_sequence"],
    "label_names": ["labels_words"],
    "binning_params": [
        {"bin_size": 50, "blur_sd": None},
        {"bin_size": 50, "blur_sd": 2},
        {"bin_size": 20, "blur_sd": None},
        {"bin_size": 20, "blur_sd": 2},
    ],
    "n_folds": 5,
    "n_repeats": 1,
    # Each entry is a (sampling kind, arguments) pair.
    "search_params": {
        "learning_rate": ("exp", [-1, -5, 10]),
        "weight_decay": ("choice", [0, 1e-1, 1e-2, 1e-3]),
        "batch_size": ("fixed", 64),
    },
    "optimization_type": "random",
    "data_dir": EXPERIMENT1_DIR,
    "output_dir": f"{RESULT_DIR}/final_results/hyperparameter_optimization/exp1/fold_1",
    # Training settings that are not part of the search.
    "fixed_cv_params": {
        "loss_function": MSELoss(),
        "device_name": "cuda",
        "num_epochs": 100,
    },
    # Model hyperparameters, given in the same (kind, arguments) form.
    "model_params": {
        "device": ("fixed", "cuda"),
        "hidden_size": ("choice", [32, 64, 128, 256]),
        "dropout": ("uniform", (0.1, 0.8)),
        "n_layers": ("choice", [1, 2, 3]),
    },
    "random_seed": None,
    "n_random_runs": 5,
    "output_type": OutputType.REGRESSION,
    "embedding": "glove-twitter-25",
    "metric_dict": get_regression_metrics(),
}

In [None]:
# Build the search from transformer_regression_optimization_parameters and run
# it; "trf_reg" is passed to run() as the output name.
nn_optimization = NNOptimization(**transformer_regression_optimization_parameters)
results = nn_optimization.run(output_name="trf_reg")

# Experiment 2

## Train / test split

In [None]:
# File name of the binned spike rates for session 20240708 (bin size 100).
spikerates_path = construct_spikerates_filename(
    session_id="20240708",
    path=f"{EXPERIMENT2_DIR}/binned_spikerates",
    bin_size=100,
    experiment="experiment2",
)

sentences_path = f"{EXPERIMENT2_DIR}/sentences_new.pkl"

# NOTE(review): np.load(..., allow_pickle=True) and pickle.load can execute
# arbitrary code while loading; only use them on files from a trusted source.
with open(spikerates_path, "rb") as file:
    spikerates = np.load(file, allow_pickle=True)

with open(sentences_path, "rb") as file:
    sentences = pickle.load(file)

In [None]:
# Settings of the experiment-2 split.
n_folds = 5
shuffle = True
random_state = None

kfold = KFold(n_splits=n_folds, shuffle=shuffle, random_state=random_state)

# The zero array is only a placeholder with one entry per sentence.
split = kfold.split(np.zeros(len(sentences)), y=sentences)

# Convert the index arrays of every fold to plain lists.
train_test_split = [
    {"train_ids": fold_train.tolist(), "test_ids": fold_test.tolist()}
    for fold_train, fold_test in split
]

In [None]:
# JSON file that stores the experiment-2 folds; this rebinds the experiment-1
# path of the same name.
train_test_path = f"{EXPERIMENT2_DIR}/train_test_split/train_test_split.json"

In [None]:
# # uncomment this code to create a new train_test_split

# with open(train_test_path, "w") as file:
#     file.write(json.dumps(train_test_split, indent=4))

In [None]:
# Read the stored experiment-2 split back from its JSON file; this replaces
# the split generated in memory above.

with open(train_test_path, "r") as file:
    train_test_split = json.load(file)

In [None]:
# Use the training ids of the first stored fold for the single experiment-2
# session.
limit_to_ids = {"20240708": train_test_split[0]["train_ids"]}

## Tokenization

In [None]:
# Location of the stored token dictionary.
tokenizer_file_path = f"{EXPERIMENT2_DIR}/single_word_token_dict.json"
tokenizer = SingleWordTokenizer()

# Uncomment the two lines below to build a new token dictionary from the
# sentences and write it to disk. Tokenization should be deterministic; the
# file is kept only for convenience.
# tokenizer.token_dict_from_samples(sentences)
# tokenizer.token_dict_to_file(tokenizer_file_path)

# Load the token dictionary from the stored file.
tokenizer.token_dict_from_file(tokenizer_file_path)

## Classification

### RNN

In [None]:
# Random-search configuration for the RNN sequence classifier (experiment 2).
rnn_sequence_classification_optimization_parameters = {
    "model_name": "rnn",
    "task_name": "seq_clf",
    "experiment": "experiment2",
    "session_ids": ["20240708"],
    "limit_to_ids": limit_to_ids,
    "preprocessing_methods": ["mean_sequence"],
    "label_names": ["sentences"],
    "binning_params": [
        {"bin_size": 100, "blur_sd": None},
        {"bin_size": 100, "blur_sd": 2},
    ],
    "n_folds": 5,
    "n_repeats": 1,
    # Each entry is a (sampling kind, arguments) pair.
    "search_params": {
        "learning_rate": ("exp", [-1, -5, 10]),
        "weight_decay": ("choice", [0, 1e-1, 1e-2, 1e-3]),
        "batch_size": ("fixed", 128),
    },
    "optimization_type": "random",
    "data_dir": f"{EXPERIMENT2_DIR}/binned_spikerates",
    "output_dir": f"{RESULT_DIR}/final_results/hyperparameter_optimization/exp2/fold_1",
    # Training settings that are not part of the search.
    "fixed_cv_params": {
        "loss_function": cross_entropy,
        "device_name": "cuda",
        "num_epochs": 100,
    },
    # Single-element "choice" lists pin a hyperparameter to one value.
    "model_params": {
        "n_labels": ("choice", [tokenizer.n_labels]),
        "encoder_n_layers": ("choice", [1, 2, 3]),
        "decoder_n_layers": ("choice", [1]),
        "encoder_hidden_size": ("choice", [32, 64, 128]),
        "decoder_hidden_size": ("choice", [None]),
        "encoder_dropout": ("uniform", (0.1, 0.8)),
        "decoder_dropout": ("uniform", (0.1, 0.8)),
        "device": ("choice", ["cuda"]),
    },
    "random_seed": None,
    "n_random_runs": 20,
    "output_type": OutputType.CLASSIFICATION,
    "metric_dict": get_sequence_classification_metrics(),
}

In [None]:
# Build the search from rnn_sequence_classification_optimization_parameters and
# run it; "rnn_seq_clf" is passed to run() as the output name.
nn_optimization = NNOptimization(**rnn_sequence_classification_optimization_parameters)
results = nn_optimization.run(output_name="rnn_seq_clf")

### Transformer

In [None]:
# Random-search configuration for the transformer sequence classifier
# (experiment 2).
transformer_sequence_classification_optimization_parameters = {
    "model_name": "trf",
    "task_name": "seq_clf",
    "experiment": "experiment2",
    "session_ids": ["20240708"],
    "limit_to_ids": limit_to_ids,
    "preprocessing_methods": ["mean_sequence"],
    "label_names": ["sentences"],
    "binning_params": [
        {"bin_size": 100, "blur_sd": None},
        {"bin_size": 100, "blur_sd": 2},
    ],
    "n_folds": 5,
    "n_repeats": 1,
    # Each entry is a (sampling kind, arguments) pair.
    "search_params": {
        "learning_rate": ("exp", [-1, -5, 10]),
        "weight_decay": ("choice", [0, 1e-1, 1e-2, 1e-3]),
        "batch_size": ("fixed", 64),
    },
    "optimization_type": "random",
    "data_dir": f"{EXPERIMENT2_DIR}/binned_spikerates",
    "output_dir": f"{RESULT_DIR}/final_results/hyperparameter_optimization/exp2/fold_1",
    # Training settings that are not part of the search.
    "fixed_cv_params": {
        "loss_function": cross_entropy,
        "device_name": "cuda",
        "num_epochs": 100,
    },
    # Single-element "choice" lists pin a hyperparameter to one value.
    "model_params": {
        # "n_labels" is deliberately left disabled here:
        # "n_labels": ("choice", [tokenizer.n_labels]),
        "encoder_n_layers": ("choice", [1, 2, 3]),
        "decoder_n_layers": ("choice", [1]),
        "hidden_size": ("choice", [32, 64, 128]),
        "dropout": ("uniform", (0.1, 0.8)),
        "device": ("choice", ["cuda"]),
    },
    "random_seed": None,
    "n_random_runs": 20,
    "output_type": OutputType.CLASSIFICATION,
    "metric_dict": get_sequence_classification_metrics(),
}

In [None]:
# Build the search from the transformer sequence-classification configuration
# and run it; "trf_seq_clf" is passed to run() as the output name.
nn_optimization = NNOptimization(
    **transformer_sequence_classification_optimization_parameters
)
results = nn_optimization.run(output_name="trf_seq_clf")

## Regression

### RNN

In [None]:
# Random-search configuration for the RNN sequence regression model
# (experiment 2).
# NOTE(review): num_epochs=1, n_random_runs=1 and an empty metric_dict look
# like smoke-test settings -- confirm before writing to the final_results
# directory.
rnn_sequence_regression_optimization_parameters = {
    "model_name": "rnn",
    "task_name": "seq_reg",
    "experiment": "experiment2",
    "session_ids": ["20240708"],
    "limit_to_ids": limit_to_ids,
    "preprocessing_methods": ["mean_sequence"],
    "label_names": ["sentences"],
    "binning_params": [
        {"bin_size": 100, "blur_sd": None},
        {"bin_size": 100, "blur_sd": 2},
    ],
    "n_folds": 5,
    "n_repeats": 1,
    # Each entry is a (sampling kind, arguments) pair.
    "search_params": {
        "learning_rate": ("exp", [-1, -5, 10]),
        "weight_decay": ("choice", [0, 1e-1, 1e-2, 1e-3]),
        "batch_size": ("fixed", 64),
    },
    "optimization_type": "random",
    "data_dir": f"{EXPERIMENT2_DIR}/binned_spikerates",
    "output_dir": f"{RESULT_DIR}/final_results/hyperparameter_optimization/exp2/fold_1",
    # Training settings that are not part of the search.
    "fixed_cv_params": {
        "loss_function": MSELoss(),
        "device_name": "cuda",
        "num_epochs": 1,
    },
    # Single-element "choice" lists pin a hyperparameter to one value.
    "model_params": {
        "encoder_n_layers": ("choice", [3]),
        "decoder_n_layers": ("choice", [1]),
        "encoder_hidden_size": ("choice", [128]),
        "decoder_hidden_size": ("choice", [None]),
        "encoder_dropout": ("uniform", (0.1, 0.8)),
        "decoder_dropout": ("uniform", (0.1, 0.8)),
        "device": ("choice", ["cuda"]),
    },
    # "random_seed" was listed twice in this literal; a single entry yields
    # the same dict because both values were None.
    "random_seed": None,
    "output_type": OutputType.REGRESSION,
    "embedding": "glove-twitter-25",
    "n_random_runs": 1,
    "metric_dict": {},
}

In [None]:
# Build the search from rnn_sequence_regression_optimization_parameters and run
# it; "rnn_seq_reg" is passed to run() as the output name.
nn_optimization = NNOptimization(**rnn_sequence_regression_optimization_parameters)
results = nn_optimization.run(output_name="rnn_seq_reg")

### Transformer

In [None]:
# Random-search configuration for the transformer sequence regression model
# (experiment 2).
# NOTE(review): num_epochs=1, n_random_runs=1 and an empty metric_dict look
# like smoke-test settings -- confirm before writing to the final_results
# directory.
transformer_sequence_regression_optimization_parameters = {
    "model_name": "trf",
    "task_name": "seq_reg",
    "experiment": "experiment2",
    "session_ids": ["20240708"],
    "limit_to_ids": limit_to_ids,
    "preprocessing_methods": ["mean_sequence"],
    "label_names": ["sentences"],
    "binning_params": [
        {"bin_size": 100, "blur_sd": None},
        {"bin_size": 100, "blur_sd": 2},
    ],
    "n_folds": 5,
    "n_repeats": 1,
    # Each entry is a (sampling kind, arguments) pair.
    "search_params": {
        "learning_rate": ("exp", [-1, -5, 10]),
        "weight_decay": ("choice", [0, 1e-1, 1e-2, 1e-3]),
        "batch_size": ("choice", [64]),
    },
    "optimization_type": "random",
    "data_dir": f"{EXPERIMENT2_DIR}/binned_spikerates",
    "output_dir": f"{RESULT_DIR}/final_results/hyperparameter_optimization/exp2/fold_1",
    # Training settings that are not part of the search.
    "fixed_cv_params": {
        "loss_function": MSELoss(),
        "device_name": "cuda",
        "num_epochs": 1,
    },
    # Model hyperparameters, given in the same (kind, arguments) form.
    "model_params": {
        "encoder_n_layers": ("choice", [1, 2]),
        "decoder_n_layers": ("choice", [1]),
        "hidden_size": ("choice", [32, 64, 128]),
        "dropout": ("uniform", (0.1, 0.8)),
        "device": ("fixed", "cuda"),
    },
    "random_seed": None,
    "output_type": OutputType.REGRESSION,
    "embedding": "glove-twitter-25",
    "n_random_runs": 1,
    "metric_dict": {},
}

In [None]:
# %%timeit -n 1 -r 1
# (timing magic left disabled; to use it, it must be the first line of the cell)
# Build the search from the transformer sequence-regression configuration and
# run it; "trf_seq_reg" is passed to run() as the output name.
nn_optimization = NNOptimization(
    **transformer_sequence_regression_optimization_parameters
)
results = nn_optimization.run(output_name="trf_seq_reg")