In [54]:
%pip install scikit-learn optuna matplotlib seaborn pandas tables tabulate

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0

[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python -m pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.


In [55]:
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import matthews_corrcoef
import optuna
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import tabulate

# Data Load

In [6]:
import re
from glob import glob


def _natural_key(path):
    """Sort key that orders embedded numbers numerically (part_2 before part_10)."""
    return [int(tok) if tok.isdigit() else tok for tok in re.split(r"(\d+)", path)]


# glob() returns matches in arbitrary, filesystem-dependent order. The row order
# of X, y and t below follows the concatenation order, so sort the parts to make
# the loaded data reproducible from run to run.
part_paths = sorted(
    glob("data/ID18150/Day2/DataFrame_Imaging_dFF_18150_day2_part_*.h5"),
    key=_natural_key,
)
if not part_paths:
    # pd.concat([]) would fail with a much less helpful "No objects to concatenate".
    raise FileNotFoundError(
        "No files match data/ID18150/Day2/DataFrame_Imaging_dFF_18150_day2_part_*.h5"
    )

# Read every part and stack them row-wise into one frame.
frames = [pd.read_hdf(path) for path in part_paths]
df = pd.concat(frames)

# Feature columns are the ones whose *label* is an integer. NumPy integer labels
# are accepted as well as plain ints; bool labels are excluded, as before.
numerical_columns = [
    col for col in df.columns
    if isinstance(col, (int, np.integer)) and not isinstance(col, bool)
]

# Feature matrix
X = df[numerical_columns].values
t = df.Time.to_list()
# Drop the concatenated index on y: integer-array indexing on a Series with an
# integer index is label-based, and the per-part labels need not be unique or
# equal to the row position. A fresh 0..n-1 index makes y[i] refer to the same
# row as X[i].
y = df['Pump'].astype(int).reset_index(drop=True)
n_features = len(numerical_columns)

# Modelling

In [52]:
n_folds = 10


def objective(trial):
    """Optuna objective: mean cross-validated MCC of a PCA + cosine-kNN classifier.

    For every one of the ``n_folds`` KFold splits of the module-level ``X``/``y``,
    a PCA is fitted on the training rows only, both training and test rows are
    projected, a cosine-metric k-nearest-neighbours classifier is fitted on the
    projected training rows, and the Matthews correlation coefficient of its
    predictions on the test rows is recorded.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the two hyperparameters:
        ``n_components`` in [5 %, 50 %] of ``n_features`` (at least 1) and
        ``n_neighbors`` in [3, 50].

    Returns
    -------
    float
        Mean MCC over the folds (the study maximises this).

    Side effects
    ------------
    Prints a 5-bin histogram (counts row, bin-edges row) of the per-fold MCCs.
    """
    # Hyperparameters are constant for the whole trial, so sample them once
    # instead of on every fold.
    # PCA needs at least one component, hence the clamp for small n_features.
    n_components = trial.suggest_int(
        'n_components',
        max(1, int(0.05 * n_features)),
        max(1, int(0.5 * n_features)),
    )
    n_neighbors = trial.suggest_int('n_neighbors', 3, 50)

    # KFold yields *positional* row indices. If y is a pandas Series, y[idx]
    # would look rows up by index label instead, so index a plain array.
    labels = np.asarray(y)

    kf = KFold(n_splits=n_folds)
    mcc_acc = []
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = labels[train_idx], labels[test_idx]

        # Fit the projection on the training fold only to avoid leaking test data.
        pca = PCA(n_components=n_components)
        X_train_pca = pca.fit_transform(X_train)
        X_test_pca = pca.transform(X_test)

        clf = KNeighborsClassifier(n_neighbors=n_neighbors, metric='cosine', n_jobs=-1)
        clf.fit(X_train_pca, y_train)
        y_pred = clf.predict(X_test_pca)

        mcc_acc.append(matthews_corrcoef(y_test, y_pred))

    hist = np.histogram(mcc_acc, bins=5)
    # The notebook does `import tabulate`, which binds the module; the callable
    # is tabulate.tabulate.
    print(tabulate.tabulate(hist))
    return np.mean(mcc_acc)

In [53]:
# Open the study kept in the SQLite file; because load_if_exists is True, an
# existing study with this name is reused rather than recreated. Then run 100
# trials of `objective`, maximising its return value.
study = optuna.create_study(
    study_name="pca_optimization_cosine",
    storage="sqlite:///data/ID18150/Day2/pca_classifier.db",
    load_if_exists=True,
    direction='maximize',
)
study.optimize(objective, n_trials=100)

[I 2024-11-07 01:41:38,785] Using an existing study with name 'pca_optimization_cosine' instead of creating a new one.


[I 2024-11-07 01:43:46,917] Trial 24 finished with value: 0.014992513825737774 and parameters: {'n_components': 221, 'n_neighbors': 2}. Best is trial 6 with value: 0.03912905172227376.


4	2	1	1	2
-0.04201782395667886	-0.012312027566406737	0.017393768823865383	0.047099565214137495	0.07680536160440962	0.10651115799468175


[W 2024-11-07 01:44:09,877] Trial 25 failed with parameters: {'n_components': 283, 'n_neighbors': 7} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/python/3.10.13/lib/python3.10/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_2724/3076228011.py", line 14, in objective
    y_pred = clf.predict(X_test_pca)
  File "/usr/local/python/3.10.13/lib/python3.10/site-packages/sklearn/neighbors/_classification.py", line 271, in predict
    neigh_ind = self.kneighbors(X, return_distance=False)
  File "/usr/local/python/3.10.13/lib/python3.10/site-packages/sklearn/neighbors/_base.py", line 886, in kneighbors
    chunked_results = list(
  File "/usr/local/python/3.10.13/lib/python3.10/site-packages/sklearn/metrics/pairwise.py", line 2181, in pairwise_distances_chunked
    D_chunk = reduce_func(D_chunk, sl.start)
  File "/usr/local/python/3.10.13/lib/python3.10/site-p

KeyboardInterrupt: 