# Chronotype Classification

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np

from biopsykit.classification.utils import prepare_df_sklearn, split_train_test

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA, KernelPCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import LeaveOneGroupOut, StratifiedKFold, GroupKFold, GridSearchCV, cross_validate
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


from tqdm.notebook import tqdm

%matplotlib widget
%load_ext autoreload
%autoreload 2

In [None]:
# Folder holding the exported feature tables
export_path = Path("../../exports")
# Folder for classification results; created together with missing parents
result_path = Path("../../results/classification")
result_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the exported feature table; rows are indexed by (subject, night)
features_file = export_path / "chronotype_features_complete.csv"
data = pd.read_csv(features_file, index_col=['subject', 'night'])
data.head()

In [None]:
# Show the column names of the loaded feature table
data.columns

### Data Selection

Drop Label '2' (Evening Type)

In [None]:
# Use 'within_ideal_bed_time' as the class label and append it to the row index
data = (
    data
    .rename(columns={'within_ideal_bed_time': 'label'})
    .set_index('label', append=True)
)

In [None]:
#data = data.drop(0.0, level='label')

## Functions

In [None]:
def evaluate_model(X, y, groups, params, pipeline, scoring='f1', n_splits=5):
    """Evaluate ``pipeline`` with a nested, group-aware cross-validation.

    The outer loop splits the data with ``GroupKFold``. For every outer
    training fold a ``GridSearchCV`` over ``params`` is fitted, using
    ``LeaveOneGroupOut`` splits of that training fold as the inner
    cross-validation. The fitted search is then evaluated on the outer
    test fold.

    Parameters
    ----------
    X, y, groups
        Features, labels and group assignment. They are passed to the
        splitters and to ``split_train_test`` to obtain the per-fold subsets.
    params
        Parameter grid forwarded to ``GridSearchCV`` as ``param_grid``.
    pipeline
        Estimator forwarded to ``GridSearchCV``.
    scoring
        Scoring forwarded to ``GridSearchCV`` (default ``'f1'``).
    n_splits
        Number of outer ``GroupKFold`` splits (default ``5``).

    Returns
    -------
    dict
        Lists with one entry per outer fold under the keys ``'cv_results'``
        (``grid.cv_results_``), ``'test_score'`` (``grid.score`` on the outer
        test fold), ``'best_estimator'`` (``grid.best_estimator_``) and
        ``'conf_matrix'`` (confusion matrix of the outer test fold, computed
        with ``normalize='true'``).
    """
    results_dict = {key: [] for key in ['cv_results', 'test_score', 'best_estimator', 'conf_matrix']}
    cv = LeaveOneGroupOut()
    outer_cv = GroupKFold(n_splits=n_splits)
    plt.ioff()
    try:
        # Materialise the outer splits so that tqdm knows the number of folds
        for train, test in tqdm(list(outer_cv.split(X, y, groups))):
            X_train, X_test, y_train, y_test, groups_train, groups_test = split_train_test(X, y, train, test, groups)
            grid = GridSearchCV(
                pipeline,
                param_grid=params,
                cv=cv.split(X_train, y_train, groups_train),
                scoring=scoring,
                n_jobs=-1,
                verbose=True,
            )
            grid.fit(X_train, y_train)

            conf_matrix = confusion_matrix(y_test, grid.predict(X_test), normalize='true')
            results_dict['test_score'].append(grid.score(X_test, y_test))
            results_dict['cv_results'].append(grid.cv_results_)
            results_dict['best_estimator'].append(grid.best_estimator_)
            results_dict['conf_matrix'].append(conf_matrix)
    finally:
        # Re-enable interactive plotting even when a fold raises
        plt.ion()

    return results_dict

## Classification

In [None]:
# Unpack the outputs of prepare_df_sklearn; X, y and groups are the inputs
# passed to evaluate_model in the classification cells below.
X, y, groups, group_keys = prepare_df_sklearn(data, print_summary=True)
#y = y - 1
#y

In [None]:
# Project the min-max scaled features onto their first two principal components
pca = PCA(n_components=2)
X_scale = MinMaxScaler().fit_transform(X)
X_trans = pca.fit_transform(X_scale)

In [None]:
# Wrap the PCA-transformed features in a dataframe that uses y as its index
df_test = pd.DataFrame(X_trans, index=y)

In [None]:
# Pairwise plots of the PCA components, coloured by the 'index' column that
# reset_index() creates (assumes the index built from y is unnamed - TODO confirm)
g = sns.pairplot(data=df_test.reset_index(), hue='index')

### kNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# kNN pipeline; the 'reduce_dim' step is a placeholder that the parameter grid fills in
pipeline_knn = Pipeline(
    steps=[
        ('scale', MinMaxScaler()),
        ('reduce_dim', 'passthrough'),
        ('clf', KNeighborsClassifier()),
    ]
)

In [None]:
# Candidate values for the dimensionality-reduction step and the classifier
N_COMPONENTS_OPTIONS = [2, 4, 8, None]
K_OPTIONS = [2, 4, 8, 'all']
N_NEIGHBORS_OPTIONS = np.arange(start=3, stop=12, step=2)

# Two alternative grids: PCA or univariate feature selection in front of kNN
params_knn = [
    dict(
        reduce_dim=[PCA()],
        reduce_dim__n_components=N_COMPONENTS_OPTIONS,
        clf__n_neighbors=N_NEIGHBORS_OPTIONS,
    ),
    dict(
        reduce_dim=[SelectKBest()],
        reduce_dim__k=K_OPTIONS,
        clf__n_neighbors=N_NEIGHBORS_OPTIONS,
    ),
]

In [None]:
# Run evaluate_model for the kNN pipeline and its grid, scored with 'f1'
results_dict_knn = evaluate_model(X, y, groups, params_knn, pipeline_knn, 'f1')

In [None]:
# Average of the kNN test scores collected by evaluate_model
mean_score = np.mean(results_dict_knn['test_score'])
mean_score

In [None]:
# Plot the first confusion matrix stored in the kNN results
fig, ax = plt.subplots()
first_conf_matrix = results_dict_knn['conf_matrix'][0]
ConfusionMatrixDisplay(first_conf_matrix).plot(ax=ax, cmap=plt.cm.Blues)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision-tree pipeline: min-max scaling followed by the classifier
pipeline_dt = Pipeline(
    steps=[
        ('scale', MinMaxScaler()),
        ('clf', DecisionTreeClassifier()),
    ]
)

In [None]:
# Search space for the decision tree; None as max_depth is appended to the
# integer candidates 3..9
params_dt = dict(
    clf__criterion=['entropy'],
    clf__max_depth=np.append(np.arange(3, 10), None),
    clf__min_samples_split=np.arange(0.01, 0.1, 0.02),
    clf__min_samples_leaf=np.arange(0.01, 0.05, 0.01),
    clf__max_features=[None, 'log2'],
)

In [None]:
# Run evaluate_model for the decision-tree pipeline and its grid, scored with 'f1'
results_dict_dt = evaluate_model(X, y, groups, params_dt, pipeline_dt, 'f1')

In [None]:
# Average of the decision-tree test scores collected by evaluate_model
mean_score = np.mean(results_dict_dt['test_score'])
mean_score

### SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# SVM pipeline; the 'reduce_dim' step is a placeholder that the parameter grid fills in
pipeline_svm = Pipeline(
    steps=[
        ('scale', MinMaxScaler()),
        ('reduce_dim', 'passthrough'),
        ('clf', SVC()),
    ]
)

In [None]:
# Candidate values for the SVM grid search
C_OPTIONS = np.logspace(start=-4, stop=4, num=9)
# Build a plain list: np.append(<float array>, 'scale') would coerce every
# entry to a string (e.g. '0.0001'), which SVC does not accept as gamma.
GAMMA_OPTIONS = np.logspace(start=-4, stop=3, num=8).tolist() + ['scale']
N_COMPONENTS_OPTIONS = [2]  # further candidates currently disabled: 4, 8, None
K_OPTIONS = [2, 4, 8, 'all']

# Search space for the SVM pipeline: both grids use PCA and differ in the kernel
params_svm = [
    # Linear kernel: only C is tuned
    dict(
        reduce_dim=[PCA()],
        reduce_dim__n_components=N_COMPONENTS_OPTIONS,
        clf__C=C_OPTIONS,
        clf__kernel=['linear'],
    ),
    # RBF kernel: C and gamma are tuned
    dict(
        reduce_dim=[PCA()],
        reduce_dim__n_components=N_COMPONENTS_OPTIONS,
        clf__C=C_OPTIONS,
        clf__gamma=GAMMA_OPTIONS,
        clf__kernel=['rbf'],
    ),
    # Disabled alternative using feature selection instead of PCA:
    #dict(
    #    reduce_dim=[SelectKBest()],
    #    reduce_dim__k=K_OPTIONS,
    #    clf__C=C_OPTIONS,
    #    clf__gamma=GAMMA_OPTIONS,
    #    clf__kernel=['linear', 'rbf'],
    #)
]

In [None]:
from sklearn import set_config
set_config(display='diagram')   
# diplays HTML representation in a jupyter context
results_dict_svm['best_estimator'][0]

In [None]:
results_dict_svm = evaluate_model(X, y, groups, params_svm, pipeline_svm, 'f1')

In [None]:
# Average of the SVM test scores collected by evaluate_model
mean_score = np.mean(results_dict_svm['test_score'])
mean_score