In [None]:
import __init__
import os
import numpy as np
import pandas as pd
import yaml
import joblib
from tqdm import tqdm
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from datapath_manager import DataPathManager, ITWDataPathManager
from trainers import MachineLearningModelTrainer, BranchNeuralNetworkTrainer
from dataloader import EmbeddingDataLoader
from evaluators import Evaluator

In [None]:
# Target participant and the recording days that are loaded for the in-the-wild cells below.
user_id = 'ltkhiem'
dates = [
    '2022-10-16',
    '2022-10-17',
    '2022-10-18',
    '2022-10-20',
    '2022-10-21',
    '2022-10-22',
]

In [None]:
# Per-user folder that receives the model bundles written by the joblib.dump cells below.
model_path = os.path.abspath(f'./models/{user_id}')
# joblib.dump does not create missing parent folders, so make sure the folder
# exists when the notebook is run on a fresh checkout.
os.makedirs(model_path, exist_ok=True)

# Lab-based model for the targeted user

In [None]:
# Name handed to DataPathManager to locate the lab dataset.
lab_dataset_name = 'DCU_NVT_EXP2'
# Windowing parameters of the pre-extracted features; they are only used here to
# resolve the feature path (units are not visible in this notebook — TODO confirm).
window_size = 60
window_shift = 0.25
# Sensor combination; selects both the feature path and the deep-model YAML config.
signal_type = 'bvp_eda_temp'

In [None]:
# Resolve the folder that holds this user's pre-extracted lab features; the label
# and task-id arrays are read from the same folder as the per-signal feature files.
dp_manager = DataPathManager(lab_dataset_name)
lab_feature_path = dp_manager.get_feature_path(user_id, signal_type, window_size, window_shift)
lab_feature_folder_path = os.path.dirname(lab_feature_path)
ground_truth_path, tasks_index_path = (
    os.path.join(lab_feature_folder_path, file_name)
    for file_name in ('ground_truth.npy', 'tasks_index.npy')
)

In [None]:
# Load one feature matrix per signal and stack them column-wise (axis=1), so every
# row carries the features of all three signals.
signals = ['bvp', 'eda', 'temp']
lab_features = np.concatenate(
    [np.load(os.path.join(lab_feature_folder_path, f'{signal}.npy')) for signal in signals],
    axis=1,
)
# Per-row task ids (used as groups for the split below) and per-row labels.
tasks_index = np.load(tasks_index_path)
ground_truth = np.load(ground_truth_path)

In [None]:
# Split train/test lab-based model
def split_train_test(indices, test_size: float = 0.3):
    """
    Split a sequence of indices into a leading train part and a trailing test part.

    The first ``(1 - test_size)`` share of ``indices`` (rounded down) becomes the
    train part and the remainder becomes the test part; the order of the indices is
    preserved.
    NOTE: When ``indices`` are ordered in time, this simulates the real-life situation
    where the test data is the segment that is recorded after the train data.

    Args:
        indices: 1-D array-like of indices (NumPy array, list, tuple, ...).
        test_size: Fraction of ``indices`` assigned to the test part, in [0, 1].

    Returns:
        Tuple ``(train_indices, test_indices)`` of plain Python lists.

    Raises:
        ValueError: If ``test_size`` is not within [0, 1].
    """
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f'test_size must be within [0, 1], got {test_size!r}')
    # Accept any array-like, not only ndarrays: a sliced list has no .tolist().
    indices = np.asarray(indices)
    cut_point = int((1 - test_size) * len(indices))
    train_indices = indices[:cut_point].tolist()
    test_indices = indices[cut_point:].tolist()
    return train_indices, test_indices


TEST_SIZE = 0.2
VALID_SIZE = 0.1
indices = np.arange(lab_features.shape[0])  # Row positions of the lab-based features
train_indices, valid_indices, test_indices = [], [], []
# Only the held-out fold of each LeaveOneGroupOut split is used: it holds the rows of
# a single task, so every task is split into train/valid/test on its own.
for _, task_rows in LeaveOneGroupOut().split(indices, y=None, groups=tasks_index):
    # Tail of the task -> test; tail of what remains -> validation.
    fit_and_valid_part, held_out_part = split_train_test(indices[task_rows], test_size=TEST_SIZE)
    fit_part, valid_part = split_train_test(indices[fit_and_valid_part], test_size=VALID_SIZE)
    train_indices.extend(fit_part)
    valid_indices.extend(valid_part)
    test_indices.extend(held_out_part)

### Train Deep-Fusion Model

In [None]:
# Materialise the three lab splits from the index lists built in the split cell.
X_train = lab_features[train_indices]
y_train = ground_truth[train_indices]
X_valid = lab_features[valid_indices]
y_valid = ground_truth[valid_indices]
X_test = lab_features[test_indices]
y_test = ground_truth[test_indices]

In [None]:
# Wrap each lab split in the project's EmbeddingDataLoader; these objects are what
# the BranchNeuralNetworkTrainer train/predict calls below receive.
train_dataloader = EmbeddingDataLoader(X_train, y_train)
validate_dataloader = EmbeddingDataLoader(X_valid, y_valid)
test_dataloader = EmbeddingDataLoader(X_test, y_test)

In [None]:
# Load deep model configuration
# Path handed to the trainer as the location of the lab deep-fusion model file.
user_model_saved_path = os.path.join(model_path, 'lab_deep_fusion.pth')
# The model configs live under the PARENT of the current working directory.
config_path = os.path.join(
    os.path.dirname(os.getcwd()),
    'models', 'model_config',
    f'branchnn_sensor_combination_{signal_type}.yaml'
)
# Read the YAML inside a context manager so the file handle is closed right away
# instead of being left for the garbage collector.
with open(config_path, 'r') as config_file:
    config_dict = yaml.safe_load(config_file)

In [None]:
# Deep-fusion trainer for the lab model. Positional arguments are, in order: a log
# file path, the model file path and the parsed YAML config (parameter names are
# defined in the project's `trainers` module — not visible here).
df_clf = BranchNeuralNetworkTrainer('./logs.txt', 
    user_model_saved_path, 
    config_dict, 
    target_metrics=['balanced_accuracy', 'f1'],
)

In [None]:
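# NOTE: lab training is disabled here, so the next cell predicts with whatever weights
# `df_clf` holds at that point (presumably loaded from `user_model_saved_path` — confirm).
# Uncomment the line below to retrain the lab deep-fusion model.
# df_clf.train(train_dataloader, validate_dataloader, num_epochs=1000)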

In [None]:
# Score the deep-fusion model on the lab test split and print the Evaluator result.
y_pred = df_clf.predict(test_dataloader)
print(Evaluator().evaluate(y_test, y_pred))

### Train ExtraTreesClassifier Model

In [None]:
# The validation rows are folded into the training set for this model; the index
# lists are concatenated first so each array is gathered in a single indexing step.
train_valid_indices = train_indices + valid_indices
X_train = lab_features[train_valid_indices]
y_train = ground_truth[train_valid_indices]
X_test = lab_features[test_indices]
y_test = ground_truth[test_indices]

In [None]:
# Extra Trees configuration for the lab model.
et_clf = ExtraTreesClassifier(
    # Forest size and tree shape
    n_estimators=500,
    max_depth=8,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=8,
    # Out-of-bag scoring is only available when bootstrap sampling is enabled
    bootstrap=True,
    oob_score=True,
    # Re-weight classes inversely to their frequency
    class_weight='balanced',
    # Use all cores; fixed seed for repeatable forests
    n_jobs=-1,
    random_state=0,
)

In [None]:
# Fit on the combined train + validation rows prepared in the previous cell.
et_clf.fit(X_train, y_train)

In [None]:
# Score the Extra Trees model on the lab test split and print the Evaluator result.
y_pred = et_clf.predict(X_test)
print(Evaluator().evaluate(y_test, y_pred))

In [None]:
# Persist the model in the same {'model', 'scaler'} bundle layout as the logistic
# regression model; Extra Trees is trained on unscaled features, hence scaler=None.
lab_et_bundle_path = os.path.join(model_path, 'lab_et_clf.pkl')
data = dict(model=et_clf, scaler=None)
joblib.dump(data, lab_et_bundle_path)

### Train Logistic Regression Model

In [None]:
# Standardise features for logistic regression: the scaler is fitted on the
# train + validation rows only and then applied unchanged to the test rows.
std_scaler = StandardScaler()
train_valid_indices = train_indices + valid_indices
X_train = std_scaler.fit_transform(lab_features[train_valid_indices])
y_train = ground_truth[train_valid_indices]
X_test = std_scaler.transform(lab_features[test_indices])
y_test = ground_truth[test_indices]

In [None]:
# Logistic regression for the lab model, fitted on standardised features.
lr_classifier = LogisticRegression(
    solver='saga',
    max_iter=50000,  # generous iteration budget for the saga solver
    class_weight='balanced',
    n_jobs=-1,
    random_state=0,
)

In [None]:
# Fit on the standardised train + validation rows prepared above.
lr_classifier.fit(X_train, y_train)

In [None]:
# Score the logistic regression model on the (already standardised) lab test split.
y_pred = lr_classifier.predict(X_test)
print(Evaluator().evaluate(y_test, y_pred))

In [None]:
# Persist the classifier together with its fitted scaler so both can be restored
# from one file when the model is applied to new data.
lab_lr_bundle_path = os.path.join(model_path, 'lab_lr_clf.pkl')
data = dict(model=lr_classifier, scaler=std_scaler)
joblib.dump(data, lab_lr_bundle_path)

# In-the-wild model for the targeted user

In [None]:
# Dataset name handed to ITWDataPathManager inside get_features_and_labels.
itw_dataset_name = 'DCU_EXP2_ITW'

In [None]:
def get_features_and_labels(user_id: str, date: str, num_features: int = 72):
    """
    Load the in-the-wild feature matrix and labels of one user for one day.

    Reads ``X.npy`` and ``y.npy`` from ``<dataset>/features/<user_id>/<date>``,
    where ``<dataset>`` is resolved through ``ITWDataPathManager`` for the
    module-level ``itw_dataset_name``.

    Args:
        user_id: Name of the user's sub-folder.
        date: Name of the day's sub-folder.
        num_features: Number of leading feature columns to keep. The default of 72
            preserves the previously hard-coded value (presumably the width of the
            lab feature matrix — TODO confirm).

    Returns:
        Tuple ``(feat, gt)``: the feature matrix with NaN/inf replaced by finite
        numbers and cut to the first ``num_features`` columns, and the labels
        exactly as stored on disk.
    """
    dataset_path = ITWDataPathManager(itw_dataset_name).get_dataset_path()

    user_date_feature_path = os.path.join(dataset_path, 'features', user_id, date)
    feature_path = os.path.join(user_date_feature_path, 'X.npy')
    gt_path = os.path.join(user_date_feature_path, 'y.npy')

    # Clean non-finite values, then keep only the leading feature columns.
    feat = np.nan_to_num(np.load(feature_path))[:, :num_features]
    gt = np.load(gt_path)
    return feat, gt

In [None]:
# Load data
feat_dates = [get_features_and_labels(user_id, date) for date in dates]
for x in feat_dates:
    print(Counter(x[1]))

## 1. Apply lab-based model to in-the-wild data

In [None]:
# Stack all days row-wise: the lab-trained models are evaluated on every ITW day.
X_test_itw = np.concatenate(
    [day_features for day_features, _day_labels in feat_dates],
    axis=0,
)
y_test_itw = np.concatenate(
    [day_labels for _day_features, day_labels in feat_dates],
    axis=0,
)

In [None]:
# Dataloader over ALL in-the-wild days. NOTE: this name is re-bound later in the
# fine-tuning section to a loader over the test days only.
itw_test_dataloader = EmbeddingDataLoader(X_test_itw, y_test_itw)

In [None]:
# Deep-fusion trainer for the in-the-wild model; it gets its own model file path and
# is pointed at the lab model file through `pretrained_model_path`.
itw_user_model_saved_path = os.path.abspath(os.path.join(model_path, 'itw_deep_fusion.pth'))
itw_df_clf = BranchNeuralNetworkTrainer(
    './logs.txt',
    itw_user_model_saved_path,
    config_dict,
    target_metrics=['balanced_accuracy', 'f1'],
    pretrained_model_path=user_model_saved_path,
)
# NOTE: joblib.load unpickles arbitrary objects — only load bundles written by this
# notebook (or another trusted source).
itw_et_clf = joblib.load(os.path.join(model_path, 'lab_et_clf.pkl'))['model']
# Read the logistic-regression bundle once; it holds both the model and its scaler.
lab_lr_bundle = joblib.load(os.path.join(model_path, 'lab_lr_clf.pkl'))
itw_lr_clf = lab_lr_bundle['model']
std_scaler = lab_lr_bundle['scaler']

In [None]:
# Lab-based Deep Fusion model applied to ITW data
print("--- Deep Fusion ---")
y_pred_itw = itw_df_clf.predict(itw_test_dataloader)
print(Evaluator().evaluate(y_test_itw, y_pred_itw))
print("--- Extra Trees ---")
y_pred_itw = itw_et_clf.predict(X_test_itw)
print(Evaluator().evaluate(y_test_itw, y_pred_itw))
print("--- Logistic Regression ---")
y_pred_itw = itw_lr_clf.predict(std_scaler.transform(X_test_itw))
print(Evaluator().evaluate(y_test_itw, y_pred_itw))

## 2. Fine-tune the lab-based model to adapt to the in-the-wild data

In [None]:
# Use the first 3 days for training
X_train = np.concatenate([x[0] for x in feat_dates[:3]])
y_train = np.concatenate([x[1] for x in feat_dates[:3]]).astype(int)
# Use the last 3 days for testing
X_test = np.concatenate([x[0] for x in feat_dates[3:]])
y_test = np.concatenate([x[1] for x in feat_dates[3:]]).astype(int)

### Fine-tune Deep-Fusion Model

In [None]:
# Dataloaders for fine-tuning. NOTE: `itw_test_dataloader` is re-bound here — from now
# on it covers only the test days, not all days as in section 1.
itw_train_dataloader = EmbeddingDataLoader(X_train, y_train)
itw_test_dataloader = EmbeddingDataLoader(X_test, y_test)

In [None]:
# Validate on a held-out slice of the TRAINING days instead of the test days.
# The second argument of `train` is the validation loader (see the lab training call);
# passing the test loader there lets the evaluation days influence fine-tuning
# (presumably through checkpoint selection on `target_metrics` — confirm in the trainer)
# and would make the test scores reported below optimistic.
# The last VALID_SIZE share of each training day is held out, mirroring the
# chronological split used for the lab data.
# NOTE(review): check that the held-out slices contain every class.
itw_fit_features, itw_fit_labels = [], []
itw_valid_features, itw_valid_labels = [], []
for day_features, day_labels in feat_dates[:3]:
    day_fit_rows, day_valid_rows = split_train_test(np.arange(len(day_labels)), test_size=VALID_SIZE)
    itw_fit_features.append(day_features[day_fit_rows])
    itw_fit_labels.append(day_labels[day_fit_rows])
    itw_valid_features.append(day_features[day_valid_rows])
    itw_valid_labels.append(day_labels[day_valid_rows])
itw_fit_dataloader = EmbeddingDataLoader(
    np.concatenate(itw_fit_features),
    np.concatenate(itw_fit_labels).astype(int),
)
itw_valid_dataloader = EmbeddingDataLoader(
    np.concatenate(itw_valid_features),
    np.concatenate(itw_valid_labels).astype(int),
)
itw_df_clf.train(itw_fit_dataloader, itw_valid_dataloader, num_epochs=1000)

In [None]:
# Score the fine-tuned deep-fusion model on the in-the-wild test days.
y_pred_itw = itw_df_clf.predict(itw_test_dataloader)
print(Evaluator().evaluate(y_test, y_pred_itw))

## 3. Re-train ML Model

### Re-train ExtraTreesClassifier Model

In [None]:
# Extra Trees trained from scratch on the in-the-wild training days.
itw_et_clf = ExtraTreesClassifier(
    # Forest size and tree shape
    n_estimators=500,
    max_depth=8,
    max_features='sqrt',
    # Float values are interpreted by scikit-learn as fractions of the sample count
    min_samples_split=0.005,
    min_samples_leaf=0.005,
    # Out-of-bag scoring is only available when bootstrap sampling is enabled
    bootstrap=True,
    oob_score=True,
    class_weight='balanced',
    n_jobs=-1,
    random_state=0,
)
itw_et_clf.fit(X_train, y_train)

In [None]:
# Score the re-trained Extra Trees model on the in-the-wild test days.
y_pred_itw = itw_et_clf.predict(X_test)
print(Evaluator().evaluate(y_test, y_pred_itw))

### Re-train Logistic Regression Model

In [None]:
# Fresh scaler for the in-the-wild model; it is fitted on the ITW training days in
# the next cell and is separate from the lab `std_scaler`.
scaler = StandardScaler()

In [None]:
# Logistic regression trained from scratch on the in-the-wild training days.
itw_lr_clf = LogisticRegression(
    solver='saga',
    max_iter=10000,
    class_weight='balanced',
    n_jobs=-1,
    random_state=0,
)
# Fit the scaler on the training days only, then fit the classifier on scaled features.
X_train_scaled = scaler.fit_transform(X_train)
itw_lr_clf.fit(X_train_scaled, y_train)

In [None]:
# Score the re-trained logistic regression on the test days, scaled with the scaler
# that was fitted on the training days.
y_pred_itw = itw_lr_clf.predict(scaler.transform(X_test))
print(Evaluator().evaluate(y_test, y_pred_itw))