In [1]:
import __init__
import os
import numpy as np
import pandas as pd
import yaml
import joblib
from tqdm import tqdm
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from datapath_manager import DataPathManager, ITWDataPathManager
from trainers import MachineLearningModelTrainer, BranchNeuralNetworkTrainer
from dataloader import EmbeddingDataLoader
from evaluators import Evaluator

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Participant whose lab and in-the-wild data (and saved models) this notebook uses.
user_id = 'ltkhiem'
# In-the-wild recording days. Order matters: a later cell trains on the first
# three entries and tests on the remaining ones.
dates = ['2022-10-16', '2022-10-17', '2022-10-18', '2022-10-20', '2022-10-21', '2022-10-22']

In [3]:
# Per-user model folder, resolved against the current working directory.
# All saved/loaded model files in this notebook live under this path.
model_path = os.path.abspath(f'./models/{user_id}')

# Lab-based model for the targeted user

In [4]:
# Dataset identifier handed to DataPathManager.
lab_dataset_name = 'DCU_NVT_EXP2'
# Windowing parameters forwarded to get_feature_path.
# presumably window length / stride of the feature extraction — TODO confirm units
window_size = 60
window_shift = 0.25
# Sensor combination; also selects the BranchNN YAML config file name further down.
signal_type = 'bvp_eda_temp'

In [5]:
# Locate the folder containing this user's lab feature arrays; the label and
# task-index files are stored next to them.
dp_manager = DataPathManager(lab_dataset_name)
user_feature_file = dp_manager.get_feature_path(user_id, signal_type, window_size, window_shift)
lab_feature_folder_path = os.path.dirname(user_feature_file)
ground_truth_path, tasks_index_path = (
    os.path.join(lab_feature_folder_path, file_name)
    for file_name in ('ground_truth.npy', 'tasks_index.npy')
)

In [6]:
# Load one feature matrix per signal and stack them column-wise, then load the
# labels and the per-row task ids saved in the same folder.
signals = ['bvp', 'eda', 'temp']
lab_features = np.concatenate(
    [np.load(os.path.join(lab_feature_folder_path, f'{name}.npy')) for name in signals],
    axis=1,
)
ground_truth = np.load(ground_truth_path)
tasks_index = np.load(tasks_index_path)

In [7]:
# Split the lab data into train / validation / test index sets (done separately for each task below)
def split_train_test(indices, test_size: float = 0.3):
    """
    Split an ordered run of indices into a leading train part and a trailing test part.

    The first ``int((1 - test_size) * len(indices))`` entries form the train
    split and the rest form the test split. Nothing is shuffled.

    NOTE: Keeping the order simulates the real-life situation where the test
    data is the segment recorded after the train data.

    Args:
        indices: 1-D sequence of indices (NumPy array, list, tuple or range).
        test_size: Fraction of ``indices`` assigned to the test split, in [0, 1].

    Returns:
        Tuple ``(train_indices, test_indices)`` of plain Python lists.

    Raises:
        ValueError: If ``test_size`` is not within [0, 1].
    """
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f'test_size must be within [0, 1], got {test_size!r}')
    # np.asarray lets callers pass lists/ranges as well as ndarrays while
    # keeping the ``.tolist()`` conversion to native Python ints.
    indices = np.asarray(indices)
    cut_point = int((1 - test_size) * len(indices))
    return indices[:cut_point].tolist(), indices[cut_point:].tolist()


TEST_SIZE = 0.2
VALID_SIZE = 0.1
# Row positions of the lab feature matrix.
indices = np.arange(lab_features.shape[0])
train_indices, valid_indices, test_indices = [], [], []
# LeaveOneGroupOut holds out one task id per fold; only the held-out rows are
# used, so each iteration handles the rows of exactly one task.
for _, task_rows in LeaveOneGroupOut().split(indices, y=None, groups=tasks_index):
    # Carve the test tail off the task first, then a validation tail off what is left.
    fit_part, test_part = split_train_test(indices[task_rows], test_size=TEST_SIZE)
    train_part, valid_part = split_train_test(indices[fit_part], test_size=VALID_SIZE)
    train_indices.extend(train_part)
    valid_indices.extend(valid_part)
    test_indices.extend(test_part)

### Train Deep-Fusion Model

In [8]:
# Materialise the feature/label arrays for each of the three index sets.
X_train = lab_features[train_indices]
y_train = ground_truth[train_indices]
X_valid = lab_features[valid_indices]
y_valid = ground_truth[valid_indices]
X_test = lab_features[test_indices]
y_test = ground_truth[test_indices]

In [9]:
# Wrap each partition in an EmbeddingDataLoader (train, validation, test, in that order).
train_dataloader, validate_dataloader, test_dataloader = (
    EmbeddingDataLoader(features, labels)
    for features, labels in ((X_train, y_train), (X_valid, y_valid), (X_test, y_test))
)

In [10]:
# Load deep model configuration
# Weights file for this user's lab Deep-Fusion model.
user_model_saved_path = os.path.join(model_path, 'lab_deep_fusion.pth')
# The YAML config lives under the PARENT of the working directory
# (../models/model_config), unlike model_path, which is under ./models.
config_path = os.path.join(
    os.path.dirname(os.getcwd()),
    'models', 'model_config',
    f'branchnn_sensor_combination_{signal_type}.yaml'
)
# Context manager so the config file handle is closed as soon as it is parsed.
with open(config_path, 'r') as config_file:
    config_dict = yaml.safe_load(config_file)

In [11]:
# Build the Deep-Fusion (branch neural network) trainer for the lab data.
# Positional arguments: './logs.txt' (looks like a log file path — confirm
# against the trainer), the .pth path of this user's lab model, and the
# configuration loaded from YAML.
df_clf = BranchNeuralNetworkTrainer('./logs.txt', 
    user_model_saved_path, 
    config_dict, 
    target_metrics=['balanced_accuracy', 'f1'],
)

LOAD PRETRAINED MODEL


In [12]:
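# Training is disabled for this run (constructing the trainer above printed "LOAD PRETRAINED MODEL").
# Uncomment the next line to train on the lab data:
# df_clf.train(train_dataloader, validate_dataloader, num_epochs=1000)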

In [13]:
# Score the lab Deep-Fusion model on the lab test partition.
y_pred = df_clf.predict(test_dataloader)
print(Evaluator().evaluate(y_test, y_pred))

{'accuracy': 0.7399913904433921, 'balanced_accuracy': 0.6873414337788578, 'precision': 0.7622950819672131, 'recall': 0.868, 'f1': 0.8117206982543641}


### Train ExtraTreesClassifier Model

In [14]:
# No separate validation set is used here, so train and validation rows are
# pooled (train rows first) for fitting.
fit_rows = train_indices + valid_indices
X_train = lab_features[fit_rows]
y_train = ground_truth[fit_rows]
X_test = lab_features[test_indices]
y_test = ground_truth[test_indices]

In [15]:
# Hyper-parameters of the lab ExtraTrees model, kept in one mapping so they
# can be inspected separately from the estimator.
et_params = {
    'n_estimators': 500,
    'random_state': 0,
    'n_jobs': -1,
    'max_features': 'sqrt',
    'max_depth': 8,
    'min_samples_split': 2,
    'min_samples_leaf': 8,
    'oob_score': True,
    'bootstrap': True,
    'class_weight': 'balanced',
}
et_clf = ExtraTreesClassifier(**et_params)

In [16]:
# Fit on the pooled train + validation lab rows prepared in the previous cells.
et_clf.fit(X_train, y_train)

In [17]:
# Score the lab ExtraTrees model on the lab test partition.
y_pred = et_clf.predict(X_test)
print(Evaluator().evaluate(y_test, y_pred))

{'accuracy': 0.8949634093844167, 'balanced_accuracy': 0.8887788578371811, 'precision': 0.9260515603799185, 'recall': 0.91, 'f1': 0.9179556153328849}


In [18]:
# Persist the model in the same {'model', 'scaler'} layout used for the
# logistic-regression bundle; ExtraTrees is trained on unscaled features, so
# there is no scaler to store.
data = {'model': et_clf, 'scaler': None}
# joblib.dump does not create missing folders; make sure the target exists.
os.makedirs(model_path, exist_ok=True)
joblib.dump(data, os.path.join(model_path, 'lab_et_clf.pkl'))

['/home/nvtu/PhD_Work/ExperimentProtocol2/stress_detection/experiment_scripts/models/ltkhiem/lab_et_clf.pkl']

### Train Logistic Regression Model

In [19]:
# Fit the scaler on the pooled train + validation rows only, then apply the
# same fitted scaler to the test rows.
fit_rows = train_indices + valid_indices
std_scaler = StandardScaler()
X_train = std_scaler.fit_transform(lab_features[fit_rows])
y_train = ground_truth[fit_rows]
X_test = std_scaler.transform(lab_features[test_indices])
y_test = ground_truth[test_indices]

In [20]:
# Hyper-parameters of the lab logistic-regression model.
lr_params = {
    'random_state': 0,
    'class_weight': 'balanced',
    'n_jobs': -1,
    'solver': 'saga',
    'max_iter': 50000,
}
lr_classifier = LogisticRegression(**lr_params)

In [21]:
# Fit on the standardised train + validation lab rows.
lr_classifier.fit(X_train, y_train)

In [22]:
# Score the lab logistic-regression model on the (already standardised) lab test partition.
y_pred = lr_classifier.predict(X_test)
print(Evaluator().evaluate(y_test, y_pred))

{'accuracy': 0.7619457597933706, 'balanced_accuracy': 0.6983090319967598, 'precision': 0.7626178591236827, 'recall': 0.9166666666666666, 'f1': 0.8325764456554647}


In [23]:
# Persist the classifier together with its fitted scaler: inputs must be
# transformed by this scaler before the model can predict on them.
data = {'model': lr_classifier, 'scaler': std_scaler}
# joblib.dump does not create missing folders; make sure the target exists.
os.makedirs(model_path, exist_ok=True)
joblib.dump(data, os.path.join(model_path, 'lab_lr_clf.pkl'))

['/home/nvtu/PhD_Work/ExperimentProtocol2/stress_detection/experiment_scripts/models/ltkhiem/lab_lr_clf.pkl']

# In-the-wild model for the targeted user

In [24]:
# In-the-wild dataset identifier handed to ITWDataPathManager.
itw_dataset_name = 'DCU_EXP2_ITW'

In [25]:
def get_features_and_labels(user_id: str, date: str, n_features: int = 72, dataset_name=None):
    """
    Load one user's in-the-wild features and labels for a single day.

    Reads ``X.npy`` and ``y.npy`` from ``<dataset>/features/<user_id>/<date>``.

    Args:
        user_id: Participant identifier (folder name under ``features``).
        date: Recording day (folder name under the user folder).
        n_features: Number of leading feature columns to keep. Defaults to 72,
            the value previously hard-coded here.
            # presumably the width of the lab feature matrix — TODO confirm
        dataset_name: Dataset identifier for ``ITWDataPathManager``. Defaults
            to the notebook-level ``itw_dataset_name``.

    Returns:
        Tuple ``(feat, gt)``: the truncated feature matrix with non-finite
        values replaced via ``np.nan_to_num``, and the labels as stored.
    """
    if dataset_name is None:
        dataset_name = itw_dataset_name
    dataset_path = ITWDataPathManager(dataset_name).get_dataset_path()

    user_date_feature_path = os.path.join(dataset_path, 'features', user_id, date)
    feature_path = os.path.join(user_date_feature_path, 'X.npy')
    gt_path = os.path.join(user_date_feature_path, 'y.npy')

    # Slice first so nan_to_num only processes the columns that are kept.
    feat = np.nan_to_num(np.load(feature_path)[:, :n_features])
    gt = np.load(gt_path)
    return feat, gt

In [26]:
# Load one (features, labels) pair per recording day, in the order of `dates`,
# and print each day's label distribution.
feat_dates = [get_features_and_labels(user_id, date) for date in dates]
for _, day_labels in feat_dates:
    print(Counter(day_labels))

Counter({0.0: 16619, 1.0: 1242})
Counter({0.0: 30000, 1.0: 7224})
Counter({0.0: 39579, 1.0: 6050})
Counter({0.0: 48606})
Counter({0.0: 23292, 1.0: 5447})
Counter({0.0: 7408, 1.0: 2120})


## 1. Apply lab-based model to in-the-wild data

In [27]:
# Pool all recording days into one feature matrix and one label vector.
itw_features_per_day, itw_labels_per_day = zip(*feat_dates)
X_test_itw = np.concatenate(itw_features_per_day, axis=0)
y_test_itw = np.concatenate(itw_labels_per_day, axis=0)

In [28]:
# Loader over ALL in-the-wild days; used to score the lab models as-is.
itw_test_dataloader = EmbeddingDataLoader(X_test_itw, y_test_itw)

In [29]:
# Despite the `itw_` prefix, every model bound here is still lab-trained: the
# Deep-Fusion trainer is pointed at the lab weights via `pretrained_model_path`
# and the two classical models are read from the lab_*.pkl bundles.
# `itw_user_model_saved_path` is the path handed to the trainer as its save location.
itw_user_model_saved_path = os.path.abspath(f'{model_path}/itw_deep_fusion.pth')
itw_df_clf = BranchNeuralNetworkTrainer('./logs.txt',
    itw_user_model_saved_path,
    config_dict,
    target_metrics=['balanced_accuracy', 'f1'],
    pretrained_model_path = user_model_saved_path
)
itw_et_clf = joblib.load(os.path.join(model_path, 'lab_et_clf.pkl'))['model']
# Unpickle the logistic-regression bundle once and take both entries from it.
lab_lr_bundle = joblib.load(os.path.join(model_path, 'lab_lr_clf.pkl'))
itw_lr_clf = lab_lr_bundle['model']
std_scaler = lab_lr_bundle['scaler']

LOAD PRETRAINED MODEL


In [30]:
# Apply each lab-trained model, unchanged, to the pooled in-the-wild data.
# Every entry pairs the banner to print with a zero-argument callable that
# produces that model's predictions.
lab_model_predictors = (
    ("--- Deep Fusion ---", lambda: itw_df_clf.predict(itw_test_dataloader)),
    ("--- Extra Trees ---", lambda: itw_et_clf.predict(X_test_itw)),
    # Logistic regression needs the scaler that was fitted with it.
    ("--- Logistic Regression ---", lambda: itw_lr_clf.predict(std_scaler.transform(X_test_itw))),
)
for banner, run_prediction in lab_model_predictors:
    print(banner)
    y_pred_itw = run_prediction()
    print(Evaluator().evaluate(y_test_itw, y_pred_itw))

--- Deep Fusion ---
{'accuracy': 0.6744763762947326, 'balanced_accuracy': 0.6471355738561426, 'precision': 0.2046133094888077, 'recall': 0.6113752660417515, 'f1': 0.30661095087775075}
--- Extra Trees ---
{'accuracy': 0.5282882076050046, 'balanced_accuracy': 0.5961920156670315, 'precision': 0.15650023795236814, 'recall': 0.6850065661368473, 'f1': 0.2547898367034133}
--- Logistic Regression ---
{'accuracy': 0.6167271719255599, 'balanced_accuracy': 0.5744603678724267, 'precision': 0.15760750027493675, 'recall': 0.5191776479644976, 'f1': 0.24180876754510847}


## 2. Fine-tune the lab-based model to adapt to the in-the-wild data

In [31]:
# Chronological split by recording day: the first three days are used for
# training and the remaining days for testing. Labels are cast to int.
# NOTE: this rebinds X_train / y_train / X_test / y_test, which held lab data until now.
train_days, test_days = feat_dates[:3], feat_dates[3:]
X_train = np.concatenate([day_features for day_features, _ in train_days])
y_train = np.concatenate([day_labels for _, day_labels in train_days]).astype(int)
X_test = np.concatenate([day_features for day_features, _ in test_days])
y_test = np.concatenate([day_labels for _, day_labels in test_days]).astype(int)

### Fine-tune Deep-Fusion Model

In [32]:
# Loaders for the day-based split.
# NOTE: `itw_test_dataloader` is rebound here; it previously wrapped all days
# and now wraps only the test days.
itw_train_dataloader, itw_test_dataloader = (
    EmbeddingDataLoader(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [33]:
# Fine-tune the lab-initialised Deep-Fusion model on the training days.
# NOTE(review): the second argument is the loader built from the TEST days; the
# (commented-out) lab training call passes a validation loader in that position.
# If the trainer uses it for model selection or early stopping, the test metrics
# reported below are optimistically biased — consider holding out part of the
# training days for validation instead.
itw_df_clf.train(itw_train_dataloader, itw_test_dataloader, num_epochs=1000)

----> Epoch: 0, Loss: 2.6203766441345215, Evaluation: {'balanced_accuracy': 0.6469677815634836, 'f1': 0.6623414865827595}
----> Epoch: 10, Loss: 2.6200260496139527, Evaluation: {'balanced_accuracy': 0.6467170978585115, 'f1': 0.6641038234849629}
----> Epoch: 20, Loss: 2.622978911399841, Evaluation: {'balanced_accuracy': 0.6460528131213199, 'f1': 0.6639571506704638}
----> Epoch: 30, Loss: 2.6200963163375857, Evaluation: {'balanced_accuracy': 0.6473954259334265, 'f1': 0.6620480953151318}
----> Epoch: 40, Loss: 2.6183602905273435, Evaluation: {'balanced_accuracy': 0.6479205545113517, 'f1': 0.6632736388324438}
----> Epoch: 50, Loss: 2.6200092315673826, Evaluation: {'balanced_accuracy': 0.6483421636465195, 'f1': 0.6634163031391669}
----> Epoch: 60, Loss: 2.618656029701233, Evaluation: {'balanced_accuracy': 0.6471079464091272, 'f1': 0.6631076890983397}
----> Epoch: 70, Loss: 2.6159680128097533, Evaluation: {'balanced_accuracy': 0.6506583066618485, 'f1': 0.6645824198867644}
----> Epoch: 80, Lo

{'balanced_accuracy': 0.6183724672509305, 'f1': 0.237748128864302}

In [34]:
# Score the fine-tuned Deep-Fusion model on the in-the-wild test days.
y_pred_itw = itw_df_clf.predict(itw_test_dataloader)
print(Evaluator().evaluate(y_test, y_pred_itw))

{'accuracy': 0.7303650155974813, 'balanced_accuracy': 0.6183724672509305, 'precision': 0.15770841428139706, 'recall': 0.48275406369763446, 'f1': 0.237748128864302}


## 3. Re-train ML Model

### Re-train ExtraTreesClassifier Model

In [35]:
# Re-train ExtraTrees from scratch on the in-the-wild training days. The
# settings match the lab model except that the split/leaf minimums are given
# as fractions (0.005) rather than absolute counts.
itw_et_params = {
    'n_estimators': 500,
    'random_state': 0,
    'n_jobs': -1,
    'max_features': 'sqrt',
    'max_depth': 8,
    'min_samples_split': 0.005,
    'min_samples_leaf': 0.005,
    'oob_score': True,
    'bootstrap': True,
    'class_weight': 'balanced',
}
itw_et_clf = ExtraTreesClassifier(**itw_et_params)
itw_et_clf.fit(X_train, y_train)

In [36]:
# Score the re-trained ExtraTrees model on the in-the-wild test days.
y_pred_itw = itw_et_clf.predict(X_test)
print(Evaluator().evaluate(y_test, y_pred_itw))

{'accuracy': 0.6913080013352826, 'balanced_accuracy': 0.6700814213760298, 'precision': 0.1681263361147507, 'recall': 0.6443768996960486, 'f1': 0.2666739588175777}


### Re-train Logistic Regression Model

In [37]:
# Fresh scaler for the in-the-wild data; kept separate from `std_scaler`,
# which holds the scaler loaded from the lab logistic-regression bundle.
scaler = StandardScaler()

In [38]:
# Re-train logistic regression on the standardised in-the-wild training days.
# Same settings as the lab model apart from the lower iteration cap.
itw_lr_params = {
    'random_state': 0,
    'class_weight': 'balanced',
    'n_jobs': -1,
    'solver': 'saga',
    'max_iter': 10000,
}
itw_lr_clf = LogisticRegression(**itw_lr_params)
# The scaler is fitted here, on the training days only.
X_train_scaled = scaler.fit_transform(X_train)
itw_lr_clf.fit(X_train_scaled, y_train)

KeyboardInterrupt: 

In [None]:
# Score the re-trained logistic regression on the test days, transformed with
# the scaler fitted on the training days.
# NOTE(review): the fit cell above ended in KeyboardInterrupt in the saved run,
# so this cell has no recorded output.
y_pred_itw = itw_lr_clf.predict(scaler.transform(X_test))
print(Evaluator().evaluate(y_test, y_pred_itw))