# Stress detection

## Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings("ignore")

## Importing data

In [2]:
# Load the feature table; the 'datetime' column is parsed into timestamps while reading.
df = pd.read_csv(
    'datasets/FeaturesDatabase.csv',
    parse_dates=['datetime'],
)

In [3]:
# Remove the ID and datetime columns; the remaining columns are the features plus `tag`.
df = df.drop(['ID', 'datetime'], axis=1)

In [4]:
# Keep only the rows whose tag is 1, 2 or 3 and renumber them from 0.
df = df[df['tag'].isin([1, 2, 3])].reset_index(drop=True)

In [5]:
# Display the filtered frame (pandas abbreviates the rendering of large frames).
df

Unnamed: 0,tonic_mean,tonic_std,tonic_min,tonic_max,tonic_kurtosis,tonic_peaks,tonic_n_sign_changes,tonic_entropy,phasic_mean,phasic_std,...,bvp_min,bvp_max,bvp_std,bvp_peaks,bvp_sign_changes,temp_mean,temp_min,temp_max,temp_std,tag
0,1.217555e-01,1.110994e-02,0.109972,1.398770e-01,-1.335396,0.0,0.0,5.476544,3.966073e-03,6.319387e-03,...,-100.3250,92.270781,23.415148,17.0,29.0,31.619000,31.59,31.65,0.015806,3
1,1.211121e-01,1.058294e-02,0.109623,1.400274e-01,-1.026073,0.0,0.0,5.476902,4.134432e-03,6.859174e-03,...,-100.3250,92.270781,23.419873,18.0,29.0,31.619000,31.59,31.65,0.015806,3
2,1.206340e-01,1.034728e-02,0.110152,1.398311e-01,-0.887069,0.0,0.0,5.477046,4.425717e-03,7.012810e-03,...,-100.3250,92.270781,23.411065,17.0,30.0,31.618667,31.59,31.65,0.015345,3
3,1.202648e-01,1.027256e-02,0.109644,1.397805e-01,-0.850265,0.0,0.0,5.477077,4.757627e-03,7.258606e-03,...,-100.3250,92.270781,22.234918,17.0,29.0,31.619000,31.59,31.65,0.015806,3
4,1.199976e-01,1.017837e-02,0.110638,1.398852e-01,-0.800499,0.0,0.0,5.477129,5.296997e-03,8.345932e-03,...,-100.3250,92.270781,19.516206,18.0,29.0,31.619333,31.59,31.65,0.016247,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360867,-5.185897e-07,3.029454e-07,-0.000001,4.370949e-08,0.936544,0.0,1.0,-inf,5.185897e-07,3.957395e-07,...,-0.5075,0.609219,0.277044,3.0,37.0,23.263333,23.25,23.29,0.012026,3
360868,-5.185897e-07,3.029454e-07,-0.000001,4.370949e-08,0.936544,0.0,1.0,-inf,5.185897e-07,3.957395e-07,...,-0.5075,0.609219,0.267675,3.0,36.0,23.263333,23.25,23.29,0.012026,3
360869,-5.185897e-07,3.029454e-07,-0.000001,4.370949e-08,0.936544,0.0,1.0,-inf,5.185897e-07,3.957395e-07,...,-0.5075,0.609219,0.269484,3.0,37.0,23.263667,23.25,23.29,0.011927,3
360870,-5.185897e-07,3.029454e-07,-0.000001,4.370949e-08,0.936544,0.0,1.0,-inf,5.185897e-07,3.957395e-07,...,-0.5075,0.609219,0.273738,3.0,37.0,23.263000,23.23,23.29,0.012663,3


In [6]:
# Rows per class; the recorded output shows tag 3 far outnumbering tags 1 and 2.
df.tag.value_counts()

3    323314
1     19276
2     18282
Name: tag, dtype: int64

## Feature selection

### Creating train and test datasets

In [7]:
# Hold out 30% of the rows for testing; random_state=0 makes the split repeatable.
# NOTE(review): the split is purely random (no stratify=, no grouping by subject or time)
# although the classes are imbalanced — confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['tag']),
    df.tag,
    random_state=0,
    test_size=0.3,
)

X_train.shape, X_test.shape

((252610, 50), (108262, 50))

In [8]:
# dealing with infinite values: +inf -> 9999.999 and -inf -> -9999.999,
# then renumber the rows from 0
X_train = X_train.replace([np.inf, -np.inf], [9999.999, -9999.999]).reset_index(drop=True)
X_test = X_test.replace([np.inf, -np.inf], [9999.999, -9999.999]).reset_index(drop=True)

In [9]:
# Snapshot the full-feature train/test frames so later cells can compare models
# trained on all variables against models trained on the reduced sets.
X_train_original, X_test_original = X_train.copy(), X_test.copy()

In [10]:
# List the feature names of the full (unfiltered) training frame.
X_train_original.columns

Index(['tonic_mean', 'tonic_std', 'tonic_min', 'tonic_max', 'tonic_kurtosis',
       'tonic_peaks', 'tonic_n_sign_changes', 'tonic_entropy', 'phasic_mean',
       'phasic_std', 'phasic_min', 'phasic_max', 'phasic_kurtosis',
       'phasic_peaks', 'phasic_n_sign_changes', 'phasic_entropy', 'acc_x_mean',
       'acc_x_std', 'acc_x_min', 'acc_x_max', 'acc_x_kurtosis', 'acc_x_peaks',
       'acc_x_n_sign_changes', 'acc_x_entropy', 'acc_y_mean', 'acc_y_std',
       'acc_y_min', 'acc_y_max', 'acc_y_kurtosis', 'acc_y_peaks',
       'acc_y_n_sign_changes', 'acc_y_entropy', 'acc_z_mean', 'acc_z_std',
       'acc_z_min', 'acc_z_max', 'acc_z_kurtosis', 'acc_z_peaks',
       'acc_z_n_sign_changes', 'acc_z_entropy', 'bvp_mean', 'bvp_min',
       'bvp_max', 'bvp_std', 'bvp_peaks', 'bvp_sign_changes', 'temp_mean',
       'temp_min', 'temp_max', 'temp_std'],
      dtype='object')

# Filter methods

### Checking for constant features

In [11]:
# we do not have any constant features
# (a feature counts as constant when its standard deviation on the training rows is exactly 0)
constant_features = [name for name, column in X_train.items() if column.std() == 0]
len(constant_features)

0

### Checking for quasi-constant features

In [12]:
# we do not have any quasi-constant features
# A feature is flagged as quasi-constant when its training-set variance falls below 0.01.
# For a binary feature, variance = p * (1 - p), so 0.01 corresponds to roughly 99% of the
# observations sharing one value.
sel = VarianceThreshold(threshold=0.01)
sel.fit(X_train)  # fit computes the per-feature variances
# get_support() marks the features that are KEPT, so the quasi-constant count is
# total minus kept (the reverse subtraction yields a negative number whenever any are found)
len(X_train.columns) - sum(sel.get_support())  # how many quasi-constant

0

### Checking for duplicated features

In [13]:
# we do not have any duplicated features
# Compare every column with each column to its right; the right-hand column of an
# identical pair is recorded as the duplicate.
duplicated_feat = []
column_names = list(X_train.columns)
for position, first in enumerate(column_names):
    for second in column_names[position + 1:]:
        if X_train[first].equals(X_train[second]):
            duplicated_feat.append(second)

len(duplicated_feat)

0

### Checking for correlated features

In [14]:
# we have 23 correlated features
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    A column is flagged when the absolute correlation (from ``dataset.corr()``)
    between it and at least one column positioned before it is strictly greater
    than ``threshold``. The first column of each correlated group is never flagged.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float
        Absolute-correlation cut-off (exclusive).

    Returns
    -------
    set
        Names of the flagged columns.
    """
    corr_matrix = dataset.corr()
    flagged = set()
    for row_idx, name in enumerate(corr_matrix.columns):
        # only look left of the diagonal: correlations with earlier columns
        with_earlier = corr_matrix.iloc[row_idx, :row_idx].abs()
        if (with_earlier > threshold).any():
            flagged.add(name)
    return flagged

# Flag every training feature whose absolute correlation with an earlier feature exceeds 0.8.
corr_features = correlation(X_train, threshold=0.8)
print('correlated features: ', len(corr_features))

correlated features:  23


In [15]:
# removing correlated features (the same columns are dropped from train and test)
X_train = X_train.drop(labels=corr_features, axis=1)
X_test = X_test.drop(labels=corr_features, axis=1)

X_train.shape, X_test.shape

((252610, 27), (108262, 27))

In [16]:
# Snapshot the frames after the correlation filter; the embedded-method cells below
# keep working on X_train / X_test.
X_train_corr, X_test_corr = X_train.copy(), X_test.copy()

In [17]:
# List the features that remain after the correlation filter.
X_train_corr.columns

Index(['tonic_mean', 'tonic_std', 'tonic_kurtosis', 'tonic_peaks',
       'tonic_n_sign_changes', 'phasic_min', 'phasic_kurtosis',
       'phasic_n_sign_changes', 'acc_x_mean', 'acc_x_std', 'acc_x_max',
       'acc_x_kurtosis', 'acc_x_peaks', 'acc_x_n_sign_changes', 'acc_y_mean',
       'acc_y_max', 'acc_y_kurtosis', 'acc_y_peaks', 'acc_y_n_sign_changes',
       'acc_z_mean', 'acc_z_min', 'acc_z_kurtosis', 'acc_z_n_sign_changes',
       'bvp_mean', 'bvp_min', 'bvp_sign_changes', 'temp_std'],
      dtype='object')

### Comparing the performance in machine learning algorithms

#### Logistic Regression with cross validation

In [18]:
# original: all features, before any filtering
clf = LogisticRegressionCV(cv=10, multi_class='ovr', class_weight='balanced')
clf.fit(X_train_original, y_train)
# missing values in the test features are replaced with 0 before predicting
target = clf.predict(X_test_original.fillna(0))
print(f"Model's accuracy on the test set: {accuracy_score(y_test, target)}")

Model's accuracy on the test set: 0.6552160499528921


In [19]:
# filter methods - correlation: features left after dropping the correlated ones
clf = LogisticRegressionCV(cv=10, multi_class='ovr', class_weight='balanced')
clf.fit(X_train_corr, y_train)
# missing values in the test features are replaced with 0 before predicting
target = clf.predict(X_test_corr.fillna(0))
print(f"Model's accuracy on the test set: {accuracy_score(y_test, target)}")

Model's accuracy on the test set: 0.6524265208475735


#### Naive Bayes

In [20]:
# original: all features, before any filtering
gnb = GaussianNB()
gnb.fit(X_train_original, y_train)
# missing values in the test features are replaced with 0 before predicting
target = gnb.predict(X_test_original.fillna(0))
print(f"Model's accuracy on the test set: {accuracy_score(y_test, target)}")

Model's accuracy on the test set: 0.5556889767416083


In [21]:
# filter methods - correlation: features left after dropping the correlated ones
gnb = GaussianNB()
gnb.fit(X_train_corr, y_train)
# missing values in the test features are replaced with 0 before predicting
target = gnb.predict(X_test_corr.fillna(0))
print(f"Model's accuracy on the test set: {accuracy_score(y_test, target)}")

Model's accuracy on the test set: 0.5390164600690917


#### K-nearest neighbors

In [22]:
# original: all features, before any filtering
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train_original, y_train)
# missing values in the test features are replaced with 0 before predicting
target = clf.predict(X_test_original.fillna(0))
print(f"Model's accuracy on the test set: {accuracy_score(y_test, target)}")

Model's accuracy on the test set: 0.9956402061665219


In [23]:
# filter methods - correlation: features left after dropping the correlated ones
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train_corr, y_train)
# missing values in the test features are replaced with 0 before predicting
target = clf.predict(X_test_corr.fillna(0))
print(f"Model's accuracy on the test set: {accuracy_score(y_test, target)}")

Model's accuracy on the test set: 0.9979678927047348


# Embedded methods

## Random forests

### Selecting features by random forests derived importance

In [24]:
# Rank the features with a random forest and keep those whose importance reaches
# SelectFromModel's default threshold (the mean importance for estimators without an L1 penalty).
# random_state is fixed so that the selected feature set is the same on every run.
sel_ = SelectFromModel(RandomForestClassifier(n_estimators=400, random_state=39))
sel_.fit(X_train, y_train)

# keep only the selected columns and wrap the result in DataFrames again
# (sklearn's transform returns a numpy array, so the column names are re-attached)
selected_mask = sel_.get_support()
X_train_rf = pd.DataFrame(sel_.transform(X_train), columns=X_train.columns[selected_mask])
X_test_rf = pd.DataFrame(sel_.transform(X_test), columns=X_train.columns[selected_mask])

In [25]:
# Shapes of the train/test frames restricted to the random-forest-selected features.
X_train_rf.shape, X_test_rf.shape

((252610, 14), (108262, 14))

In [26]:
# Names of the features kept by the random-forest selector.
X_train_rf.columns

Index(['tonic_mean', 'tonic_std', 'acc_x_mean', 'acc_x_std', 'acc_x_max',
       'acc_x_peaks', 'acc_y_mean', 'acc_y_max', 'acc_y_peaks',
       'acc_y_n_sign_changes', 'acc_z_mean', 'acc_z_min', 'bvp_min',
       'temp_std'],
      dtype='object')

### Applying the model and measuring performance

In [27]:
# Shallow random forest (max_depth=4) trained on the random-forest-selected features.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=4,
    random_state=39,
)
rf.fit(X_train_rf, y_train)
# missing values in the test features are replaced with 0 before predicting
target = rf.predict(X_test_rf.fillna(0))
print(f"Model's accuracy on the test set: {accuracy_score(y_test, target)}")

Model's accuracy on the test set: 0.8982560824666088


## Decision trees

### Selecting features

In [28]:
# selecting features by decision-tree importance
# random_state is fixed (same seed as the other seeded models in this notebook) so the
# fitted tree, and therefore the selected feature set, is reproducible across runs
clf = tree.DecisionTreeClassifier(class_weight='balanced', random_state=39)
sel_ = SelectFromModel(clf)
sel_.fit(X_train.fillna(0), y_train)

SelectFromModel(estimator=DecisionTreeClassifier(class_weight='balanced'))

In [29]:
# names of the features kept by the selector (in their original column order)
selected_feat = X_train.columns[sel_.get_support()]
len(selected_feat)

12

In [30]:
# Show the names of the currently selected features.
selected_feat

Index(['tonic_mean', 'tonic_std', 'acc_x_mean', 'acc_x_max', 'acc_x_peaks',
       'acc_y_mean', 'acc_y_kurtosis', 'acc_y_peaks', 'acc_y_n_sign_changes',
       'acc_z_mean', 'acc_z_n_sign_changes', 'bvp_min'],
      dtype='object')

### Applying the model and measuring performance

In [31]:
# Decision tree trained on the currently selected features.
# random_state is fixed so the reported accuracy is reproducible across runs.
clf = tree.DecisionTreeClassifier(class_weight='balanced', random_state=39)
clf.fit(X_train[selected_feat].fillna(0), y_train)
target = clf.predict(X_test[selected_feat].fillna(0))
print(f"Model's accuracy on the test set: {accuracy_score(y_test, target)}")

Model's accuracy on the test set: 0.9921856237645711


## Gradient boosted trees

### Selecting features by gradient boosted trees importance

In [32]:
# selecting features all together in one go from the importances of a gradient boosting
# model fitted with its default settings (an ensemble of trees, not a single tree)
# random_state is fixed so the selected feature set is reproducible across runs
sel_ = SelectFromModel(GradientBoostingClassifier(random_state=39))
sel_.fit(X_train.fillna(0), y_train)

SelectFromModel(estimator=GradientBoostingClassifier())

In [33]:
# names of the features kept by the selector (in their original column order)
selected_feat = X_train.columns[sel_.get_support()]
len(selected_feat)

9

In [34]:
# Show the names of the currently selected features.
selected_feat

Index(['tonic_mean', 'tonic_std', 'acc_x_mean', 'acc_y_mean', 'acc_y_max',
       'acc_y_peaks', 'acc_z_mean', 'acc_z_min', 'bvp_min'],
      dtype='object')

### Applying the model and measuring performance

In [35]:
# Gradient boosting model trained on the currently selected features.
gb = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=4,
    random_state=39,
)
gb.fit(X_train[selected_feat].fillna(0), y_train)
target = gb.predict(X_test[selected_feat].fillna(0))
print(f"Model's accuracy on the test set: {accuracy_score(y_test, target)}")

Model's accuracy on the test set: 0.9552936395041658


## Comparing models

| ML model                    | Number of features used | Accuracy of prediction  |
|-----------------------------|-------------------------|-------------------------|
| Logistic Regression with CV | 50                      | 0.655                   |
| Logistic Regression with CV | 27                      | 0.652                   |
| Naive Bayes                 | 50                      | 0.556                   |
| Naive Bayes                 | 27                      | 0.539                   |
| K-nearest neighbors         | 50                      | 0.996                   |
| **K-nearest neighbors**     | **27**                  | **0.998**               |
| Random Forest               | 14                      | 0.898                   |
| Decision Tree               | 12                      | 0.992                   |
| Gradient Boosted Tree       | 9                       | 0.955                   |

## Applying the KNN model to the rest of the dataset (rows with tag 0)

In [36]:
# Refit the best-scoring configuration: 3-NN on the correlation-filtered features.
# NOTE(review): the train/test split is random over rows that look like consecutive,
# near-identical windows, so this accuracy may be inflated by near-duplicates landing
# on both sides of the split — confirm with a split by subject or by time.
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train_corr, y_train)
target = clf.predict(X_test_corr.fillna(0))
print(f"Model's accuracy on the test set: {accuracy_score(y_test, target)}")

Model's accuracy on the test set: 0.9979678927047348


In [37]:
# Use the columns of the correlation-filtered training frame, i.e. the features
# the KNN model stored in `clf` was fitted on.
selected_feat = X_train_corr.columns

In [38]:
# Reload the feature table and keep only the rows with tag == 0,
# without the ID, datetime and tag columns.
df = (
    pd.read_csv('datasets/FeaturesDatabase.csv', parse_dates=['datetime'])
    .drop(columns=['ID', 'datetime'])
    .loc[lambda frame: frame.tag == 0]
    .reset_index(drop=True)
    .drop(columns='tag')
)

In [39]:
# Same sentinel replacement as for the training data: +inf -> 9999.999, -inf -> -9999.999.
df = df.replace([np.inf, -np.inf], [9999.999, -9999.999]).reset_index(drop=True)

In [40]:
# Predict a tag for every tag-0 row and store the predictions in a one-column frame named 'tag'.
pred = pd.DataFrame({'tag': clf.predict(df[selected_feat])})

In [41]:
# Number of rows predicted for each tag.
pred.value_counts()

tag
3      1621156
2       162847
1       136008
dtype: int64

## Creating one common dataset with tags

In [44]:
# Read the feature table once and derive both subsets from it
# (previously the same CSV was parsed twice in this cell).
raw = pd.read_csv('datasets/FeaturesDatabase.csv', parse_dates=['datetime'])

# Rows with tag == 0: swap the tag column for the KNN predictions.
df = raw.loc[raw.tag == 0].reset_index(drop=True)
df = df.drop(columns = 'tag')
# concat(axis=1) aligns on the index, so `pred` must hold exactly one row per tag-0 row;
# fail loudly if `pred` is stale instead of silently producing NaN tags
assert len(df) == len(pred), "pred does not match the tag == 0 rows; re-run the prediction cells"
df = pd.concat([df, pred], axis=1)

# Rows tagged 1, 2 or 3 keep their original tag.
data = raw.loc[(raw.tag == 1) | (raw.tag == 2) | (raw.tag == 3)].reset_index(drop=True)

del raw  # the full table is no longer needed

In [45]:
# Stack the predicted-tag rows and the originally tagged rows into one frame.
# pd.concat replaces DataFrame.append, which was deprecated and then removed in pandas 2.0.
df_full = pd.concat([df, data], ignore_index=True)

In [46]:
# Order the rows by ID, then chronologically within each ID.
df_full = df_full.sort_values(['ID', 'datetime'])

In [47]:
# Index the frame by (ID, datetime).
# Re-running this cell alone fails, because both columns are already in the index by then.
df_full = df_full.set_index(keys=['ID', 'datetime'])

## Resulting data frame with predicted stress periods

In [48]:
# Display the combined frame indexed by (ID, datetime); pandas abbreviates large frames.
df_full

Unnamed: 0_level_0,Unnamed: 1_level_0,tonic_mean,tonic_std,tonic_min,tonic_max,tonic_kurtosis,tonic_peaks,tonic_n_sign_changes,tonic_entropy,phasic_mean,phasic_std,...,bvp_min,bvp_max,bvp_std,bvp_peaks,bvp_sign_changes,temp_mean,temp_min,temp_max,temp_std,tag
ID,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A000C8,2019-07-17 06:34:11,0.457250,0.782798,-1.159493,1.189180,-0.686382,0.0,2.0,-inf,0.766455,0.786555,...,-141.496563,145.397188,41.913447,20.0,40.0,35.068833,34.99,35.13,0.039191,3
A000C8,2019-07-17 06:34:12,0.468825,0.760967,-0.940626,1.206356,-1.141705,0.0,2.0,-inf,0.768953,0.767381,...,-141.496563,145.397188,41.914589,21.0,41.0,35.070667,34.99,35.13,0.038483,3
A000C8,2019-07-17 06:34:13,0.476411,0.759664,-0.911919,1.214285,-1.178132,0.0,2.0,-inf,0.768545,0.760485,...,-141.496563,145.397188,41.937624,21.0,40.0,35.072167,34.99,35.13,0.037421,3
A000C8,2019-07-17 06:34:14,0.475620,0.738244,-0.791058,1.268565,-1.328799,0.0,2.0,-inf,0.775316,0.734746,...,-141.496563,145.397188,41.981359,20.0,40.0,35.074167,34.99,35.13,0.036140,3
A000C8,2019-07-17 06:34:15,0.502125,0.716291,-0.766346,1.287976,-1.263933,0.0,2.0,-inf,0.754728,0.708612,...,-141.496563,145.397188,41.979099,21.0,40.0,35.075833,34.99,35.13,0.034459,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AOOF3D(2) F,2019-07-25 10:29:48,1.814926,3.059592,-3.019855,9.070718,-1.161781,1.0,3.0,-inf,6.343827,2.521014,...,-307.415313,417.893281,115.810544,25.0,44.0,33.474833,30.01,34.53,1.584409,3
AOOF3D(2) F,2019-07-25 10:29:49,1.857763,3.332049,-3.563289,8.993588,-1.291271,1.0,3.0,-inf,6.169866,2.673415,...,-307.415313,417.893281,115.810544,25.0,44.0,33.474833,30.01,34.53,1.584409,3
AOOF3D(2) F,2019-07-25 10:29:50,1.787818,2.935411,-2.574139,7.683644,-1.420705,1.0,3.0,-inf,6.112002,2.665108,...,-307.415313,417.893281,115.810544,25.0,44.0,33.474833,30.01,34.53,1.584409,3
AOOF3D(2) F,2019-07-25 10:29:51,1.827658,2.868855,-2.042348,8.290597,-1.306433,1.0,3.0,-inf,5.948013,2.920829,...,-307.415313,417.893281,115.810544,25.0,44.0,33.474833,30.01,34.53,1.584409,3


## Stress periods separately

In [49]:
# Keep only the ID, timestamp and tag of every row.
stress = df_full.reset_index().loc[:, ['ID', 'datetime', 'tag']]

In [50]:
# Disabled alternative: load `stress` from a CSV file instead of rebuilding it above.
# NOTE(review): nothing visible in this notebook writes 'stress_periods_knn.csv' — confirm where it comes from before enabling.
# stress = pd.read_csv('stress_periods_knn.csv', parse_dates = ['datetime'])

In [51]:
# Rewrite the irregular ID spellings to their cleaned-up identifiers (old ID -> new ID).
id_fixes = {
    'A016902F': 'A016902',
    'A016901F': 'A016901',
    'AOOF3D(2) F': 'A00F3D2',
    'A00F3D M': 'A00F3D1',
    'A00520 F': 'A00520',
}
for old_id, new_id in id_fixes.items():
    stress.loc[stress['ID'] == old_id, 'ID'] = new_id

In [52]:
# Display the ID / datetime / tag table after the ID clean-up.
stress

Unnamed: 0,ID,datetime,tag
0,A000C8,2019-07-17 06:34:11,3
1,A000C8,2019-07-17 06:34:12,3
2,A000C8,2019-07-17 06:34:13,3
3,A000C8,2019-07-17 06:34:14,3
4,A000C8,2019-07-17 06:34:15,3
...,...,...,...
2280878,A00F3D2,2019-07-25 10:29:48,3
2280879,A00F3D2,2019-07-25 10:29:49,3
2280880,A00F3D2,2019-07-25 10:29:50,3
2280881,A00F3D2,2019-07-25 10:29:51,3


In [53]:
# Rows per tag over the combined (originally tagged + predicted) data.
stress.tag.value_counts()

3    1944470
2     181129
1     155284
Name: tag, dtype: int64

In [54]:
# Per-ID summary: number of rows carrying each tag (3 = no-stress, 1 = pre-stress, 2 = post-stress).
# The loop variable is no longer called `id`, which shadowed the builtin for the rest of
# the notebook, and each ID's rows are selected once instead of three times.
for subject_id in stress.ID.unique():
    subject_tags = stress.loc[stress.ID == subject_id, 'tag']
    print(subject_id)
    print(f'Seconds of no-stress: {int((subject_tags == 3).sum())}')
    print(f'Seconds of pre-stress: {int((subject_tags == 1).sum())}')
    print(f'Seconds of post-stress: {int((subject_tags == 2).sum())}')

A000C8
Seconds of no-stress: 249733
Seconds of pre-stress: 2023
Seconds of post-stress: 696
A00520
Seconds of no-stress: 194849
Seconds of pre-stress: 19035
Seconds of post-stress: 14240
A00708
Seconds of no-stress: 137051
Seconds of pre-stress: 8547
Seconds of post-stress: 6682
A00BAF
Seconds of no-stress: 41290
Seconds of pre-stress: 4689
Seconds of post-stress: 3815
A00C5A
Seconds of no-stress: 204713
Seconds of pre-stress: 34965
Seconds of post-stress: 31648
A00E0B
Seconds of no-stress: 225284
Seconds of pre-stress: 12799
Seconds of post-stress: 13841
A00F3D1
Seconds of no-stress: 137931
Seconds of pre-stress: 11460
Seconds of post-stress: 8825
A01179
Seconds of no-stress: 281663
Seconds of pre-stress: 10230
Seconds of post-stress: 6591
A016901
Seconds of no-stress: 162336
Seconds of pre-stress: 24707
Seconds of post-stress: 14147
A016902
Seconds of no-stress: 111378
Seconds of pre-stress: 14081
Seconds of post-stress: 43757
A00F3D2
Seconds of no-stress: 198242
Seconds of pre-stres