In [1]:
# Using Python 3.8.12
import pandas as pd # pandas 1.3.3
from pycaret.classification import * # pycaret 2.3.6

In [2]:
# Load the pre-computed feature set (a copy of the features from 032422). The code that generates
# these features is in the data_processing/make_features folder on the "compute_data_features" branch.
df_main = pd.read_csv("~/human-activity-recognition/features_032422.csv")

# Raw data the features were made from; used here only to look up the category of each
# incident (i.e. slip, trip, fall, or other).
df_raw = pd.read_csv("~/human-activity-recognition/raw-data/har_raw.gz")

# Dictionary with key = incident_id (2nd raw column) and value = motion (3rd raw column).
# Built in a single pass instead of one .iloc lookup per row; if an incident_id occurs more
# than once, the last row still wins.
incident_dict = dict(zip(df_raw.iloc[:, 1], df_raw.iloc[:, 2]))

# Features that are going to be "ignored". Most of these features are based in the y-direction
# because the team found that models would correlate a "misoriented" wearable with a specific
# type of motion. (Alden mentioned it before in a meeting)
filter_feat = ["mean y", "window -7480:-2480 mean y", "window -4980:20 mean y", "window -2480:2520 mean y",
               "window 20:5020 mean y", "window 2520:7520 mean y", "peak y"]

# Keep the incident ids aside so they are never normalized, and work on the feature columns only.
incident_ids = df_main["incident_id"]
features = df_main.drop(columns=filter_feat + ["incident_id"])

# z-score normalize each feature so the scales are the same (assumes a normal distribution).
# A constant column has std == 0; dividing by 1 instead maps it to all zeros rather than NaN.
# NOTE(review): the mean/std are computed on the full data set, before any train/test split, and
# the later setup(..., normalize=True) call rescales the data again - consider dropping this step.
feature_std = features.std().replace(0, 1)
features = (features - features.mean()) / feature_std

# Separate DataFrames for STF (slip, trip, falls) and Others. Every incident that is not
# labelled "other" in the raw data is treated as a slip, trip, or fall.
others = [ID for ID, motion in incident_dict.items() if motion == "other"]
is_other = incident_ids.isin(others)

df_stf = features[~is_other].reset_index(drop=True)
df_other = features[is_other].reset_index(drop=True)

# Remove the intermediates that will not be used again (just to be clean)
del others, is_other, incident_ids, features, feature_std, df_main, df_raw

In [3]:
# Combine the two DataFrames (STF and Others) to feed into the PyCaret workflow.
# The ["is_STF"] label is attached to each part before stacking: 1 = slip/trip/fall, 0 = other.
# Using DataFrame.assign avoids relying on `np`, which this notebook never imports explicitly.
# ignore_index=True gives the combined frame unique row labels; without it both halves keep
# their own 0..n-1 index and the labels are duplicated.
df = pd.concat(
    [df_stf.assign(is_STF=1), df_other.assign(is_STF=0)],
    ignore_index=True,
)

print("Number of slip, trip, or fall events:", len(df_stf))
print("Number of other events:", len(df_other))

Number of slip, trip, or fall events: 298
Number of other events: 2472


In [4]:
# Initial setup of the PyCaret experiment.
# This implements SMOTE for the class imbalance, and performs a Yeo-Johnson transformation then
# normalization. It also uses PyCaret's built-in feature selection workflow to reduce the feature space.
# session_id pins the random seed so repeated runs use the same train/test split and CV folds;
# 3471 is the value PyCaret drew at random for the recorded run.
clf1 = setup(
    data=df,
    target="is_STF",
    session_id=3471,
    fix_imbalance=True,
    transformation=True,
    normalize=True,
    feature_selection=True,
    feature_selection_threshold=0.5,
    silent=True,
)

Unnamed: 0,Description,Value
0,session_id,3471
1,Target,is_STF
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(2770, 102)"
5,Missing Values,0
6,Numeric Features,101
7,Categorical Features,0
8,Ordinal Features,0
9,High Cardinality Features,0


In [5]:
# Fit PyCaret's "svm" estimator and score it with k-fold cross-validation.
# NOTE(review): the recorded output shows AUC = 0.0 on every fold - presumably this estimator
# exposes no probability estimates; confirm before reading anything into that column.
svm = create_model(estimator="svm")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.5515,0.0,0.4,0.0964,0.1553,-0.013,-0.0191
1,0.5412,0.0,0.7,0.1443,0.2393,0.0825,0.1356
2,0.5567,0.0,0.7,0.1489,0.2456,0.0911,0.1462
3,0.5155,0.0,0.25,0.0595,0.0962,-0.0844,-0.1252
4,0.5206,0.0,0.55,0.1158,0.1913,0.0253,0.0409
5,0.4691,0.0,0.55,0.1048,0.176,0.0034,0.006
6,0.6495,0.0,0.45,0.1364,0.2093,0.0607,0.0786
7,0.5773,0.0,0.15,0.0441,0.0682,-0.1084,-0.1425
8,0.513,0.0,0.4,0.0889,0.1455,-0.029,-0.0452
9,0.544,0.0,0.3684,0.0843,0.1373,-0.0273,-0.0411


In [7]:
# Hyperparameter search for the SVM: 100 iterations, with 'precision' as the
# performance metric to optimize for.
tuned_svm = tune_model(estimator=svm, n_iter=100, optimize="precision")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.4948,0.0,0.6,0.1176,0.1967,0.0294,0.0504
1,0.5619,0.0,0.8,0.1649,0.2735,0.1237,0.2034
2,0.5052,0.0,0.75,0.1415,0.2381,0.0782,0.1387
3,0.5,0.0,0.65,0.1262,0.2114,0.0468,0.0809
4,0.5258,0.0,0.7,0.14,0.2333,0.0743,0.1252
5,0.5103,0.0,0.8,0.1495,0.252,0.0947,0.1694
6,0.5052,0.0,0.6,0.12,0.2,0.034,0.0573
7,0.4948,0.0,0.5,0.102,0.1695,-0.0021,-0.0035
8,0.5285,0.0,0.55,0.1183,0.1947,0.0291,0.0464
9,0.5337,0.0,0.5263,0.1099,0.1818,0.0226,0.0363


In [8]:
# The SVM is not doing well in terms of precision, so compare the other available
# models (ranked by precision) to see whether any of them perform better.
best = compare_models(sort="precision")

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8901,0.5502,0.02,0.2,0.0364,0.0164,0.0316,1.21
rf,Random Forest Classifier,0.8694,0.5686,0.05,0.1537,0.0731,0.0194,0.0257,5.38
lightgbm,Light Gradient Boosting Machine,0.8509,0.5915,0.0853,0.1354,0.1034,0.0278,0.0287,1.929
nb,Naive Bayes,0.5717,0.5741,0.5224,0.1239,0.2002,0.0411,0.0611,0.065
ada,Ada Boost Classifier,0.6497,0.563,0.3676,0.116,0.1762,0.0243,0.0316,3.035
lr,Logistic Regression,0.6026,0.5383,0.4208,0.1119,0.1766,0.0181,0.027,0.514
lda,Linear Discriminant Analysis,0.5897,0.5381,0.4363,0.1115,0.1775,0.0174,0.0265,0.177
knn,K Neighbors Classifier,0.4773,0.523,0.5866,0.1114,0.1871,0.0178,0.0317,0.47
gbc,Gradient Boosting Classifier,0.7497,0.5487,0.2113,0.1095,0.143,0.0133,0.0151,14.206
ridge,Ridge Classifier,0.5908,0.0,0.4211,0.1083,0.1722,0.0116,0.0187,0.125
