In [1]:
import numpy as np
from sktime.transformations.panel.rocket import Rocket
import os
from tqdm import tqdm
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import classification_report

In [2]:
# get relative indices
# manually entered via looking at https://raw.githubusercontent.com/google/mediapipe/a908d668c730da128dfa8d9f6bd25d519d006692/mediapipe/modules/face_geometry/data/canonical_face_model_uv_visualization.png
lower = [76, 77, 90, 180, 85, 16, 315, 404, 320, 307]

upper = [184, 74, 73, 72, 11, 302, 303, 304, 408, 306]

u2 = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409]
l2 = [291, 375, 321, 405, 314, 17, 84, 181, 91, 146]

u3 = [57, 186, 92, 165, 167, 164, 393, 391, 322, 410]
l3 = [287, 273, 335, 406, 313, 18, 83, 182, 106, 43]

combo_indices = lower + upper + u2 + l2 + u3 + l3

In [3]:
opath = "/Users/nraman/Documents/thesis_videos/"
# make sure to get 
subfolders = [i for i in os.listdir(opath) if ".mp4" not in i and ".DS_Store" not in i and ".MOV" not in i]

def find_pointfolder(subfolder, path = "/Users/nraman/Documents/thesis_videos/"):
    cur_path = f"{path}{subfolder}"
    # get names of files within the subfolder
    subfiles = os.listdir(f"{path}{subfolder}")
    nps = [i for i in subfiles if ".npy" in i]
    
    if(len(nps) > 1):
        return("Error: Multiple np arrays saved in the folder")
    else:
        array = np.load(f"{cur_path}/{nps[0]}")
        array = array[:, combo_indices]
        return(array.reshape(array.shape[0], len(combo_indices)*2))

In [4]:
ground_folders = [i for i in subfolders if "ground" in i]
speak_folders = [i for i in subfolders if "ground" not in i]

ground_data = []
speak_data = []

for g in ground_folders:
    ground_data.append(find_pointfolder(g))

for s in speak_folders:
    speak_data.append(find_pointfolder(s))

In [13]:
# each input is list of arrays (each 2d, of dimension #timepoints x #points*2)--ground corresponds to no speech
# get dataframe of dimension #samples x #features
def timeseries_df(groundlist, speaklist, window = 12, num_features = len(combo_indices)*2):
    sz = len(groundlist) + len(speaklist)
    numspeak = len(speak_data)

    df = pd.DataFrame(np.zeros([sz, num_features])).astype(object)

    for j in range(sz):
        for a in range(len(combo_indices)*2):
            if(j < numspeak):
                cur = speaklist[j][:, a]
                # use last 12 frames since Nandita tended to say words towards the end of videos
                df.iloc[j, a] = pd.Series(cur[-window:])
            else:
                it = j - numspeak
                cur = groundlist[it][:, a]
                df.iloc[j, a] = pd.Series(cur[-window:])
    
    true = np.concatenate([np.ones(numspeak), np.zeros(sz - numspeak)])
    
    return df, true

In [14]:
alldata, alltrue = timeseries_df(ground_data, speak_data)

In [15]:
# X_train, X_test, y_train, y_test = train_test_split(alldata, alltrue, test_size = 0.2, shuffle = True)

In [16]:
# # train Rocket embedding
# rocket = Rocket()
# rocket.fit(X_train)
# X_train_transform = rocket.transform(X_train)

In [17]:
# # train logistic regression classifier
# classifier = LDA()
# classifier.fit(X_train_transform, y_train)

In [18]:
# X_test_transform = rocket.transform(X_test)
# print(classifier.score(X_test_transform, y_test))
# print(classifier.predict(X_test_transform))
# print(y_test)


### STRANGE NOTE: 
When timeseries_df looks like the code below, I get classifier accuracies around 60%. However, when I change to cur[:window], I get around 90% accuracy!
         

In [21]:
kf = KFold(n_splits=10)
kf.get_n_splits(X = alldata, y = alltrue)

preds = []
actual = []

for train_index, test_index in tqdm(kf.split(alldata)):
    X_train, X_test = alldata.iloc[train_index,:], alldata.iloc[test_index,:]
    y_train, y_test = alltrue[train_index], alltrue[test_index]
    
    rocket = Rocket()
    rocket.fit(X_train)
    X_train_transform = rocket.transform(X_train)
    
    model = LR(max_iter = 2000)
    model.fit(X_train_transform, y_train)
    
    pred_values = model.predict(rocket.transform(X_test))
    preds.append([int(i) for i in pred_values])
    actual.append([int(i) for i in y_test])

10it [00:08,  1.22it/s]


In [22]:
allpreds = np.concatenate(preds)
allactual = np.concatenate(actual)

print(classification_report(allactual, allpreds, target_names=["Non-Speech", "Speech"]))


              precision    recall  f1-score   support

  Non-Speech       0.82      0.85      0.84        27
      Speech       0.93      0.91      0.92        58

    accuracy                           0.89        85
   macro avg       0.88      0.88      0.88        85
weighted avg       0.90      0.89      0.89        85

