In [3]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")
df_raw = pd.read_csv("CoST.csv")
# df = pd.read_csv("https://data.4tu.nl/articles/dataset/Corpus_of_Social_Touch_CoST_/12696869?file=24044075", sep='\t')


# Preprocessing

frame = 135 - 1 second

In [2]:
df = df_raw.copy()
df.columns = df.columns.str.strip(" ")
df = df.set_index(['subject', 'variant', 'gesture'])
df['frame'].value_counts()

5       7805
1       7805
7       7805
2       7805
8       7805
        ... 
1658       1
1659       1
1580       1
1660       1
1535       1
Name: frame, Length: 1747, dtype: int64

In [3]:
df['observation'] = np.nan
df = df.reset_index()
values = df['frame'].values
i = 1
for index, element in tqdm(enumerate(values)):
    if values[index + 1] < values[index]:
        df.loc[index, "observation"] = i
        i += 1

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




IndexError: index 1496855 is out of bounds for axis 0 with size 1496855

In [4]:
df['observation'] = df['observation'].fillna(method='bfill')
df['observation'] = df['observation'].fillna(df['observation'].max() + 1)
df['observation'] = df['observation'].astype(int)

In [5]:
# Create ML dataset

data = pd.DataFrame(index=np.arange(1, df['observation'].max() + 1).astype(int), columns=['duration'])

# Add gesture (y)
data['gesture'] = pd.Series(df.drop_duplicates(['observation'], keep='last')['gesture'].values)

dict_gesture = {1: "grab", 2: "hit", 3: "massage", 4: "pat", 5: "pinch",
                6: "poke", 7: "press", 8: "rub", 9: "scratch", 10: "slap", 11: "squeeze",
                12: "stroke", 13: "tap", 14: "tickle"}

data['gesture'] = data['gesture'].map(dict_gesture)

data['variant'] = pd.Series(df.drop_duplicates(['observation'], keep='last')['variant'].values)
dict_variant = {1: "gentle", 2: "normal", 3: "rough"}
data['variant'] = data['variant'].map(dict_variant)


# Add duration for every observation
data['duration'] = (df.drop_duplicates(['observation'], keep='last')['frame'] / 135).values

Some unique combination of subject, variant, gesture have 5 repetitions. 7805 gesture captures

# Feature engineering

– Mean pressure is the mean over channels and time (1).

– Maximum pressure is the maximum value over channels
and time (2).

– Pressure variability is the mean over time of the sum over
channels of the absolute value of difference between two
consecutive frames (3).

– Mean pressure per row is the mean over columns and time
resulting in one feature per row which are in the direction
of the mannequin arm’s length (from top to bottom, 4–
11).

– Mean pressure per column is the mean over rows and
time resulting in one feature per column which are in
the direction of the mannequin arm’s width (from left to
right, 12–19).

– Contact area per frame is the fraction of channels with a
value above 50 % of the maximum value. Mean contact
area is the mean over time of contact area (20) and the
maximum pressure contact area is the contact area of the
frame with the highest mean pressure over channels (21).
The size of the contact area indicated whether the whole
hand was used for a touch gesture, as would be expected

In [6]:
# Mean pressure
ch_cols = [i for i in df.columns if i.startswith("ch")]
print(ch_cols[:10])
data['mean_pressure'] = df.groupby('observation')[ch_cols].mean().mean(axis=1)

['ch1', 'ch2', 'ch3', 'ch4', 'ch5', 'ch6', 'ch7', 'ch8', 'ch9', 'ch10']


In [7]:
data.groupby(['gesture'])['mean_pressure'].mean()

gesture
grab       341.931508
hit        127.462967
massage    207.492889
pat        128.959427
pinch      148.889580
poke       118.970094
press      201.823134
rub        164.400707
scratch    138.925870
slap       120.755853
squeeze    285.747048
stroke     149.573673
tap        117.651540
tickle     127.242386
Name: mean_pressure, dtype: float64

In [9]:
# Maximum pressure
data['maximum_pressure'] = df.groupby('observation')[ch_cols].max().max(axis=1)

In [10]:
data.groupby(['gesture'])['maximum_pressure'].mean() * 2

gesture
grab       1708.850987
hit        1689.304659
massage    1693.558348
pat        1513.917415
pinch      1691.086022
poke       1534.254480
press      1703.835125
rub        1595.583483
scratch    1539.315412
slap       1579.863799
squeeze    1756.057451
stroke     1539.482014
tap        1491.870968
tickle     1444.904847
Name: maximum_pressure, dtype: float64

In [11]:
# Variance over channels and time (44)
data['variance'] = df.groupby('observation')[ch_cols].var().var(axis=1)

In [12]:
# Contact area per frame
# Attention! It's really time and memory expensive
df['contact_area'] = df[ch_cols].apply(lambda x: np.mean(x > x.max() * 0.5), axis=1)
data['mean_contact_area'] = df.groupby('observation')['contact_area'].mean()

In [13]:
data.to_csv("final_dataset.csv")
data

Unnamed: 0,duration,gesture,variant,mean_pressure,maximum_pressure,variance,mean_contact_area
1,0.474074,grab,gentle,136.772461,439,1.346562e+07,0.309570
2,0.496296,grab,gentle,193.445196,594,1.043202e+08,0.410215
3,0.496296,grab,gentle,112.382929,515,1.193715e+07,0.233442
4,0.651852,grab,gentle,156.530717,657,1.455307e+08,0.208452
5,0.896296,grab,gentle,156.828771,786,6.985941e+08,0.309788
...,...,...,...,...,...,...,...
7801,2.896296,tickle,rough,171.512228,853,8.934892e+07,0.210358
7802,4.029630,tickle,rough,141.360754,864,3.718105e+07,0.226218
7803,3.207407,tickle,rough,162.377057,885,1.146045e+08,0.205362
7804,2.962963,tickle,rough,149.128516,890,1.321851e+08,0.176523


## Splitting into training and test set. Checking accuracy and other evaluation metrics without parameters tuning and cross-validation.

In [14]:
# dict_gesture_inverse, changing gesture values to numerical so that each number corresponds to specific gesture
dict_gesture_inverse = {value: key for key, value in dict_gesture.items()}
data = data.dropna()
data['gesture'] = data['gesture'].map(dict_gesture_inverse)
data

Unnamed: 0,duration,gesture,variant,mean_pressure,maximum_pressure,variance,mean_contact_area
1,0.474074,1,gentle,136.772461,439,1.346562e+07,0.309570
2,0.496296,1,gentle,193.445196,594,1.043202e+08,0.410215
3,0.496296,1,gentle,112.382929,515,1.193715e+07,0.233442
4,0.651852,1,gentle,156.530717,657,1.455307e+08,0.208452
5,0.896296,1,gentle,156.828771,786,6.985941e+08,0.309788
...,...,...,...,...,...,...,...
7800,3.518519,14,rough,175.249474,809,7.624782e+07,0.223158
7801,2.896296,14,rough,171.512228,853,8.934892e+07,0.210358
7802,4.029630,14,rough,141.360754,864,3.718105e+07,0.226218
7803,3.207407,14,rough,162.377057,885,1.146045e+08,0.205362


In [15]:
#Splitting data into training/test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=['gesture', 'variant']),
                                                    data['gesture'],
                                                    test_size=0.3)

In [16]:
#Importing different evaluation metrics
from sklearn.metrics import (accuracy_score, recall_score, precision_score,
                             roc_auc_score, f1_score)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer

pipe_svm = Pipeline([
    ('scal', StandardScaler()),
    ('model', SVC(decision_function_shape='ovo'))
])
pipe_svm.fit(X_train, y_train)

In [18]:
from sklearn.ensemble import RandomForestClassifier
pipe_forest = Pipeline([
    ('scal', StandardScaler()),
    ('model', RandomForestClassifier())
])
pipe_forest.fit(X_train, y_train)

Pipeline(steps=[('scal', StandardScaler()),
                ('model', RandomForestClassifier())])

In [19]:
def metrics(y_true, model):
    dict_metrics = {}
    dict_metrics['accuracy'] = accuracy_score(y_true, model.predict(X_test))
    dict_metrics['precision'] = precision_score(y_true, model.predict(X_test), average='micro')
    dict_metrics['recall'] = recall_score(y_true, model.predict(X_test), average='micro')
    return dict_metrics
metrics(y_test, pipe_forest)

{'accuracy': 0.40905209222886424,
 'precision': 0.40905209222886424,
 'recall': 0.40905209222886424}

## Adding complex preprocessing and cross-validation. Checking performance of different classifiers using Randomized Grid Search.

In [20]:
#RANDOM FOREST CLASSIFIER
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_std=False, with_mean=False))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, X_train.columns),
        ])

pipe_forest = Pipeline([('scl', preprocessor),
                        ('clf', RandomForestClassifier(n_jobs=-1, oob_score=False))
                        ])  # pipeline with all steps
param_dist_forest = {'clf__max_depth': [3, 5, 6, 10, 15],
                     'clf__n_estimators': [100, 200, 300, 400, 500],
                     'clf__max_features': ['sqrt', 'log2'],
                     'clf__min_samples_leaf': np.arange(1, 30)
                     }  # parameters for Grid
# Cross-validation choice
skf = KFold(5)
forest_randomized_pipe = RandomizedSearchCV(estimator=pipe_forest,
                                            param_distributions=param_dist_forest,
                                            cv=skf, n_iter=30, n_jobs=-1)
forest_randomized_pipe.fit(X_train, y_train)
print(forest_randomized_pipe.score(X_test, y_test))
print(f"Best parameters are: {forest_randomized_pipe.best_params_}")

0.4218616567036721
Best parameters are: {'clf__n_estimators': 100, 'clf__min_samples_leaf': 11, 'clf__max_features': 'log2', 'clf__max_depth': 15}


In [21]:
#SUPPORT VECTOR MACHINE
pipe_svm = Pipeline([
    ('scal', StandardScaler()),
    ('model', SVC(decision_function_shape='ovo'))])
param_svm1 = {'model__C': [0.001, 0.01, 0.1, 1, 10, 100],'model__gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
skf1 = KFold(5)
svm_randomized_pipe = RandomizedSearchCV(estimator=pipe_svm, param_distributions=
                                         param_svm1, cv=skf1, n_iter=30, n_jobs=-1)
svm_randomized_pipe.fit(X_train, y_train)
print(svm_randomized_pipe.score(X_test, y_test))
print(svm_randomized_pipe.best_params_)


0.43381725021349277
{'model__gamma': 0.1, 'model__C': 100}


In [26]:
#LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
pipe_log = Pipeline([
    ("scal1", StandardScaler()),
    ("model1", LogisticRegression(multi_class="ovr"))])
param_log = {"model1__C": [0.001, 0.01, 0.1, 1, 10, 100]}
skf2 = KFold(5)
log_randomized_pipe = RandomizedSearchCV(estimator=pipe_log, param_distributions=
                                        param_log, cv=skf2, n_iter=30)
log_randomized_pipe.fit(X_train, y_train)
print(log_randomized_pipe.score(X_test, y_test))
print(log_randomized_pipe.best_params_)

0.35055508112724165
{'model1__C': 100}


In [22]:
#MULTILAYER PERCEPTRON CLASSIFIER 
from sklearn.neural_network import MLPClassifier
pipe_mlp = Pipeline([("scal2", StandardScaler()),
                    ("model2", MLPClassifier(max_iter=200))])
param_mlp = {"model2__solver": ["sgd", "adam"],
              "model2__activation": ["logistic", "relu"],
              "model2__hidden_layer_sizes": [[20,20]]}
skf3 = KFold(5)
mlp_randomized_pipe = RandomizedSearchCV(estimator=pipe_mlp, param_distributions=param_mlp, cv=skf3)
mlp_randomized_pipe.fit(X_train, y_train)
print(mlp_randomized_pipe.score(X_test, y_test))
print(mlp_randomized_pipe.best_params_)

0.41588385994876176
{'model2__solver': 'adam', 'model2__hidden_layer_sizes': [20, 20], 'model2__activation': 'relu'}


## Analyze quality of predictions depending on gesture name

In [33]:
data_predictions = data.loc[X_test.index]
data_predictions['y_predict'] = forest_randomized_pipe.predict(X_test)
data_predictions['y_true'] = y_test.values
data_predictions['correct'] = np.where(data_predictions['y_true'] == data_predictions['y_predict'], 1, 0)

gesture_correcter = data_predictions.groupby(['gesture'])['correct'].mean().sort_values(ascending=False).to_frame()\
#     .join(data_predictions['gesture'].value_counts(normalize=True).to_frame("gesture_num_obs"))
gesture_correcter.index = gesture_correcter.index.map(dict_gesture)
gesture_correcter.rename(columns={"correct": "correct(RandomForestClassifier)"}, inplace=True)
gesture_correcter



Unnamed: 0_level_0,correct(RandomForestClassifier)
gesture,Unnamed: 1_level_1
grab,0.715152
massage,0.602484
hit,0.557471
poke,0.55625
slap,0.54878
tickle,0.545455
press,0.415094
pinch,0.329545
pat,0.311765
scratch,0.289773


In [34]:
print(f"Accuracy of Random Forest Classifier is: {forest_randomized_pipe.score(X_test, y_test)}")

Accuracy of Random Forest Classifier is: 0.39795046968403075


In [44]:
data_predictions1 = data.loc[X_test.index]
data_predictions1['y_predict'] = svm_randomized_pipe.predict(X_test)
data_predictions1['y_true'] = y_test.values
data_predictions1['correct'] = np.where(data_predictions1['y_true'] == data_predictions1['y_predict'], 1, 0)

gesture_correcter1 = data_predictions1.groupby(['gesture'])['correct'].mean().sort_values(ascending=False).to_frame()\
#     .join(data_predictions['gesture'].value_counts(normalize=True).to_frame("gesture_num_obs"))
gesture_correcter1.index = gesture_correcter1.index.map(dict_gesture)
gesture_correcter1.rename(columns={"correct": "correct(SVM)"}, inplace=True)
gesture_correcter1

Unnamed: 0_level_0,correct(SVM)
gesture,Unnamed: 1_level_1
grab,0.636364
massage,0.583851
poke,0.58125
slap,0.579268
tickle,0.577922
hit,0.545977
pinch,0.414773
press,0.352201
stroke,0.35
pat,0.335294


In [45]:
print(f"Accuracy of Support Vector Machine is: {forest_randomized_pipe.score(X_test, y_test)}")

Accuracy of Support Vector Machine is: 0.39795046968403075


In [46]:
data_predictions2 = data.loc[X_test.index]
data_predictions2['y_predict'] = log_randomized_pipe.predict(X_test)
data_predictions2['y_true'] = y_test.values
data_predictions2['correct'] = np.where(data_predictions2['y_true'] == data_predictions2['y_predict'], 1, 0)

gesture_correcter2 = data_predictions2.groupby(['gesture'])['correct'].mean().sort_values(ascending=False).to_frame()\
#     .join(data_predictions['gesture'].value_counts(normalize=True).to_frame("gesture_num_obs"))
gesture_correcter2.index = gesture_correcter2.index.map(dict_gesture)
gesture_correcter2.rename(columns={"correct": "correct(LOGREG)"}, inplace=True)
gesture_correcter2

Unnamed: 0_level_0,correct(LOGREG)
gesture,Unnamed: 1_level_1
grab,0.672727
hit,0.62069
tickle,0.597403
poke,0.58125
massage,0.559006
slap,0.445122
press,0.396226
pinch,0.357955
squeeze,0.19375
stroke,0.1375


In [47]:
print(f"Accuracy of Logistic Regression is: {forest_randomized_pipe.score(X_test, y_test)}")

Accuracy of Logistic Regression is: 0.39795046968403075


In [48]:
data_predictions3 = data.loc[X_test.index]
data_predictions3['y_predict'] = mlp_randomized_pipe.predict(X_test)
data_predictions3['y_true'] = y_test.values
data_predictions3['correct'] = np.where(data_predictions2['y_true'] == data_predictions3['y_predict'], 1, 0)

gesture_correcter3 = data_predictions3.groupby(['gesture'])['correct'].mean().sort_values(ascending=False).to_frame()\
#     .join(data_predictions['gesture'].value_counts(normalize=True).to_frame("gesture_num_obs"))
gesture_correcter3.index = gesture_correcter3.index.map(dict_gesture)
gesture_correcter3.rename(columns={"correct": "correct(Multilayer perceptron)"}, inplace=True)
gesture_correcter3

Unnamed: 0_level_0,correct(Multilayer perceptron)
gesture,Unnamed: 1_level_1
grab,0.690909
poke,0.59375
slap,0.585366
massage,0.583851
tickle,0.5
hit,0.482759
stroke,0.36875
press,0.36478
pinch,0.346591
pat,0.311765


In [67]:
from functools import reduce
dfs = [gesture_correcter, gesture_correcter1, gesture_correcter2, gesture_correcter3]
df_final = reduce(lambda left,right: pd.merge(left,right,on='gesture'), dfs)
df_final.columns
df_final["maximum_accuracy"] = df_final[['correct(RandomForestClassifier)', 'correct(SVM)'
                                ,'correct(LOGREG)','correct(Multilayer perceptron)']].max(axis =1)
df_final["minimum_accuracy"] = df_final[['correct(RandomForestClassifier)', 'correct(SVM)'
                                ,'correct(LOGREG)','correct(Multilayer perceptron)']].min(axis =1)
df_final


Unnamed: 0_level_0,correct(RandomForestClassifier),correct(SVM),correct(LOGREG),correct(Multilayer perceptron),maximum_accuracy,minimum_accuracy
gesture,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
grab,0.715152,0.636364,0.672727,0.690909,0.715152,0.636364
massage,0.602484,0.583851,0.559006,0.583851,0.602484,0.559006
hit,0.557471,0.545977,0.62069,0.482759,0.62069,0.482759
poke,0.55625,0.58125,0.58125,0.59375,0.59375,0.55625
slap,0.54878,0.579268,0.445122,0.585366,0.585366,0.445122
tickle,0.545455,0.577922,0.597403,0.5,0.597403,0.5
press,0.415094,0.352201,0.396226,0.36478,0.415094,0.352201
pinch,0.329545,0.414773,0.357955,0.346591,0.414773,0.329545
pat,0.311765,0.335294,0.1,0.311765,0.335294,0.1
scratch,0.289773,0.272727,0.085227,0.227273,0.289773,0.085227


In [79]:
df_max = df_final[["maximum_accuracy"]]
df_max["algorithm"] = ["rfc", "rfc", "logreg", "mlp", "mlp", "logreg", "rfc", "svm", "svm", "rfc", "mlp", "smv", "mlp", "svm"]
df_min = df_final[["minimum_accuracy"]]
df_min["algorithm"] = ["svm", "logreg", "mlp", "rfc", "logreg", "mlp", "svm", "rfc", "logreg", "logreg", "logreg", 
                       "logreg", "logreg", "rfc"]



Unnamed: 0_level_0,maximum_accuracy,algorithm
gesture,Unnamed: 1_level_1,Unnamed: 2_level_1
grab,0.715152,rfc
massage,0.602484,rfc
hit,0.62069,logreg
poke,0.59375,mlp
slap,0.585366,mlp
tickle,0.597403,logreg
press,0.415094,rfc
pinch,0.414773,svm
pat,0.335294,svm
scratch,0.289773,rfc


In [81]:
df_min.to_csv("/Users/nikitaksnv/Desktop/dfmin.csv")
df_max.to_csv("/Users/nikitaksnv/Desktop/dfmax.csv")