# 1 - Data collection and combination

In [1]:
import pandas as pd

# Source recordings; the position in this list doubles as the class label
# (index 0 -> class 0, index 1 -> class 1).
file_paths = ['data/boning.csv', 'data/slicing.csv']

# Only the x/y/z readings of both hands and the frame counter are loaded.
cols_to_read = [
    f'{hand} Hand {axis}'
    for hand in ('Right', 'Left')
    for axis in ('x', 'y', 'z')
] + ['Frame']

# Read each recording and tag every row with its class label.
boning_df, slicing_df = (
    pd.read_csv(path, usecols=cols_to_read).assign(**{'class': label})
    for label, path in enumerate(file_paths)
)

# Stack both recordings under a fresh 0..n-1 index and persist the result.
frames_to_stack = [boning_df, slicing_df]
df = pd.concat(frames_to_stack, ignore_index=True)
df.to_csv('data/combined_data.csv', index=False)

slicing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Frame         17880 non-null  int64  
 1   Right Hand x  17880 non-null  float64
 2   Right Hand y  17880 non-null  float64
 3   Right Hand z  17880 non-null  float64
 4   Left Hand x   17880 non-null  float64
 5   Left Hand y   17880 non-null  float64
 6   Left Hand z   17880 non-null  float64
 7   class         17880 non-null  int64  
dtypes: float64(6), int64(2)
memory usage: 1.1 MB


# 2 - Create composite columns

## Column set 1 - Right hand

In [2]:
import numpy as np

# ------- Column set 1 - Right hand -------

# Root mean square over each combination of right-hand axes.
# Keys are the new column names, values the source columns that are squared and averaged.
right_rms_sources = {
    'rmsq_right_xy': ['Right Hand x', 'Right Hand y'],
    'rmsq_right_yz': ['Right Hand y', 'Right Hand z'],
    'rmsq_right_xz': ['Right Hand z', 'Right Hand x'],
    'rmsq_right_xyz': ['Right Hand x', 'Right Hand y', 'Right Hand z'],
}
for rms_name, source_cols in right_rms_sources.items():
    squared = df[source_cols] ** 2
    df[rms_name] = np.sqrt(np.mean(squared, axis=1))

# Right hand roll and pitch, converted from radians to degrees.
right_x, right_y, right_z = (df[f'Right Hand {axis}'] for axis in ('x', 'y', 'z'))

roll_rad = np.arctan2(right_y, np.sqrt(right_x ** 2 + right_z ** 2))
df['right_hand_roll'] = 180 * roll_rad / np.pi

pitch_rad = np.arctan2(right_x, np.sqrt(right_y ** 2 + right_z ** 2))
df['right_hand_pitch'] = 180 * pitch_rad / np.pi

## Column set 2 - Left hand

In [3]:
# ------- Column set 2 - Left hand -------

# Root mean square over each combination of left-hand axes.
# Keys are the new column names, values the source columns that are squared and averaged.
left_rms_sources = {
    'rmsq_left_xy': ['Left Hand x', 'Left Hand y'],
    'rmsq_left_yz': ['Left Hand y', 'Left Hand z'],
    'rmsq_left_xz': ['Left Hand z', 'Left Hand x'],
    'rmsq_left_xyz': ['Left Hand x', 'Left Hand y', 'Left Hand z'],
}
for rms_name, source_cols in left_rms_sources.items():
    squared = df[source_cols] ** 2
    df[rms_name] = np.sqrt(np.mean(squared, axis=1))

# Left hand roll and pitch, converted from radians to degrees.
left_x, left_y, left_z = (df[f'Left Hand {axis}'] for axis in ('x', 'y', 'z'))

roll_rad = np.arctan2(left_y, np.sqrt(left_x ** 2 + left_z ** 2))
df['left_hand_roll'] = 180 * roll_rad / np.pi

pitch_rad = np.arctan2(left_x, np.sqrt(left_y ** 2 + left_z ** 2))
df['left_hand_pitch'] = 180 * pitch_rad / np.pi

In [4]:
# Column dtypes and non-null counts of the frame after the composite columns were added
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72060 entries, 0 to 72059
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Frame             72060 non-null  int64  
 1   Right Hand x      72060 non-null  float64
 2   Right Hand y      72060 non-null  float64
 3   Right Hand z      72060 non-null  float64
 4   Left Hand x       72060 non-null  float64
 5   Left Hand y       72060 non-null  float64
 6   Left Hand z       72060 non-null  float64
 7   class             72060 non-null  int64  
 8   rmsq_right_xy     72060 non-null  float64
 9   rmsq_right_yz     72060 non-null  float64
 10  rmsq_right_xz     72060 non-null  float64
 11  rmsq_right_xyz    72060 non-null  float64
 12  right_hand_roll   72060 non-null  float64
 13  right_hand_pitch  72060 non-null  float64
 14  rmsq_left_xy      72060 non-null  float64
 15  rmsq_left_yz      72060 non-null  float64
 16  rmsq_left_xz      72060 non-null  float6

In [5]:
# Preview the first rows, including the composite columns
df.head()

Unnamed: 0,Frame,Right Hand x,Right Hand y,Right Hand z,Left Hand x,Left Hand y,Left Hand z,class,rmsq_right_xy,rmsq_right_yz,rmsq_right_xz,rmsq_right_xyz,right_hand_roll,right_hand_pitch,rmsq_left_xy,rmsq_left_yz,rmsq_left_xz,rmsq_left_xyz,left_hand_roll,left_hand_pitch
0,0,0.311465,-0.329472,0.750763,0.906499,-0.024053,0.779686,0,0.320595,0.57974,0.574742,0.506362,-22.065225,20.801413,0.641217,0.551584,0.845473,0.690466,-1.152438,49.287457
1,1,0.563723,-0.088187,1.033415,0.917992,0.172597,0.860215,0,0.40346,0.733391,0.832385,0.681544,-4.284295,28.524936,0.660492,0.620387,0.889573,0.733137,7.811881,46.296499
2,2,0.474087,-0.922834,0.802289,0.813233,0.054823,0.709743,0,0.733615,0.864665,0.658948,0.757199,-44.720146,21.191281,0.576348,0.503359,0.763244,0.62399,2.907617,48.80299
3,3,0.690891,-1.622115,0.393867,0.557506,0.006721,0.910102,0,1.246713,1.180336,0.562344,1.043027,-63.882655,22.484355,0.394245,0.643557,0.754685,0.61621,0.360782,31.489924
4,4,0.179927,-1.985673,1.390812,0.409958,-0.113903,0.978904,0,1.409835,1.714243,0.991648,1.403523,-54.767962,4.244606,0.300865,0.69686,0.750439,0.61625,-6.125888,22.586656


# 3 - Data pre-processing and feature computation  

In [6]:
import pandas as pd
import numpy as np

from scipy.signal import find_peaks
from scipy import integrate 

def calculate_auc(y):
    """Return the area under the curve traced by the samples in ``y``.

    The samples are integrated with the composite trapezoidal rule, using the
    integrator's default spacing between consecutive samples.

    Args:
        y: Sequence of sample values (list, NumPy array or pandas Series).

    Returns:
        The trapezoidal-rule integral of ``y``; 0.0 for a single sample.
    """
    # `np.trapz` is deprecated since NumPy 2.0; SciPy's `trapezoid` applies the same rule.
    return integrate.trapezoid(y)

def calculate_peaks(y):
    """Count the peaks that ``scipy.signal.find_peaks`` detects in ``y``."""
    # find_peaks returns (indices, properties); only the indices are needed here.
    peak_indices = find_peaks(y)[0]
    return peak_indices.size

# Number of consecutive frames aggregated into one feature row.
frames_per_minutes = 60

# Every column except the frame counter and the label is summarised.
feature_columns = [column for column in df.columns if column not in ('Frame', 'class')]

# One dict per window; the DataFrame is built from them in a single call,
# which avoids the fragmentation caused by inserting columns one at a time.
window_rows = []

# Windows are cut inside each class separately. Cutting the concatenated frame
# directly lets a window straddle the boundary between two recordings whenever
# a recording's length is not a multiple of the window size, mixing both
# classes under the label of the window's first frame.
# Assumes each class value is one contiguous recording, as produced by the
# concatenation in section 1.
for class_label, class_frames in df.groupby('class', sort=False):
    # Integer division: a trailing partial window is dropped.
    windows_in_class = len(class_frames) // frames_per_minutes

    for i in range(windows_in_class):
        start = i * frames_per_minutes
        end = (i + 1) * frames_per_minutes
        window = class_frames.iloc[start:end]

        row = {}
        for column in feature_columns:
            values = window[column]  # slice once, reuse for every statistic

            # Mean, max, min and standard deviation of the window
            row[f'{column}_mean'] = np.mean(values)
            row[f'{column}_max'] = np.max(values)
            row[f'{column}_min'] = np.min(values)
            row[f'{column}_std'] = np.std(values)

            # Area under the curve of the window
            row[f'{column}_auc'] = calculate_auc(values)

            # Number of peaks in the window
            row[f'{column}_peak'] = calculate_peaks(values)

        row['class'] = class_label
        window_rows.append(row)

# Column order: six statistics per source column, then 'class', then 'Minute'.
new_features_df = pd.DataFrame(window_rows)

# Windows are numbered 1..N in the order they were cut.
number_of_minutes = len(new_features_df)
new_features_df["Minute"] = range(1, number_of_minutes + 1)

In [7]:
# Persist the per-window feature table (later sections reload it from this file),
# then preview its first rows.
new_features_df.to_csv('data/new_features_per_min.csv', index=False)
new_features_df.head()

Unnamed: 0,Right Hand x_mean,Right Hand x_max,Right Hand x_min,Right Hand x_std,Right Hand x_auc,Right Hand x_peak,Right Hand y_mean,Right Hand y_max,Right Hand y_min,Right Hand y_std,...,left_hand_roll_auc,left_hand_roll_peak,left_hand_pitch_mean,left_hand_pitch_max,left_hand_pitch_min,left_hand_pitch_std,left_hand_pitch_auc,left_hand_pitch_peak,class,Minute
0,-0.311031,0.975918,-2.043227,0.669141,-18.759286,11,0.259466,2.644213,-2.158896,1.22522,...,699.334334,10,8.087924,49.287457,-71.899155,30.851966,477.251,12,0,1
1,0.075778,1.635237,-3.111945,0.788082,5.078385,10,-0.002281,2.655147,-2.613755,0.939123,...,-330.119286,10,11.164972,81.295218,-75.865334,40.387145,686.671066,13,0,2
2,-0.11998,5.057009,-3.617915,1.27307,-9.4521,12,0.308408,21.786469,-8.042419,4.464981,...,701.970631,12,-0.198978,81.114644,-84.154037,53.161508,22.806267,13,0,3
3,0.518796,8.306838,-6.908622,3.852492,29.792931,11,-0.840938,5.99083,-7.600437,3.389112,...,200.253636,11,-9.874386,86.032372,-87.75114,52.511539,-544.757462,11,0,4
4,0.017641,19.586288,-9.171689,6.453003,4.14759,13,0.201503,15.643516,-46.409132,10.438612,...,-422.739921,10,1.097838,82.048396,-70.864485,42.093406,100.589707,10,0,5


# 4 - Training  

## 4.1 - Train-test split training

### Data splitting

In [8]:
import pandas as pd

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Reload the per-window feature table written earlier in the notebook.
df = pd.read_csv('data/new_features_per_min.csv')

# The label is 'class'; the features are everything except the label and the window counter.
y = df['class']
X = df.drop(columns=['class', 'Minute'])

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts


### Train model

In [14]:
# Linear-kernel support vector classifier fitted on the training split.
clf = svm.SVC(kernel='linear').fit(X_train, y_train)

# Share of held-out rows whose class is predicted correctly.
y_pred = clf.predict(X_test)
result = accuracy_score(y_true=y_test, y_pred=y_pred)

In [12]:
# `result` is the accuracy on the held-out split computed in the cell above.
# The format spec must be `.2f` (two decimals); `2f` only sets a minimum width
# and prints six decimals.
f"Training accuracy: {result*100:.2f}%"

'Training accuracy: 77.008310%'

## 4.2 - 10-fold Cross-validation 

In [13]:
from sklearn import svm
from sklearn.model_selection import cross_val_score

# SVC with default settings, scored by 10-fold cross-validation over every row.
clf = svm.SVC()
scores = cross_val_score(estimator=clf, X=X, y=y, cv=10)

# Average of the per-fold scores.
result = scores.mean()

f"Cross-validation mean accuracy: {result * 100:.2f}%"

'Cross-validation mean accuracy: 75.69%'

## 4.3 - Train-test split and cross validation with Hyperparameter tuning


### 4.3.1 - Use GridSearchCV to find the best parameters for the SVM model

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Search space: every combination of the values below is one candidate
# (5 * 5 * 2 = 50 candidates).
# NOTE(review): gamma is crossed with kernel='linear' as well; if the linear
# kernel ignores gamma, those fits are duplicates - confirm.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf', 'linear'],
)

# verbose=3 logs the score of every individual fit.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, refit=True, verbose=3)

# Run the search on the training split only.
grid.fit(X_train, y_train)

# Best parameter combination found by the search
grid.best_params_

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.744 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.744 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.744 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.744 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.738 total time=   0.0s


### 4.3.2 - Train-test split model training

In [48]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Pull the winning hyperparameters out of the fitted grid search.
best_params = grid.best_params_
best_C = best_params['C']
best_gamma = best_params['gamma']
best_kernel = best_params['kernel']

# Build the model from ALL tuned parameters. The kernel was previously
# hard-coded to 'rbf', which silently discarded the searched kernel.
clf_hyp = SVC(C=best_C, gamma=best_gamma, kernel=best_kernel)

# Fit on the training split and score on the held-out split.
clf_hyp.fit(X_train, y_train)

y_pred_hyp = clf_hyp.predict(X_test)
result = accuracy_score(y_test, y_pred_hyp)

f'Train-test split training accuracy with hyperparameter tuning : {result * 100:.2f}%'

'Train-test split training accuracy with hyperparameter tuning : 77.29%'

### 4.3.3 - Cross validation accuracy score

In [38]:
from sklearn.model_selection import cross_val_score

# Score the tuned SVC with 10-fold cross-validation over every row.
scores = cross_val_score(estimator=clf_hyp, X=X, y=y, cv=10)

# Average of the per-fold scores.
result = scores.mean()

f'Cross-validation training mean accuracy with hyperparameter tuning : {result * 100:.2f}%'

'Cross-validation training mean accuracy with hyperparameter tuning : 75.19%'

## 4.4 - Train-test split and cross validation with Hyperparameter tuning + Feature selection


### 4.4.1 - Select features and split data based on selected features

In [50]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

df = pd.read_csv('data/new_features_per_min.csv')

X = df.drop(['class', 'Minute'], axis=1)
y = df['class']

# Keeps the 10 features with the highest ANOVA F-score against the label.
selector = SelectKBest(f_classif, k=10)

# Selection fitted on ALL rows. Kept so later cells that reference X_selected
# still run, but it has seen the labels of every row, so it must not feed a
# held-out evaluation.
X_selected = selector.fit_transform(X, y)

# Split the RAW features. Splitting the already-selected matrix would leak the
# test labels into the selection and leave the selector in the downstream
# pipeline nothing to choose from (10 of 10 columns).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### 4.4.2 - Train-test split model training

In [51]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# SVC configured with every hyperparameter chosen by the grid search.
clf = SVC(C=best_C, gamma=best_gamma, kernel=best_kernel)

# Feature selection and classifier are fitted together, so the selector is
# re-fitted on whatever training data the pipeline receives.
# The pipeline previously wrapped `clf_hyp`, leaving the `clf` built above unused.
pipeline = Pipeline([
    ('feature_selection', selector),
    ('svc', clf)
])

pipeline.fit(X_train, y_train)

# Accuracy on the held-out split.
y_pred = pipeline.predict(X_test)
result = accuracy_score(y_test, y_pred)

f'Train-test split accuracy with selected features : {result * 100:.2f}%'

'Train-test split accuracy with selected features : 77.29%'

### 4.4.3 - Cross validation accuracy score

In [42]:
from sklearn.model_selection import cross_val_score

# Cross-validate the whole pipeline on the raw features so the feature
# selection is re-fitted inside every fold. Scoring a classifier on
# `X_selected` would reuse a selection that already saw each fold's
# validation labels (data leakage).
scores = cross_val_score(pipeline, X, y, cv=10)

# Average of the per-fold scores.
result = scores.mean()

f'Cross-validation mean accuracy with selected features : {result * 100:.2f}%'

'Cross-validation mean accuracy with selected features : 75.19%'

## 4.5 - Train-test split and cross validation with Hyperparameter tuning + Dimensionality reduction


### 4.5.1 - Perform PCA to reduce dimensionality to the first 10 principal components

In [43]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

pca = PCA(n_components=10)

# Split the raw features first so the held-out rows play no part in fitting the projection.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Learn the components from the training split only, then apply them to both splits.
X_train = pca.fit_transform(X_train_raw)
X_test = pca.transform(X_test_raw)

# Projection of every row, kept for later cells that reference X_pca.
X_pca = pca.transform(X)

### 4.5.2 - Train-test split model training

In [45]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# SVC configured with the hyperparameters chosen by the grid search,
# fitted on the current training split.
clf = SVC(kernel=best_kernel, gamma=best_gamma, C=best_C).fit(X_train, y_train)

# Share of held-out rows whose class is predicted correctly.
y_pred = clf.predict(X_test)
result = accuracy_score(y_true=y_test, y_pred=y_pred)

f'Train-test split accuracy with PCA : {result * 100:.2f}%'

'Train-test split accuracy with PCA : 77.29%'

### 4.5.3 - Cross validation accuracy score

In [46]:
from sklearn.model_selection import cross_val_score

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Re-fit PCA inside every fold so the projection never sees that fold's
# validation rows. Scoring on a matrix projected up front lets those rows
# shape the components.
pca_svc = Pipeline([
    ('pca', PCA(n_components=pca.n_components)),
    ('svc', clf),
])

scores = cross_val_score(pca_svc, X, y, cv=10)

# Average of the per-fold scores.
result = scores.mean()

f'Cross-validation mean accuracy with PCA : {result * 100:.2f}%'

'Cross-validation mean accuracy with PCA : 75.19%'

# 5 - Other models

## Import libraries and load data

In [53]:
import pandas as pd
import numpy as np

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

# Reload the per-window feature table so this section starts from the raw features.
df = pd.read_csv('data/new_features_per_min.csv')

# The label is 'class'; the features are everything except the label and the window counter.
y = df['class']
X = df.drop(columns=['class', 'Minute'])

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## 5.1 - SGD Classifier

In [54]:
# SGD classifier with a fixed seed, fitted on the training split.
sgd_model = SGDClassifier(random_state=42).fit(X_train, y_train)

# Share of held-out rows whose class is predicted correctly.
y_pred = sgd_model.predict(X_test)
result = accuracy_score(y_true=y_test, y_pred=y_pred)

f'SGDClassifier train-test split accuracy : {result * 100:.2f}%'

'SGDClassifier train-test split accuracy : 73.96%'

In [55]:
# 10-fold cross-validation of the SGD classifier over every row.
scores = cross_val_score(estimator=sgd_model, X=X, y=y, cv=10)

# Average of the per-fold scores.
result = np.mean(scores)

f'SGDClassifier cross-validation mean accuracy : {result * 100:.2f}%'

'SGDClassifier cross-validation mean accuracy : 74.02%'

## 5.2 - Random Forest Classifier

In [56]:
# Random forest with a fixed seed, fitted on the training split.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Share of held-out rows whose class is predicted correctly.
y_pred = rf_model.predict(X_test)
result = accuracy_score(y_true=y_test, y_pred=y_pred)

f'RandomForestClassifier train-test split accuracy : {result * 100:.2f}%'

'RandomForestClassifier train-test split accuracy : 84.76%'

In [57]:
# 10-fold cross-validation of the random forest over every row.
scores = cross_val_score(estimator=rf_model, X=X, y=y, cv=10)

# Average of the per-fold scores.
result = np.mean(scores)

f'RandomForestClassifier cross-validation mean accuracy : {result * 100:.2f}%'

'RandomForestClassifier cross-validation mean accuracy : 84.26%'

## 5.3 - MLP Classifier

In [58]:
# Multi-layer perceptron with a fixed seed, fitted on the training split.
mlp_model = MLPClassifier(random_state=42).fit(X_train, y_train)

# Share of held-out rows whose class is predicted correctly.
y_pred = mlp_model.predict(X_test)
result = accuracy_score(y_true=y_test, y_pred=y_pred)

f'MLPClassifier train-test split accuracy : {result * 100:.2f}%'

'MLPClassifier train-test split accuracy : 77.01%'

In [59]:
# 10-fold cross-validation of the multi-layer perceptron over every row.
scores = cross_val_score(estimator=mlp_model, X=X, y=y, cv=10)

# Average of the per-fold scores.
result = np.mean(scores)

f'MLPClassifier cross-validation mean accuracy : {result * 100:.2f}%'



'MLPClassifier cross-validation mean accuracy : 75.27%'