In [50]:
import pandas as pd

# File paths and columns to be loaded
csv_files = ['ampc2/Boning.csv', 'ampc2/Slicing.csv']
selected_columns = [f'{part} {coord}' for part in ['Neck', 'Head'] for coord in ['x', 'y', 'z']] + ['Frame']

# Load and label data
boning_data = pd.read_csv(csv_files[0], usecols=selected_columns).assign(label=0)
slicing_data = pd.read_csv(csv_files[1], usecols=selected_columns).assign(label=1)

# Combine the datasets and save to CSV
combined_data = pd.concat([boning_data, slicing_data], ignore_index=True)
combined_data.to_csv('combined_data.csv', index=False)

# Display first few rows
combined_data.head()

Unnamed: 0,Frame,Neck x,Neck y,Neck z,Head x,Head y,Head z,label
0,0,0.207796,0.127939,-0.17513,0.376399,0.202993,-0.182585,0
1,1,-0.006589,0.356974,0.286768,0.204439,0.521502,0.198235,0
2,2,0.112606,0.043502,0.104975,0.021196,0.19739,0.165812,0
3,3,-0.031866,0.037024,0.131005,-0.157759,0.118886,0.201893,0
4,4,0.135369,0.019024,0.11565,0.011714,0.096737,0.107186,0


# 2 - Create composite columns

Column set 1 - Neck

In [51]:
import numpy as np

# Root mean square of x and y
combined_data['rmsq_neck_xy'] = np.sqrt(np.mean(combined_data[['Neck x', 'Neck y']] ** 2, axis=1))

# Root mean square of y and z
combined_data['rmsq_neck_yz'] = np.sqrt(np.mean(combined_data[['Neck y', 'Neck z']] ** 2, axis=1))

# Root mean square of x and z
combined_data['rmsq_neck_xz'] = np.sqrt(np.mean(combined_data[['Neck z', 'Neck x']] ** 2, axis=1))

# Root mean square of x, y, and z
combined_data['rmsq_neck_xyz'] = np.sqrt(np.mean(combined_data[['Neck x', 'Neck y', 'Neck z']] ** 2, axis=1))

# Neck roll and pitch value
combined_data['neck_roll'] = 180 * np.arctan2(combined_data['Neck y'], np.sqrt(combined_data['Neck x'] ** 2 + combined_data['Neck z'] ** 2)) / np.pi

combined_data['neck_pitch'] = 180 * np.arctan2(combined_data['Neck x'], np.sqrt(combined_data['Neck y'] ** 2 + combined_data['Neck z'] ** 2)) / np.pi

Column set 2 - Head

In [52]:
# Root mean square of x and y
combined_data['rmsq_head_xy'] = np.sqrt(np.mean(combined_data[['Head x', 'Head y']] ** 2, axis=1))

# Root mean square of y and z
combined_data['rmsq_head_yz'] = np.sqrt(np.mean(combined_data[['Head y', 'Head z']] ** 2, axis=1))

# Root mean square of x and z
combined_data['rmsq_head_xz'] = np.sqrt(np.mean(combined_data[['Head z', 'Head x']] ** 2, axis=1))

# Root mean square of x, y, and z
combined_data['rmsq_head_xyz'] = np.sqrt(np.mean(combined_data[['Head x', 'Head y', 'Head z']] ** 2, axis=1))

# Head roll and pitch value
combined_data['head_roll'] = 180 * np.arctan2(combined_data['Head y'], np.sqrt(combined_data['Head x'] ** 2 + combined_data['Head z'] ** 2)) / np.pi

combined_data['head_pitch'] = 180 * np.arctan2(combined_data['Head x'], np.sqrt(combined_data['Head y'] ** 2 + combined_data['Head z'] ** 2)) / np.pi

In [53]:
combined_data

Unnamed: 0,Frame,Neck x,Neck y,Neck z,Head x,Head y,Head z,label,rmsq_neck_xy,rmsq_neck_yz,rmsq_neck_xz,rmsq_neck_xyz,neck_roll,neck_pitch,rmsq_head_xy,rmsq_head_yz,rmsq_head_xz,rmsq_head_xyz,head_roll,head_pitch
0,0,0.207796,0.127939,-0.175130,0.376399,0.202993,-0.182585,0,0.172551,0.153360,0.192158,0.173415,25.210615,43.774063,0.302393,0.193059,0.295816,0.268465,25.883962,54.044176
1,1,-0.006589,0.356974,0.286768,0.204439,0.521502,0.198235,0,0.252462,0.323780,0.202829,0.264392,51.216647,-0.824380,0.396081,0.394501,0.201361,0.343053,61.363191,20.124758
2,2,0.112606,0.043502,0.104975,0.021196,0.197390,0.165812,0,0.085360,0.080349,0.108857,0.092362,15.778909,44.740429,0.140379,0.182286,0.118201,0.149338,49.740164,4.700431
3,3,-0.031866,0.037024,0.131005,-0.157759,0.118886,0.201893,0,0.034541,0.096263,0.095335,0.080723,15.355297,-13.174411,0.139681,0.165672,0.181175,0.163077,24.891210,-33.953692
4,4,0.135369,0.019024,0.115650,0.011714,0.096737,0.107186,0,0.096661,0.082876,0.125896,0.103379,6.098993,49.113563,0.068903,0.102095,0.076243,0.083634,41.897439,4.638284
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72055,17875,-0.437480,-0.084730,0.148065,-0.069184,-0.255526,0.166269,1,0.315094,0.120628,0.326582,0.271103,-10.395619,-68.696856,0.187190,0.215568,0.127341,0.180486,-54.824814,-12.785975
72056,17876,-0.557902,-0.231495,0.425979,-0.672785,-0.735978,0.756098,1,0.427109,0.342818,0.496343,0.426733,-18.252333,-49.009317,0.705090,0.746106,0.715655,0.722493,-36.024157,-32.522342
72057,17877,-0.696096,-0.620162,0.365172,-1.484151,-0.951434,0.884484,1,0.659224,0.508897,0.555833,0.578072,-38.271464,-44.045316,1.246581,0.918569,1.221682,1.138748,-28.841039,-48.804871
72058,17878,-0.653691,-0.568779,0.072045,-1.560290,-0.535261,0.274542,1,0.612708,0.405401,0.465028,0.502000,-40.855447,-48.747421,1.166407,0.425369,1.120241,0.965468,-18.668157,-68.916081


In [54]:
# Move the 'label' column to the last position
combined_data = combined_data[[col for col in combined_data.columns if col != 'label'] + ['label']]

combined_data.head()

Unnamed: 0,Frame,Neck x,Neck y,Neck z,Head x,Head y,Head z,rmsq_neck_xy,rmsq_neck_yz,rmsq_neck_xz,rmsq_neck_xyz,neck_roll,neck_pitch,rmsq_head_xy,rmsq_head_yz,rmsq_head_xz,rmsq_head_xyz,head_roll,head_pitch,label
0,0,0.207796,0.127939,-0.17513,0.376399,0.202993,-0.182585,0.172551,0.15336,0.192158,0.173415,25.210615,43.774063,0.302393,0.193059,0.295816,0.268465,25.883962,54.044176,0
1,1,-0.006589,0.356974,0.286768,0.204439,0.521502,0.198235,0.252462,0.32378,0.202829,0.264392,51.216647,-0.82438,0.396081,0.394501,0.201361,0.343053,61.363191,20.124758,0
2,2,0.112606,0.043502,0.104975,0.021196,0.19739,0.165812,0.08536,0.080349,0.108857,0.092362,15.778909,44.740429,0.140379,0.182286,0.118201,0.149338,49.740164,4.700431,0
3,3,-0.031866,0.037024,0.131005,-0.157759,0.118886,0.201893,0.034541,0.096263,0.095335,0.080723,15.355297,-13.174411,0.139681,0.165672,0.181175,0.163077,24.89121,-33.953692,0
4,4,0.135369,0.019024,0.11565,0.011714,0.096737,0.107186,0.096661,0.082876,0.125896,0.103379,6.098993,49.113563,0.068903,0.102095,0.076243,0.083634,41.897439,4.638284,0


# 3 - Data pre-processing and Feature computation

In [55]:
import pandas as pd
import numpy as np
from scipy.signal import find_peaks

# Function to calculate area under the curve
def compute_auc(values):
    return np.trapz(values)

# Function to calculate the number of peaks
def compute_peaks(values):
    peaks, _ = find_peaks(values)
    return len(peaks)

frames_per_minute = 60
total_minutes = len(combined_data) // frames_per_minute

# Dictionary to store new feature columns
feature_columns = {}

# Loop through each column in the DataFrame
for col in combined_data.columns:
    if col not in ['Frame', 'label']:
        # Lists to hold feature values for each minute chunk
        mean_vals = []
        max_vals = []
        min_vals = []
        std_vals = []
        auc_vals = []
        peak_vals = []

        # Iterate over each minute
        for minute in range(total_minutes):
            start_idx = minute * frames_per_minute
            end_idx = (minute + 1) * frames_per_minute

            # Extract data for the current minute chunk
            chunk = combined_data[col][start_idx:end_idx]

            # Compute statistical features
            mean_vals.append(np.mean(chunk))
            max_vals.append(np.max(chunk))
            min_vals.append(np.min(chunk))
            std_vals.append(np.std(chunk))

            # Compute AUC and peaks for the chunk
            auc_vals.append(compute_auc(chunk))
            peak_vals.append(compute_peaks(chunk))

        # Add new features to the dictionary
        feature_columns[f'{col}_mean'] = mean_vals
        feature_columns[f'{col}_max'] = max_vals
        feature_columns[f'{col}_min'] = min_vals
        feature_columns[f'{col}_std'] = std_vals
        feature_columns[f'{col}_auc'] = auc_vals
        feature_columns[f'{col}_peak'] = peak_vals

# Convert the new features dictionary into a DataFrame
new_feature_data = pd.DataFrame(feature_columns)

# Add the 'class' column and 'Minute' column
new_feature_data['label'] = combined_data['label'][::frames_per_minute].reset_index(drop=True)
new_feature_data['Minute'] = range(1, total_minutes + 1)


In [56]:
new_feature_data

Unnamed: 0,Neck x_mean,Neck x_max,Neck x_min,Neck x_std,Neck x_auc,Neck x_peak,Neck y_mean,Neck y_max,Neck y_min,Neck y_std,...,head_roll_auc,head_roll_peak,head_pitch_mean,head_pitch_max,head_pitch_min,head_pitch_std,head_pitch_auc,head_pitch_peak,label,Minute
0,0.058447,0.653929,-0.595560,0.256127,3.318419,16,-0.005770,0.560261,-0.613843,0.268144,...,689.293930,13,14.586659,80.848958,-63.205906,36.779820,841.516871,14,0,1
1,-0.096646,2.575807,-1.595246,0.682678,-5.568238,16,0.009024,0.914943,-1.113462,0.401579,...,-78.316905,14,-14.411278,72.177073,-82.903717,41.752705,-882.430223,11,0,2
2,-0.028674,2.001616,-2.423520,1.069323,-1.552828,13,0.091343,1.689948,-1.483868,0.746604,...,400.618717,15,-12.058287,79.714435,-85.010275,47.965019,-737.609847,13,0,3
3,0.165025,2.492493,-2.141650,1.006900,9.860101,14,-0.277588,2.195484,-2.639818,1.091049,...,-151.323287,12,9.807505,85.879061,-67.321659,35.587808,597.547862,16,0,4
4,-0.146506,3.637092,-3.680950,1.338005,-7.756284,16,0.076010,5.590761,-4.399867,1.600203,...,63.030669,12,-10.568928,64.139626,-81.704511,42.283748,-573.552212,11,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1196,-0.012817,0.986665,-1.150080,0.460413,-0.552951,11,-0.082524,1.051489,-1.506590,0.572480,...,-439.768695,14,2.469290,83.066818,-80.860638,42.759818,161.311246,13,1,1197
1197,0.058315,0.931917,-0.713570,0.372504,3.841962,11,0.020152,1.097732,-1.065647,0.521761,...,135.719639,13,11.033433,82.781289,-76.724555,40.289374,686.887004,11,1,1198
1198,-0.085338,0.671208,-1.101927,0.359819,-5.145323,14,0.032627,1.670912,-0.911860,0.507928,...,331.105990,13,-8.977715,82.522075,-74.524517,33.946343,-524.173460,14,1,1199
1199,0.046348,2.194297,-1.384220,0.704044,2.384813,12,0.247009,4.532293,-3.360391,1.201012,...,795.360876,12,5.707723,85.639854,-77.079188,42.756966,288.355749,13,1,1200


In [57]:
new_feature_data.to_csv('new_features_per_min.csv', index=False)
new_feature_data.head()

Unnamed: 0,Neck x_mean,Neck x_max,Neck x_min,Neck x_std,Neck x_auc,Neck x_peak,Neck y_mean,Neck y_max,Neck y_min,Neck y_std,...,head_roll_auc,head_roll_peak,head_pitch_mean,head_pitch_max,head_pitch_min,head_pitch_std,head_pitch_auc,head_pitch_peak,label,Minute
0,0.058447,0.653929,-0.59556,0.256127,3.318419,16,-0.00577,0.560261,-0.613843,0.268144,...,689.29393,13,14.586659,80.848958,-63.205906,36.77982,841.516871,14,0,1
1,-0.096646,2.575807,-1.595246,0.682678,-5.568238,16,0.009024,0.914943,-1.113462,0.401579,...,-78.316905,14,-14.411278,72.177073,-82.903717,41.752705,-882.430223,11,0,2
2,-0.028674,2.001616,-2.42352,1.069323,-1.552828,13,0.091343,1.689948,-1.483868,0.746604,...,400.618717,15,-12.058287,79.714435,-85.010275,47.965019,-737.609847,13,0,3
3,0.165025,2.492493,-2.14165,1.0069,9.860101,14,-0.277588,2.195484,-2.639818,1.091049,...,-151.323287,12,9.807505,85.879061,-67.321659,35.587808,597.547862,16,0,4
4,-0.146506,3.637092,-3.68095,1.338005,-7.756284,16,0.07601,5.590761,-4.399867,1.600203,...,63.030669,12,-10.568928,64.139626,-81.704511,42.283748,-573.552212,11,0,5


# 4 - Training

In [58]:
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset
feature_data = pd.read_csv('new_features_per_min.csv')

# Split features and labels
X_features = feature_data.drop(['label', 'Minute'], axis=1)
y_labels = feature_data['label']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels, test_size=0.3, random_state=42)


In [59]:
clf = svm.SVC()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
result = accuracy_score(y_test, y_pred)

In [60]:
ori_acc = result
f"Test accuracy: {result*100:2f}%"

'Test accuracy: 77.562327%'

In [61]:
from sklearn.model_selection import cross_val_score
from sklearn import svm
clf = svm.SVC()
scores = cross_val_score(clf, X_features, y_labels, cv=10)
# print (max(scores))

In [62]:
ori_cv_acc = np.mean(scores)

In [63]:
from sklearn.model_selection import GridSearchCV

# Update SVC to use RBF kernel
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1]
}
grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=10)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Update SVM model with optimal parameters
clf = svm.SVC(kernel='rbf', C=best_params['C'], gamma=best_params['gamma'])
clf.fit(X_train, y_train)

# Train-test split accuracy
y_pred = clf.predict(X_test)
accuracy_split = accuracy_score(y_test, y_pred)
print("Train-test split accuracy:", accuracy_split)

# 10-fold cross-validation accuracy
scores = cross_val_score(clf, X_features, y_labels, cv=10)
print("10-fold cross-validation scores:", scores)

Best parameters: {'C': 0.1, 'gamma': 0.001}
Train-test split accuracy: 0.7728531855955678
10-fold cross-validation scores: [0.75206612 0.75833333 0.75833333 0.75       0.75       0.75
 0.75       0.75       0.75       0.75      ]


In [64]:
hyper_train_test_acc = accuracy_split
hyper_cv_acc = np.mean(scores)

In [65]:
from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np

k = 10
selector = SelectKBest(score_func=f_classif, k=k)
X_new = selector.fit_transform(X_features, y_labels)

# Train the final model with the best parameters
svm_model = svm.SVC(kernel='rbf', C=best_params['C'], gamma=best_params['gamma'])
svm_model.fit(X_train, y_train)

# Step 5: Evaluate the model on the test set
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {accuracy}")

# Step 6: Perform 10-fold cross-validation with hyperparameter tuning
cv_scores = cross_val_score(svm_model, X_new, y_labels, cv=10)
print(f"10-Fold Cross-Validation Accuracy: {np.mean(cv_scores)}")


Test Set Accuracy: 0.7728531855955678
10-Fold Cross-Validation Accuracy: 0.7593595041322313


In [66]:
feature_selection_train_test_acc = accuracy
feature_selection_cv_acc = np.mean(cv_scores)

In [67]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np

# Step 1: Perform PCA to reduce to 10 principal components
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_features)

# Step 2: Split the dataset into 70/30 train/test set
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_labels, test_size=0.3, random_state=42)

# Step 3: Hyperparameter tuning values (assuming these are obtained from a previous activity)
# Example hyperparameters
param_grid = {
    'C': [1, 10, 100],
    'gamma': [0.001, 0.01, 0.1],
    'kernel': ['rbf']
}

# Step 4: Train an SVM model with hyperparameter tuning on the training set
grid_search = GridSearchCV(SVC(), param_grid, cv=10)
grid_search.fit(X_train, y_train)

# Best parameters from grid search
best_params = grid_search.best_params_

# Train the final model with the best parameters
svm_model = SVC(**best_params)
svm_model.fit(X_train, y_train)

# Step 5: Evaluate the model on the test set
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {accuracy}")

# Step 6: Perform 10-fold cross-validation with hyperparameter tuning
cv_scores = cross_val_score(SVC(**best_params), X_pca, y_labels, cv=10)
print(f"10-Fold Cross-Validation Accuracy: {np.mean(cv_scores)}")

Test Set Accuracy: 0.7728531855955678
10-Fold Cross-Validation Accuracy: 0.7518732782369145


In [68]:
pca_train_test_acc = accuracy
pca_cv_acc = np.mean(cv_scores)

In [69]:
import pandas as pd


# Create a dictionary with the accuracy values
data = {
    "SVM Model": [
        "Original features",
        "With hyperparameter tuning",
        "With feature selection and hyperparameter tuning",
        "With PCA and hyperparameter tuning"
    ],
    "Train-test split Accuracy": [
        f"{ori_acc * 100:.2f}%",
        f"{hyper_train_test_acc * 100:.2f}%",
        f"{feature_selection_train_test_acc * 100:.2f}%",
        f"{pca_train_test_acc * 100:.2f}%"
    ],
    "Cross-validation Accuracy": [
        f"{ori_cv_acc * 100:.2f}%",
        f"{hyper_cv_acc * 100:.2f}%",
        f"{feature_selection_cv_acc * 100:.2f}%",
        f"{pca_cv_acc * 100:.2f}%"
    ]
}

# Convert the dictionary to a DataFrame
summary_df = pd.DataFrame(data)

# Display the DataFrame
display(summary_df)

Unnamed: 0,SVM Model,Train-test split Accuracy,Cross-validation Accuracy
0,Original features,77.56%,75.27%
1,With hyperparameter tuning,77.29%,75.19%
2,With feature selection and hyperparameter tuning,77.29%,75.94%
3,With PCA and hyperparameter tuning,77.29%,75.19%


In [70]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv('combined_data.csv')
X = data.drop('label', axis=1)  # Replace 'target' with the actual target column name
y = data['label']  # Replace 'target' with the actual target column name

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Function to train and evaluate a model
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    train_test_acc = accuracy_score(y_test, y_pred)
    cv_acc = cross_val_score(model, X, y, cv=10).mean()
    return train_test_acc, cv_acc

# Train with SGDClassifier
sgd_model = SGDClassifier(random_state=42)
sgd_train_test_acc, sgd_cv_acc = evaluate_model(sgd_model, X_train, X_test, y_train, y_test)

# Train with RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)
rf_train_test_acc, rf_cv_acc = evaluate_model(rf_model, X_train, X_test, y_train, y_test)

# Train with MLPClassifier
mlp_model = MLPClassifier(random_state=42)
mlp_train_test_acc, mlp_cv_acc = evaluate_model(mlp_model, X_train, X_test, y_train, y_test)

svm_train_test_acc = result # Accuracy of SVM model above
svm_cv_acc = max(scores)  # Accuracy of SVM model above using cross validation

# Create a summary table
data = {
    "Model": ["SVM", "SGD", "RandomForest", "MLP"],
    "Train-test split Accuracy": [
        f"{svm_train_test_acc * 100:.2f}%",
        f"{sgd_train_test_acc * 100:.2f}%",
        f"{rf_train_test_acc * 100:.2f}%",
        f"{mlp_train_test_acc * 100:.2f}%"
    ],
    "Cross-validation Accuracy": [
        f"{svm_cv_acc * 100:.2f}%",
        f"{sgd_cv_acc * 100:.2f}%",
        f"{rf_cv_acc * 100:.2f}%",
        f"{mlp_cv_acc * 100:.2f}%"
    ]
}
summary_df = pd.DataFrame(data)

In [71]:
display(summary_df)

Unnamed: 0,Model,Train-test split Accuracy,Cross-validation Accuracy
0,SVM,77.56%,75.83%
1,SGD,75.11%,72.63%
2,RandomForest,86.73%,51.11%
3,MLP,74.45%,72.06%
