In [ ]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [ ]:
# Load the engineered feature table from the current working directory.
path = os.path.join(os.path.abspath(''), "engineered_data_250ms_window80_step8.csv")
df = pd.read_csv(path)

# Integer class label assigned to each participant name.
diction = {'Dany':0, 'Felix':1, 'Julian':2, 'Mark':3,'Martin':4,'Michele':5,'Paul':6}

# Encode all participants in one vectorised pass instead of writing cell by
# cell with .loc (slow, and it leaves the column with an object dtype).
# Unknown names still fail loudly, as the per-row dictionary lookup did.
unknown_participants = [p for p in df['Participant'].unique() if p not in diction]
if unknown_participants:
    raise KeyError(f"Unknown participant(s): {unknown_participants!r}")
df['Participant'] = df['Participant'].map(diction).astype(int)

# Keep the bookkeeping columns plus every acceleration feature whose name
# does not contain "Linear".
List = ["Window", "Participant", "Run", "Path"]
List += [name for name in df.columns
         if "Acceleration" in name and "Linear" not in name]

# Drop every column that is not in the keep-list.
df.drop(df.columns.difference(List),axis=1,inplace=True)
print(df.columns)

# Encode the path type numerically: straight -> 0, circle -> 1.
df['Path'] = df['Path'].replace({'straight': 0, 'circle': 1})
unique_values = df['Path'].unique()
print("Unique values in the 'path' column:", unique_values)
sample_values = df['Path'].sample(n=10, random_state=1)  # fixed seed keeps the sample reproducible
print("Sample of the 'path' column:\n", sample_values)

In [ ]:
# Split by run: rows with Run < 4 train the models, Run == 4 is held out.
# TODO: consider picking the held-out run at random.
train_mask = df['Run'] < 4
test_mask = df['Run'] == 4
X_Train = df.loc[train_mask].copy()
X_Test = df.loc[test_mask].copy()

# The participant id is the target; pop() also removes it from the features.
Y_Train = X_Train.pop('Participant')
Y_Test = X_Test.pop('Participant')

# Report the shape of each split.
for shape_label, split in (
    ("X_Train shape:", X_Train),
    ("X_Test shape:", X_Test),
    ("Y_Train shape:", Y_Train),
    ("Y_Test shape:", Y_Test),
):
    print(shape_label, split.shape)

# Peek at a few 'Path' values from each split.
print(X_Train['Path'].sample(3))
print(X_Test['Path'].sample(3))

# Count the missing values so they can be located before fitting the SVMs.
print("NaN in X_Train:", X_Train.isna().sum().sum())
print("NaN in Y_Train:", Y_Train.isna().sum())
print("NaN in X_Test:", X_Test.isna().sum().sum())

# Rows of the training features that contain at least one NaN.
train_has_nan = X_Train.isna().any(axis=1)
nan_rows_X_Train = X_Train.loc[train_has_nan]
print("Rows with NaN values in X_Train:")
print(nan_rows_X_Train)

# Rows of the test features that contain at least one NaN.
test_has_nan = X_Test.isna().any(axis=1)
nan_rows_X_Test = X_Test.loc[test_has_nan]
print("Rows with NaN values in X_Test:")
print(nan_rows_X_Test)

# How many rows are affected in each split.
print("Number of unique rows with NaN in X_Train:", len(nan_rows_X_Train))
print("Number of unique rows with NaN in X_Test:", len(nan_rows_X_Test))

# The SVMs cannot be fitted on NaN, so drop every incomplete row.
X_Train_cleaned = X_Train.dropna()
X_Test_cleaned = X_Test.dropna()

# Keep only the targets whose feature rows survived, as integer labels.
Y_Train_cleaned = Y_Train.loc[X_Train_cleaned.index].astype(int)
Y_Test_cleaned = Y_Test.loc[X_Test_cleaned.index].astype(int)

In [ ]:
# Confirm that neither cleaned target contains missing values.
print("NaN in Y_Train:", Y_Train_cleaned.isnull().sum().sum())
print("NaN in Y_Test:", Y_Test_cleaned.isnull().sum().sum())

# List the class labels present in each cleaned target.
print("Unique values in Y_Train:", Y_Train_cleaned.unique())
print("Unique values in Y_Test:", Y_Test_cleaned.unique())



In [ ]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

# Standardise the features: statistics are learned on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_Train_scaled = scaler.fit_transform(X_Train_cleaned)
X_Test_scaled = scaler.transform(X_Test_cleaned)

# RBF-kernel SVM, then its predictions on the held-out run.
rbf = svm.SVC(kernel='rbf', gamma=0.5, C=0.1).fit(X_Train_scaled, Y_Train_cleaned)
y_pred_rbf = rbf.predict(X_Test_scaled)

# Polynomial-kernel SVM of degree 20.
poly = svm.SVC(kernel='poly', degree=20, C=1).fit(X_Train_scaled, Y_Train_cleaned)
y_pred_poly = poly.predict(X_Test_scaled)

# Linear-kernel SVM.
clf = svm.SVC(kernel='linear', C=1).fit(X_Train_scaled, Y_Train_cleaned)
y_pred_linear = clf.predict(X_Test_scaled)

# Show the raw predictions of all three models.
print("RBF Kernel SVM Predictions:", y_pred_rbf)
print("Polynomial Kernel SVM Predictions:", y_pred_poly)
print("Linear Kernel SVM Predictions:", y_pred_linear)

In [ ]:
# Report the test-set accuracy of each SVM variant.
for accuracy_label, predictions in (
    ("Accuracy (RBF):", y_pred_rbf),
    ("Accuracy (Polynomial):", y_pred_poly),
    ("Accuracy (Linear):", y_pred_linear),
):
    print(accuracy_label, metrics.accuracy_score(Y_Test_cleaned, predictions))

In [ ]:
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
# Gradient-boosted trees trained on the un-cleaned, unscaled splits
# (X_Train / X_Test rather than the *_cleaned / *_scaled variants).

# The labels may still carry an object dtype depending on how they were
# encoded; scikit-learn's metrics reject such targets, so cast them to int.
Y_Train = Y_Train.astype(int)
Y_Test = Y_Test.astype(int)

model = XGBClassifier(tree_method = 'hist',objective='multi:softprob')
model.fit(X_Train,Y_Train)
y_pred = model.predict(X_Test)
print("Accuracy:",metrics.accuracy_score(Y_Test , y_pred))

# Confusion matrix of true vs. predicted participant; fmt='d' prints the
# counts as plain integers instead of the default '.2g' rounding.
cm = confusion_matrix(Y_Test,y_pred)
sns.heatmap(cm,annot=True,fmt='d')

In [ ]:
import time
from sklearn.inspection import permutation_importance

# Permutation importance of every feature on the held-out split, timed.
start_time = time.time()

# Labels are cast to int so the accuracy scorer does not receive an
# object-dtype target.
result = permutation_importance(model, X_Test, Y_Test.astype(int), n_repeats=10, random_state=42, n_jobs=2)

elapsed_time = time.time() - start_time

print(f"Elapsed time to compute the importances: "
      f"{elapsed_time:.3f} seconds")

# Feature order from least to most important (by mean importance).
sorted_idx = result.importances_mean.argsort()

# Human-readable feature names taken from the columns the model was scored on.
columns = X_Test.columns.str.replace('median_','').str.replace('get_','').str.replace('_',' ')

# Box plot: one box per feature over the n_repeats permutations.
plt.figure(figsize=(12, 6))
plt.boxplot(result.importances[sorted_idx].T, vert=False, labels=columns[sorted_idx])
plt.title("Permutation Importance (test set)")
plt.tight_layout()
plt.show()

# Bar plot of the mean importances, rescaled so the best feature is 100.
feature_importance = result.importances_mean
peak_importance = feature_importance.max()
if peak_importance > 0:
    # Skip the rescaling when no feature has a positive importance; dividing
    # by zero or by a negative maximum would produce meaningless values.
    feature_importance = 100.0 * (feature_importance / peak_importance)

sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5  # bar centres

plt.figure(figsize=(12, 6))
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, columns[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Permutation Importance (test set)')
plt.show()



In [ ]:
# Display the dataframe as the cell output.
df

In [ ]:
# Count the distinct runs recorded for each (Participant, Path) pair.
df.groupby(['Participant','Path'])['Run'].nunique()

In [ ]:
# List the distinct run identifiers present in the data.
df['Run'].unique()

In [ ]:
from itertools import combinations

def cross_validation(df, seed, model, training_runs = 3):
    """Evaluate ``model`` on every train/test split of the recorded runs.

    Every combination of ``training_runs`` distinct values of ``df['Run']``
    is used once as the training set; all remaining runs form the test set.
    The model is re-fitted for each combination and scored by accuracy on
    the ``'Participant'`` column.

    Args:
        df: DataFrame with a ``'Run'`` column, a ``'Participant'`` target
            column (values convertible to int) and the feature columns.
            Note that ``'Run'`` itself is passed to the model as a feature.
        seed: Currently unused; kept so existing callers keep working.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        training_runs: Number of runs to train on in each split.

    Returns:
        A list with one accuracy (float) per combination, in the order
        produced by ``itertools.combinations``.

    Raises:
        ValueError: If ``training_runs`` is smaller than 1, or not smaller
            than the number of distinct runs (the test set would be empty).
    """
    runs = df['Run'].unique()

    if training_runs < 1:
        # combinations(runs, 0) would yield a single empty training set.
        raise ValueError("The number of training runs should be at least 1")
    if training_runs >= len(runs):
        raise ValueError("The number of training runs should be less than the total number of runs")

    # All possible ways to choose the training runs.
    training_combinations = list(combinations(runs, training_runs))
    print(f"Number of training combinations: {len(training_combinations)}")
    print(f"The combinations: {training_combinations}")

    # Separate features and target once; both train and test labels are
    # cast to int so the model and the scoring see the same dtype.
    features = df.drop(columns=['Participant'])
    labels = df['Participant'].astype(int)

    accuracies = []
    for training_set in training_combinations:
        in_training = df['Run'].isin(list(training_set))

        model.fit(features[in_training], labels[in_training])
        y_pred = np.asarray(model.predict(features[~in_training])).ravel()

        # Accuracy is the fraction of exact label matches; computing it here
        # avoids relying on a module imported in another notebook cell.
        y_true = labels[~in_training].to_numpy()
        accuracies.append(float(np.mean(y_true == y_pred)))

    return accuracies









In [ ]:

# Work on an independent copy and make sure 'Path' is stored as integers:
# any remaining 'circle' / 'straight' strings become 1 / 0 respectively.
path_codes = {'circle': 1, 'straight': 0}

df_copy = df.copy(deep=True)
for path_name, path_code in path_codes.items():
    df_copy.loc[df_copy['Path'] == path_name, "Path"] = path_code

df_copy['Path'] = df_copy['Path'].astype(int)






In [ ]:
# Fresh XGBoost classifier, cross-validated with two training runs per split.
model = XGBClassifier(objective='multi:softprob', tree_method='hist')
accuracies = cross_validation(df_copy, 42, model, training_runs=2)

In [ ]:
# Display the accuracies returned by the cross_validation call.
accuracies

In [ ]:
# plot the mean accuracy and the standard deviation for different number of runs and with or without the path column
# create a stripplot to show the distribution of the accuracies with seaborn

import seaborn as sns


# Variant of the data without the 'Path' feature, to see how much it helps.
df_without_path = df.drop(columns=['Path'])

results_list = []

# Cross-validate with 1, 2 and 3 training runs, once with the 'Path' column
# and once without it, collecting one record per split.
for training_runs in [1,2, 3]:
    for path_flag, frame in (('yes', df_copy), ('no', df_without_path)):
        accuracies = cross_validation(frame, 42, model, training_runs)
        results_list.extend(
            {'Training runs': training_runs, 'accuracy': acc, 'Path': path_flag}
            for acc in accuracies
        )

# One row per (number of training runs, split, with/without path).
results = pd.DataFrame(results_list)

# Strip plot of the accuracy distribution for each configuration.
plt.figure(figsize=(12, 6))
sns.stripplot(data=results, x='Training runs', y='accuracy', hue='Path', jitter=True, dodge=True)
plt.title("Accuracy for different number of training runs and with or without the path column")
plt.show()



In [ ]:
# Same strip plot with more descriptive axis and legend titles.
# `results` stores these columns as 'Training runs' and 'Path', so rename
# them on a copy for plotting; referencing the long names directly would
# fail because those columns do not exist.
plot_data = results.rename(columns={
    'Training runs': 'Number of runs used for training',
    'Path': 'Path used',
})

plt.figure(figsize=(12, 6))

sns.stripplot(x='Number of runs used for training', y='accuracy', hue='Path used', data=plot_data, jitter=True, dodge=True)

plt.title("Accuracy for different number of training runs and with or without the path column")

plt.show()
