# Interesting notes
- Walking and running are easily classified by KMeans
- going up and down stairs is harder to classify
  - Especially when going down stairs!
  - We cannot explain this yet
- Overall accuracy is 0.82, but RF is way better for this relatively simple dataset (1.0)

In [119]:
# ------ Imports ------ #
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from util import computeFeatureImportance
from sklearn.tree import DecisionTreeClassifier

In [120]:
# Seed NumPy's global random number generator so that runs are reproducible.
# NOTE(review): none of the estimators or the train/test split in the visible
# cells pass random_state, so presumably they all rely on this global seed and
# on the cells being executed in order — confirm.
np.random.seed(42)

In [121]:
# ------ Settings ------ #
# Number of clusters every KMeans model in this notebook is asked to find.
# NOTE(review): presumably one per activity label (walking, running,
# stairs_up, stairs_down) — confirm against the dataset.
n_clusters = 4
# Fraction of the rows that goes into the training split; the rest is test.
train_size = 0.8

In [None]:
# ------ Data import ------ #
# Load the feature table from disk; `x` holds the complete DataFrame.
print("Importing data...")
data_path = r'Data Gathering and Preprocessing/features_Walking_scaled.csv'
x = pd.read_csv(data_path)
print("Data imported")
print(f"Shape of data: {x.shape}")
print("Done")

In [None]:
# ------ shuffling + train, test split ------ #
print("shuffling data and splitting data into train and test...")
# Shuffle the rows and keep `train_size` of them for training; the remainder
# becomes the test set. No random_state is passed here.
# NOTE(review): the shuffle presumably relies on the NumPy seed set earlier,
# so the split depends on running the cells in order — confirm.
train, test = train_test_split(x, train_size=train_size, shuffle=True)
print("Done")

In [None]:
# ------ x, y split ------ #

print("Splitting data into x and y...")
# Columns removed from the feature matrices: the target and two other columns.
non_feature_columns = ["label", "time", "ID"]

# The encoder learns its classes from the training labels only; the test
# labels are then encoded with that same mapping.
le = LabelEncoder()
y_train = le.fit_transform(train["label"])
print(f"Classes: {le.classes_}")
y_test = le.transform(test["label"])

# drop() returns new frames, so `train` and `test` themselves stay untouched.
x_train = train.drop(columns=non_feature_columns)
x_test = test.drop(columns=non_feature_columns)
print("Done")

In [None]:
# ------ PCA ------ #
# Project the features onto two principal components so they can be plotted.

print("starting PCA...")
pca = PCA(2)
# Learn the projection from the training data only ...
df = pca.fit_transform(x_train)
# ... and reuse that same projection for the test data. Refitting the PCA on
# x_test would place the test points on different axes than the training
# points and than the cluster centres that are fitted on `df`.
df_test = pca.transform(x_test)

# ndarray copies of the two projections, used by the plotting code.
x_train_pca = np.array(df)
x_test_pca = np.array(df_test)
print("Done")

In [None]:
# ------ Training KMeans ------ #

print("Training KMeans...")
# First model: clusters the full feature set.
model = KMeans(n_clusters=n_clusters)
model.fit(x_train)
label = model.labels_
print("Training done")

# Raw cluster index predicted for every test row.
pred_y = model.predict(x_test)

# KMeans numbers its clusters arbitrarily, so a cluster index cannot be
# compared with an encoded class label directly. Give every cluster the class
# that is most common among its training samples and score the translated
# predictions instead of the raw cluster indices.
cluster_to_class = {
    cluster: np.bincount(y_train[label == cluster]).argmax()
    for cluster in np.unique(label)
}
# -1 never equals an encoded class, so a cluster without any training sample
# is simply counted as wrong.
pred_y_classes = np.array([cluster_to_class.get(cluster, -1) for cluster in pred_y])
print(f"KMeans accuracy: {accuracy_score(y_test, pred_y_classes)}")

# Second model: fitted on the 2-D PCA projection. From here on `model` and
# `label` refer to this model.
model = KMeans(n_clusters=n_clusters)
model.fit(df)
label = model.labels_

In [None]:
# ------ centroid ------ #

print("Calculating centroids...")
centroids = model.cluster_centers_
u_labels = np.unique(label)

# Scatter colour for each cluster/class index.
cdict = dict(enumerate(['red', 'blue', 'green', 'yellow', 'brown', 'purple', 'orange', 'pink']))
# Legend text for each index, taken positionally from the encoder's classes.
ldict = {idx: le.classes_[idx] for idx in range(len(u_labels))}

print("predicting...")
# Cluster index for every test point in the PCA projection.
pred = model.predict(df_test)
print("Done")

In [None]:
print("Plotting model and test data...")


def _scatter_by_id(ax, points, ids, with_names=True):
    """Scatter `points` on `ax`, one colour per entry of `u_labels`.

    Rows of `points` are grouped by the matching value in `ids`. With
    `with_names` set, each group also gets its `ldict` name as legend label.
    """
    for uid in u_labels:
        rows = np.where(ids == uid)
        if with_names:
            ax.scatter(points[rows, 0], points[rows, 1], c=cdict[uid], label=ldict[uid])
        else:
            ax.scatter(points[rows, 0], points[rows, 1], c=cdict[uid])


def _mark_centroids(ax):
    """Draw the cluster centres on `ax` as black crosses."""
    ax.scatter(centroids[:, 0], centroids[:, 1], s=80, c="black", marker='x')


fig, axs = plt.subplots(2, 2)

# Top-left: training points coloured by their KMeans cluster.
model_ax = axs[0, 0]
model_ax.title.set_text('Model')
_scatter_by_id(model_ax, x_train_pca, label)
_mark_centroids(model_ax)

# Top-right: the test points on their own, not yet assigned to a cluster.
new_ax = axs[0, 1]
new_ax.title.set_text('new data points')
new_ax.scatter(df_test[:, :1], df_test[:, 1:], c='grey')
_mark_centroids(new_ax)

# Bottom-left: test points in grey drawn over the clustered training points.
overlay_ax = axs[1, 0]
overlay_ax.title.set_text('New data on model')
_scatter_by_id(overlay_ax, x_train_pca, label)
overlay_ax.scatter(df_test[:, :1], df_test[:, 1:], c='grey')
_mark_centroids(overlay_ax)

# Bottom-right: test points coloured by the cluster predicted for them.
result_ax = axs[1, 1]
result_ax.title.set_text('result')
_scatter_by_id(result_ax, x_train_pca, label)
_scatter_by_id(result_ax, x_test_pca, pred, with_names=False)
_mark_centroids(result_ax)
plt.show()

print("plotting model vs actual...")
fig, axs = plt.subplots(2)

# Upper panel: training points coloured by cluster.
axs[0].title.set_text('model result')
_scatter_by_id(axs[0], x_train_pca, label)
_mark_centroids(axs[0])

# Lower panel: the same points coloured by their true encoded label.
axs[1].title.set_text('Actual result')
_scatter_by_id(axs[1], x_train_pca, y_train)
_mark_centroids(axs[1])
axs[1].legend()
plt.show()

In [None]:
# Bare expression: shows the imported DataFrame as the notebook cell's output.
x

In [None]:
# Refit KMeans on the full training feature set. This rebinds `model`, which
# previously held the KMeans fitted on the 2-D PCA projection.
model = KMeans(n_clusters=n_clusters)
model.fit(x_train)

In [None]:

# Test rows whose true label is "walking": encoded targets and feature columns.
walking_rows = test.loc[test["label"] == "walking"]
y_test_walking = le.transform(walking_rows["label"])
x_test_walking = walking_rows.drop(columns=["label", "time", "ID"])
print(x_test_walking)
print(y_test_walking)

In [None]:
# Test rows whose true label is "running": encoded targets and feature columns.
running_rows = test.loc[test["label"] == "running"]
y_test_running = le.transform(running_rows["label"])
x_test_running = running_rows.drop(columns=["label", "time", "ID"])
print(x_test_running)
print(y_test_running)

In [None]:
# Test rows whose true label is "stairs_down": encoded targets and feature columns.
stairs_down_rows = test.loc[test["label"] == "stairs_down"]
y_test_stairs_down = le.transform(stairs_down_rows["label"])
x_test_stairs_down = stairs_down_rows.drop(columns=["label", "time", "ID"])
print(x_test_stairs_down)
print(y_test_stairs_down)

In [None]:
# Test rows whose true label is "stairs_up": encoded targets and feature columns.
stairs_up_rows = test.loc[test["label"] == "stairs_up"]
y_test_stairs_up = le.transform(stairs_up_rows["label"])
x_test_stairs_up = stairs_up_rows.drop(columns=["label", "time", "ID"])
print(x_test_stairs_up)
print(y_test_stairs_up)

In [None]:
# Cluster index that `model` assigns to each "walking" test row. These are
# KMeans cluster indices, not encoded class labels.
y_pred_walking = model.predict(x_test_walking)
print(y_pred_walking)

In [None]:
# Cluster index that `model` assigns to each "running" test row. These are
# KMeans cluster indices, not encoded class labels.
y_pred_running = model.predict(x_test_running)
print(y_pred_running)

In [None]:
# Cluster index that `model` assigns to each "stairs_down" test row. These are
# KMeans cluster indices, not encoded class labels.
y_pred_stairs_down = model.predict(x_test_stairs_down)
print(y_pred_stairs_down)

In [None]:
# Cluster index that `model` assigns to each "stairs_up" test row. These are
# KMeans cluster indices, not encoded class labels.
y_pred_stairs_up = model.predict(x_test_stairs_up)
print(y_pred_stairs_up)

In [None]:
# Tally how many "walking" test rows were assigned to each cluster.
unique, counts = np.unique(y_pred_walking, return_counts=True)

# Maps cluster index -> number of rows; clusters that received no row are absent.
count_dict_walking = dict(zip(unique, counts))
print("walking")
print(count_dict_walking)

In [None]:
# Tally how many "running" test rows were assigned to each cluster.
unique, counts = np.unique(y_pred_running, return_counts=True)

# Maps cluster index -> number of rows; clusters that received no row are absent.
count_dict_running = dict(zip(unique, counts))
print("running")
print(count_dict_running)

In [None]:
# Tally how many "stairs_up" test rows were assigned to each cluster.
unique, counts = np.unique(y_pred_stairs_up, return_counts=True)

# Maps cluster index -> number of rows; clusters that received no row are absent.
count_dict_stairs_up = dict(zip(unique, counts))
print("stairs_up")
print(count_dict_stairs_up)

In [None]:
# Tally how many "stairs_down" test rows were assigned to each cluster.
unique, counts = np.unique(y_pred_stairs_down, return_counts=True)

# Maps cluster index -> number of rows; clusters that received no row are absent.
count_dict_stairs_down = dict(zip(unique, counts))
print("stairs_down")
print(count_dict_stairs_down)

In [143]:
# Hand-written mapping from KMeans cluster index to activity name.
# NOTE(review): cluster numbering can change between fits, so this mapping
# presumably only holds for the run it was written for. It is not referenced
# by any other cell visible here — confirm it is still needed.
transform_dict = {0: "walking", 1: "running", 3: "stairs_up", 2: "stairs_down"}

In [None]:
# Share of the "walking" test rows that landed in cluster 0, the cluster that
# is treated as "walking" in this notebook. The denominator is the total
# number of walking rows rather than a hand-picked set of cluster indices, so
# the value stays correct (and no KeyError is raised) when the rows spread
# over different clusters after a re-run.
walking_acc = count_dict_walking.get(0, 0) / sum(count_dict_walking.values())
print("walking_acc")
print(walking_acc)

In [None]:
# Share of the "running" test rows that landed in cluster 1, the cluster that
# is treated as "running" in this notebook. Dividing by the total number of
# running rows (instead of by the cluster-1 count itself, which always gives
# 1.0) makes rows that fell into any other cluster lower the score.
running_acc = count_dict_running.get(1, 0) / sum(count_dict_running.values())
print("running_acc")
print(running_acc)

In [None]:
# Share of the "stairs_up" test rows that landed in cluster 3, the cluster
# that is treated as "stairs_up" in this notebook. The denominator is the
# total number of stairs_up rows rather than a hand-picked set of cluster
# indices, so the value stays correct (and no KeyError is raised) when the
# rows spread over different clusters after a re-run.
stairs_up_acc = count_dict_stairs_up.get(3, 0) / sum(count_dict_stairs_up.values())
print("stairs_up_acc")
print(stairs_up_acc)

In [None]:
# Share of the "stairs_down" test rows that landed in cluster 2, the cluster
# that is treated as "stairs_down" in this notebook. The denominator is the
# total number of stairs_down rows rather than a hand-picked set of cluster
# indices, so the value stays correct (and no KeyError is raised) when the
# rows spread over different clusters after a re-run.
stairs_down_acc = count_dict_stairs_down.get(2, 0) / sum(count_dict_stairs_down.values())
print("stairs_down_acc")
print(stairs_down_acc)

In [None]:
# Using best case scenario! (walking = 0, running = 1, stairs_up = 3, stairs_down = 2)
# Overall accuracy = per-class accuracies weighted by each class's test-set size.
print("Accuracy of kmeans with walking dataset:")
n_walking = len(y_pred_walking)
n_running = len(y_pred_running)
n_stairs_up = len(y_pred_stairs_up)
n_stairs_down = len(y_pred_stairs_down)

weighted_hits = (
    walking_acc * n_walking
    + running_acc * n_running
    + stairs_up_acc * n_stairs_up
    + stairs_down_acc * n_stairs_down
)
n_total = n_walking + n_running + n_stairs_up + n_stairs_down
print(weighted_hits / n_total)

In [None]:
# Supervised baseline: a random forest trained on the same features and labels.
print("Training Random Forest model...")
rf = RF()
rf.fit(x_train, y_train)

# Predict on both splits, then score each prediction against its true labels.
y_pred_train, y_pred_test = rf.predict(x_train), rf.predict(x_test)
accuracy_train, accuracy_test = (
    accuracy_score(y_train, y_pred_train),
    accuracy_score(y_test, y_pred_test),
)

print(f"rf: {accuracy_train=}, {accuracy_test=}")