In [1]:
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.manifold import TSNE
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.neighbors import kneighbors_graph

In [2]:
from umap import UMAP

2023-06-07 09:32:39.212154: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-07 09:32:39.280593: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn import datasets
from matplotlib.ticker import MaxNLocator
import seaborn as sns

In [3]:
# import tensorflow
from scipy.stats import entropy
from scipy.special import rel_entr
from numpy.random import seed
from numpy.random import normal
from sklearn.cluster import AgglomerativeClustering

In [4]:
datao = pd.read_csv('../datasets/revised_sub20_data_Ishu_modification.csv')

In [5]:
datao.shape

(33232, 19)

In [6]:
datao.outlier.value_counts()

0    32900
1      332
Name: outlier, dtype: int64

In [7]:
# Keep the columns from 'attitude.roll' onward. Selecting by label instead of by
# position makes this cell safe to re-run (a positional slice would strip four more
# columns on every execution); .copy() detaches the result so the column
# assignments made later do not write into a slice of the loaded frame.
datao = datao.loc[:, 'attitude.roll':].copy()

In [8]:
def normalize_between_a_b(col, a = -1, b = 1):
    """Linearly rescale a numeric Series so its minimum maps to ``a`` and its maximum to ``b``.

    Parameters
    ----------
    col : pandas.Series
        Numeric values to rescale. Missing values are skipped when the range is
        determined and remain missing in the result.
    a, b : number, optional
        Lower and upper bound of the target range (defaults: ``-1`` and ``1``).

    Returns
    -------
    pandas.Series
        A new Series with the same index as ``col``; ``col`` itself is not modified.
        When all values are equal there is no range to stretch, so every value
        maps to ``a`` rather than to NaN from a 0/0 division.
    """
    min_val = col.min()
    max_val = col.max()
    value_range = max_val - min_val
    if value_range == 0:
        # Constant column: skip the division. (col - min_val) is 0 wherever a value
        # is present and NaN wherever it is missing, so missing entries stay missing.
        return (col - min_val).astype(float) + a
    # Vectorised form of (b - a) * (x - min) / (max - min) + a.
    return (b - a) * (col - min_val) / value_range + a

In [9]:
cols_to_update = list(datao.columns[:12])

In [10]:
cols_to_update

['attitude.roll',
 'attitude.pitch',
 'attitude.yaw',
 'gravity.x',
 'gravity.y',
 'gravity.z',
 'rotationRate.x',
 'rotationRate.y',
 'rotationRate.z',
 'userAcceleration.x',
 'userAcceleration.y',
 'userAcceleration.z']

In [11]:
datao.head()

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier
0,1.172703,-1.217849,0.633634,0.318633,0.938358,-0.134,-1.221689,0.095745,0.6507,0.59717,0.081509,-0.071948,dws,0,0
1,1.230989,-1.218272,0.70261,0.325523,0.938504,-0.115079,-0.724416,-0.825562,0.181063,0.140922,2.183841,-0.698596,dws,1,0
2,1.220374,-1.217347,0.695971,0.325099,0.938184,-0.118826,0.424864,0.286293,0.057343,0.09145,-0.118314,0.177435,dws,2,0
3,1.196626,-1.215197,0.674119,0.324063,0.937438,-0.127249,0.289479,-0.389842,-0.170267,-0.00102,0.07602,0.182624,dws,3,0
4,1.183103,-1.22057,0.669148,0.317645,0.939295,-0.129714,-0.193776,-0.518626,-0.312347,0.015118,-0.019785,0.194854,dws,4,0


In [12]:
# Rescale every listed sensor column with normalize_between_a_b's default range,
# one column at a time, and write the results back into the same columns.
datao[cols_to_update] = datao[cols_to_update].apply(normalize_between_a_b)

In [13]:
datao.head()

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier
0,0.373285,-0.474464,0.201808,0.065526,0.839291,0.02328,-0.211991,0.164801,0.21772,0.079346,-0.327072,0.100466,dws,0,0
1,0.391838,-0.475099,0.223767,0.077177,0.839671,0.051275,-0.112647,0.086734,0.137153,-0.069276,0.19877,-0.035517,dws,1,0
2,0.388459,-0.473711,0.221654,0.07646,0.838837,0.045731,0.116955,0.180947,0.115928,-0.085392,-0.377053,0.154583,dws,2,0
3,0.3809,-0.470485,0.214697,0.074708,0.836892,0.033269,0.089908,0.123655,0.076881,-0.115514,-0.328445,0.155709,dws,3,0
4,0.376595,-0.478547,0.213114,0.063856,0.841734,0.029621,-0.006636,0.112743,0.052507,-0.110257,-0.352408,0.158363,dws,4,0


In [14]:
data = datao.iloc[:, :12]

# Hierarchical Clustering

## Making categories

In [15]:
datao

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier
0,0.373285,-0.474464,0.201808,0.065526,0.839291,0.023280,-0.211991,0.164801,0.217720,0.079346,-0.327072,0.100466,dws,0,0
1,0.391838,-0.475099,0.223767,0.077177,0.839671,0.051275,-0.112647,0.086734,0.137153,-0.069276,0.198770,-0.035517,dws,1,0
2,0.388459,-0.473711,0.221654,0.076460,0.838837,0.045731,0.116955,0.180947,0.115928,-0.085392,-0.377053,0.154583,dws,2,0
3,0.380900,-0.470485,0.214697,0.074708,0.836892,0.033269,0.089908,0.123655,0.076881,-0.115514,-0.328445,0.155709,dws,3,0
4,0.376595,-0.478547,0.213114,0.063856,0.841734,0.029621,-0.006636,0.112743,0.052507,-0.110257,-0.352408,0.158363,dws,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33227,-0.832022,-0.888747,-0.127284,-0.538513,0.992321,0.319523,0.077894,0.209239,-0.023768,0.076236,-0.371129,0.062907,jog,55148,0
33228,-0.770531,-0.895371,-0.071727,-0.553875,0.993179,0.301840,0.169089,0.254584,0.074703,0.109312,-0.446234,0.068450,jog,55149,0
33229,-0.720872,-0.915913,-0.030514,-0.549367,0.995518,0.276956,0.187958,0.273587,0.173724,0.093239,-0.574458,0.065414,jog,55150,0
33230,-0.682549,-0.943711,0.001495,-0.530065,0.997906,0.253661,0.174823,0.208143,0.229891,0.095719,-0.661652,0.064099,jog,55151,0


In [48]:
# The twelve sensor feature columns. They are binned into categories below and are
# also the numeric inputs passed to the clustering calls later in the notebook.
columns_to_categorize = ['attitude.roll', 'attitude.pitch', 'attitude.yaw', 'gravity.x',
       'gravity.y', 'gravity.z', 'rotationRate.x', 'rotationRate.y',
       'rotationRate.z', 'userAcceleration.x', 'userAcceleration.y',
       'userAcceleration.z']
# Columns copied over unchanged (not binned). The variable keeps its original
# spelling because a later cell refers to it by this name.
colums_to_not_categorize = ['type', 'row_num', 'outlier']

In [49]:
# Print the value range of each feature column, padding the name so the
# "min = ..." parts line up.
space_cols = 25
for col in columns_to_categorize:
    col_min, col_max = datao[col].min(), datao[col].max()
    print(f"col = {col + ',':<{space_cols + 1}}min = {col_min}, max = {col_max}")

col = attitude.roll,            min = -1.0, max = 1.0
col = attitude.pitch,           min = -1.0, max = 1.0
col = attitude.yaw,             min = -1.0, max = 1.0
col = gravity.x,                min = -1.0, max = 1.0
col = gravity.y,                min = -1.0, max = 1.0
col = gravity.z,                min = -1.0, max = 1.0
col = rotationRate.x,           min = -1.0, max = 1.0
col = rotationRate.y,           min = -1.0, max = 1.0
col = rotationRate.z,           min = -1.0, max = 1.0
col = userAcceleration.x,       min = -1.0, max = 1.0
col = userAcceleration.y,       min = -1.0, max = 1.0
col = userAcceleration.z,       min = -1.0, max = 1.0


In [50]:
df_4_cats = pd.DataFrame()

In [51]:
# Bin each feature into four labelled bands with edges at -1, -0.5, 0, 0.5 and 1;
# include_lowest=True keeps values exactly equal to -1 inside the first band.
bin_edges = [-1, -0.5, 0, 0.5, 1.0]
bin_names = ['lowest', 'low', 'mid', 'high']
for col in columns_to_categorize:
    df_4_cats[col] = pd.cut(datao[col], bins=bin_edges, labels=bin_names, include_lowest=True)

In [52]:
for col in colums_to_not_categorize:
    df_4_cats[col] = datao[col]

In [53]:
df_4_cats

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier
0,mid,low,mid,mid,high,mid,low,mid,mid,mid,low,mid,dws,0,0
1,mid,low,mid,mid,high,mid,low,mid,mid,low,mid,low,dws,1,0
2,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,2,0
3,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,3,0
4,mid,low,mid,mid,high,mid,low,mid,mid,low,low,mid,dws,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33227,lowest,lowest,low,lowest,high,mid,mid,mid,low,mid,low,mid,jog,55148,0
33228,lowest,lowest,low,lowest,high,mid,mid,mid,mid,mid,low,mid,jog,55149,0
33229,lowest,lowest,low,lowest,high,mid,mid,mid,mid,mid,lowest,mid,jog,55150,0
33230,lowest,lowest,mid,lowest,high,mid,mid,mid,mid,mid,lowest,mid,jog,55151,0


In [54]:
# col1 = 'attitude.roll'
# col2 = 'gravity.x'
# y_col = 'type'
# df = df_4_cats
# plt.figure(figsize=(8, 8))
# plt.scatter(df[col1], df[col2], c='r')

# # Create numbered labels for each point
# labels = []
# for index, row in df.iterrows():
#     plt.annotate(str(row[y_col]), xy=(row[col1], row[col2]), xytext=(1, 1), textcoords='offset points')
#     labels.append(row[y_col])
# plt.xlabel(col1)
# plt.ylabel(col2)
# plt.title('motion type')
# plt.locator_params(nbins=10)
# plt.grid()

# plt.show()

We will compare linkage functions for calculating the distance between clusters below:
single, complete, average, and ward.  

In [55]:
# lat_long= data_scaled.to_numpy()

In [56]:
from scipy.cluster.hierarchy import dendrogram, linkage

In [57]:
# datao[[columns_to_categorize]]

In [58]:
# Z1 = linkage(, method='single', metric='euclidean')

In [59]:
from scipy.cluster.hierarchy import dendrogram, linkage

# Build one linkage matrix per method (single, complete, average, ward) from the
# same feature matrix, all with Euclidean distances.
feature_matrix = datao[columns_to_categorize].to_numpy()
Z1, Z2, Z3, Z4 = (
    linkage(feature_matrix, method=method_name, metric='euclidean')
    for method_name in ('single', 'complete', 'average', 'ward')
)

### Dendrogram

Notice that each distance method produces different linkages for the same data.

In [60]:
labels = list(datao['type'].values)

In [77]:
# Takes too much time so commenting out

# plt.figure(figsize=(12, 20))
# # plt.subplot(2,2,1), dendrogram(Z1,labels=labels), plt.title('Single') # Max recursion depth exceeded error
# plt.subplot(2,2,2), dendrogram(Z2,labels=labels), plt.title('Complete')
# plt.subplot(2,2,3), dendrogram(Z3,labels=labels), plt.title('Average')
# plt.subplot(2,2,4), dendrogram(Z4,labels=labels), plt.title('Ward')
# plt.show()

### Create Clusters

In [78]:
len(datao['type'].unique())

6

In [79]:
# Creating Dendrogram for our data
# max_d holds the number of distinct values in the 'type' column. The fcluster call
# further down receives it with criterion='maxclust', i.e. as a cluster count
# rather than as a distance cut-off.
max_d = len(datao['type'].unique())

# plt.figure(figsize=(10, 8))
# dendrogram(Z4,labels=labels), plt.title('Ward')
  
# # # Cutting the dendrogram at max_d
# plt.axhline(y=max_d, c='k')
# plt.show()

In [80]:
# NOTE: plain assignment makes `df` a second name for the same DataFrame object as
# `df_4_cats` (no copy is made), so columns added to `df` in later cells also
# appear on `df_4_cats`.
df = df_4_cats

### Flat Clustering

These functions cut hierarchical clusterings into flat clusterings or find the roots of the forest formed by a cut by providing the flat cluster ids of each observation.

In [81]:
from scipy.cluster.hierarchy import fcluster

# Cut the Ward linkage tree (Z4) into flat clusters. With criterion='maxclust' the
# second argument is the maximum number of clusters to form; f1 holds one cluster
# id per observation.
f1 = fcluster(Z4, max_d, criterion='maxclust')

print(f"Clusters: {f1}")

Clusters: [4 4 4 ... 3 3 3]


In [82]:
np.unique(f1)

array([1, 2, 3, 4, 5, 6], dtype=int32)

### Sklearn Hierarchical Clustering

## Ward with Euclidean (Z4)

In [83]:
# Ward-linkage agglomerative clustering with one cluster per distinct 'type' value.
n_type_values = len(datao['type'].unique())
Z5 = AgglomerativeClustering(linkage='ward', n_clusters=n_type_values)
Z5.fit(datao[columns_to_categorize])
print(Z5.labels_)

[3 3 3 ... 2 2 2]


In [84]:
df['labels'] = Z5.labels_

In [85]:
df.head()

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier,labels
0,mid,low,mid,mid,high,mid,low,mid,mid,mid,low,mid,dws,0,0,3
1,mid,low,mid,mid,high,mid,low,mid,mid,low,mid,low,dws,1,0,3
2,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,2,0,3
3,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,3,0,3
4,mid,low,mid,mid,high,mid,low,mid,mid,low,low,mid,dws,4,0,3


In [86]:
# Cross-tabulate the outlier flag against the assigned cluster label.
# unstack(fill_value=0) writes 0 for (outlier, label) pairs that never occur, so the
# counts stay integers instead of being upcast to float by NaN followed by fillna.
df_temp = df[['outlier', 'labels']]
counts = df_temp.groupby('outlier')['labels'].value_counts().unstack(fill_value=0)
print(counts)


labels        0       1       2       3       4       5
outlier                                                
0        4473.0  7394.0  2952.0  5635.0  5849.0  6597.0
1           0.0    37.0   137.0     0.0     1.0   157.0


In [87]:
# Cross-tabulate the recorded 'type' against the assigned cluster label.
# unstack(fill_value=0) writes 0 for (type, label) pairs that never occur, so the
# counts stay integers instead of being upcast to float by NaN followed by fillna.
df_temp = df[['type', 'labels']]
counts = df_temp.groupby('type')['labels'].value_counts().unstack(fill_value=0)
print(counts)


labels       0       1       2       3       4       5
type                                                  
dws     1006.0  2382.0   217.0  1235.0   619.0   293.0
jog      317.0  1651.0  1367.0   789.0  1056.0   348.0
sit        0.0     0.0     0.0     0.0     0.0   157.0
std        0.0    37.0   137.0     0.0     1.0     0.0
ups     1358.0  1617.0   596.0   929.0  1209.0  1010.0
wlk     1792.0  1744.0   772.0  2682.0  2965.0  4946.0


### Taking sit and stand as one category so 5 clusters

In [88]:
Z5 = AgglomerativeClustering(n_clusters=5, linkage='ward')
Z5.fit_predict(datao[columns_to_categorize])
print(Z5.labels_)

[3 3 3 ... 2 2 2]


In [89]:
df['labels'] = Z5.labels_

In [90]:
df.head()

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier,labels
0,mid,low,mid,mid,high,mid,low,mid,mid,mid,low,mid,dws,0,0,3
1,mid,low,mid,mid,high,mid,low,mid,mid,low,mid,low,dws,1,0,3
2,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,2,0,3
3,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,3,0,3
4,mid,low,mid,mid,high,mid,low,mid,mid,low,low,mid,dws,4,0,3


In [91]:
df_temp = df[['outlier', 'labels']]
counts = df_temp.groupby('outlier')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels         0       1       2       3       4
outlier                                         
0        11070.0  7394.0  2952.0  5635.0  5849.0
1          157.0    37.0   137.0     0.0     1.0


In [92]:
df_temp = df[['type', 'labels']]
counts = df_temp.groupby('type')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels       0       1       2       3       4
type                                          
dws     1299.0  2382.0   217.0  1235.0   619.0
jog      665.0  1651.0  1367.0   789.0  1056.0
sit      157.0     0.0     0.0     0.0     0.0
std        0.0    37.0   137.0     0.0     1.0
ups     2368.0  1617.0   596.0   929.0  1209.0
wlk     6738.0  1744.0   772.0  2682.0  2965.0


### Taking motion (downstairs, upstairs, run, walk) or no motion (sit and stand) as the only categories so 2 clusters

In [93]:
Z5 = AgglomerativeClustering(n_clusters=2, linkage='ward')
Z5.fit_predict(datao[columns_to_categorize])
print(Z5.labels_)

[0 0 0 ... 0 0 0]


In [94]:
df['labels'] = Z5.labels_

In [95]:
df.head()

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier,labels
0,mid,low,mid,mid,high,mid,low,mid,mid,mid,low,mid,dws,0,0,0
1,mid,low,mid,mid,high,mid,low,mid,mid,low,mid,low,dws,1,0,0
2,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,2,0,0
3,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,3,0,0
4,mid,low,mid,mid,high,mid,low,mid,mid,low,low,mid,dws,4,0,0


In [96]:
df_temp = df[['outlier', 'labels']]
counts = df_temp.groupby('outlier')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels       0      1
outlier              
0        21830  11070
1          175    157


In [97]:
df_temp = df[['type', 'labels']]
counts = df_temp.groupby('type')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels       0       1
type                  
dws     4453.0  1299.0
jog     4863.0   665.0
sit        0.0   157.0
std      175.0     0.0
ups     4351.0  2368.0
wlk     8163.0  6738.0


## Average with Euclidean (Z3)

In [98]:
Z5 = AgglomerativeClustering(n_clusters=len(datao['type'].unique()), linkage='average')
Z5.fit_predict(datao[columns_to_categorize])
print(Z5.labels_)

[0 0 0 ... 2 2 2]


In [99]:
df['labels'] = Z5.labels_

In [100]:
df.head()

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier,labels
0,mid,low,mid,mid,high,mid,low,mid,mid,mid,low,mid,dws,0,0,0
1,mid,low,mid,mid,high,mid,low,mid,mid,low,mid,low,dws,1,0,0
2,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,2,0,0
3,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,3,0,0
4,mid,low,mid,mid,high,mid,low,mid,mid,low,low,mid,dws,4,0,0


In [101]:
df_temp = df[['outlier', 'labels']]
counts = df_temp.groupby('outlier')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels         0      1       2    3    4    5
outlier                                       
0        29702.0    0.0  3192.0  1.0  3.0  2.0
1           38.0  157.0   137.0  0.0  0.0  0.0


In [102]:
df_temp = df[['type', 'labels']]
counts = df_temp.groupby('type')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels        0      1       2    3    4    5
type                                         
dws      5474.0    0.0   278.0  0.0  0.0  0.0
jog      4148.0    0.0  1374.0  1.0  3.0  2.0
sit         0.0  157.0     0.0  0.0  0.0  0.0
std        38.0    0.0   137.0  0.0  0.0  0.0
ups      6055.0    0.0   664.0  0.0  0.0  0.0
wlk     14025.0    0.0   876.0  0.0  0.0  0.0


### Taking sit and stand as one category so 5 clusters

In [103]:
Z5 = AgglomerativeClustering(n_clusters=5, linkage='average')
Z5.fit_predict(datao[columns_to_categorize])
print(Z5.labels_)

[0 0 0 ... 0 0 0]


In [104]:
df['labels'] = Z5.labels_

In [105]:
df.head()

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier,labels
0,mid,low,mid,mid,high,mid,low,mid,mid,mid,low,mid,dws,0,0,0
1,mid,low,mid,mid,high,mid,low,mid,mid,low,mid,low,dws,1,0,0
2,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,2,0,0
3,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,3,0,0
4,mid,low,mid,mid,high,mid,low,mid,mid,low,low,mid,dws,4,0,0


In [106]:
df_temp = df[['outlier', 'labels']]
counts = df_temp.groupby('outlier')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels         0      1    2    3    4
outlier                               
0        32894.0    0.0  2.0  1.0  3.0
1          175.0  157.0  0.0  0.0  0.0


In [107]:
df_temp = df[['type', 'labels']]
counts = df_temp.groupby('type')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels        0      1    2    3    4
type                                 
dws      5752.0    0.0  0.0  0.0  0.0
jog      5522.0    0.0  2.0  1.0  3.0
sit         0.0  157.0  0.0  0.0  0.0
std       175.0    0.0  0.0  0.0  0.0
ups      6719.0    0.0  0.0  0.0  0.0
wlk     14901.0    0.0  0.0  0.0  0.0


### Taking motion (downstairs, upstairs, run, walk) or no motion (sit and stand) as the only categories so 2 clusters

In [108]:
Z5 = AgglomerativeClustering(n_clusters=2, linkage='average')
Z5.fit_predict(datao[columns_to_categorize])
print(Z5.labels_)

[0 0 0 ... 0 0 0]


In [109]:
df['labels'] = Z5.labels_

In [110]:
df.head()

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier,labels
0,mid,low,mid,mid,high,mid,low,mid,mid,mid,low,mid,dws,0,0,0
1,mid,low,mid,mid,high,mid,low,mid,mid,low,mid,low,dws,1,0,0
2,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,2,0,0
3,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,3,0,0
4,mid,low,mid,mid,high,mid,low,mid,mid,low,low,mid,dws,4,0,0


In [111]:
df_temp = df[['outlier', 'labels']]
counts = df_temp.groupby('outlier')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels         0      1
outlier                
0        32900.0    0.0
1          175.0  157.0


In [112]:
df_temp = df[['type', 'labels']]
counts = df_temp.groupby('type')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels        0      1
type                  
dws      5752.0    0.0
jog      5528.0    0.0
sit         0.0  157.0
std       175.0    0.0
ups      6719.0    0.0
wlk     14901.0    0.0


## Complete with Euclidean (Z2)

In [113]:
Z5 = AgglomerativeClustering(n_clusters=len(datao['type'].unique()), linkage='complete')
Z5.fit_predict(datao[columns_to_categorize])
print(Z5.labels_)

[2 2 2 ... 4 4 4]


In [114]:
df['labels'] = Z5.labels_

In [115]:
df.head()

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier,labels
0,mid,low,mid,mid,high,mid,low,mid,mid,mid,low,mid,dws,0,0,2
1,mid,low,mid,mid,high,mid,low,mid,mid,low,mid,low,dws,1,0,2
2,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,2,0,2
3,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,3,0,2
4,mid,low,mid,mid,high,mid,low,mid,mid,low,low,mid,dws,4,0,2


In [116]:
df_temp = df[['outlier', 'labels']]
counts = df_temp.groupby('outlier')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels        0       1        2       3       4       5
outlier                                                 
0        5772.0  6553.0  10454.0  4537.0  2036.0  3548.0
1         157.0     0.0      1.0     0.0    33.0   141.0


In [117]:
df_temp = df[['type', 'labels']]
counts = df_temp.groupby('type')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels       0       1       2       3       4       5
type                                                  
dws      367.0  1022.0  2056.0  1539.0   193.0   575.0
jog      179.0  1056.0  1358.0   975.0  1144.0   816.0
sit      157.0     0.0     0.0     0.0     0.0     0.0
std        0.0     0.0     1.0     0.0    33.0   141.0
ups      980.0  1433.0  2341.0   932.0   377.0   656.0
wlk     4246.0  3042.0  4699.0  1091.0   322.0  1501.0


### Taking sit and stand as one category so 5 clusters

In [118]:
Z5 = AgglomerativeClustering(n_clusters=5, linkage='complete')
Z5.fit_predict(datao[columns_to_categorize])
print(Z5.labels_)

[2 2 2 ... 4 4 4]


In [119]:
df['labels'] = Z5.labels_

In [120]:
df.head()

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier,labels
0,mid,low,mid,mid,high,mid,low,mid,mid,mid,low,mid,dws,0,0,2
1,mid,low,mid,mid,high,mid,low,mid,mid,low,mid,low,dws,1,0,2
2,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,2,0,2
3,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,3,0,2
4,mid,low,mid,mid,high,mid,low,mid,mid,low,low,mid,dws,4,0,2


In [121]:
df_temp = df[['outlier', 'labels']]
counts = df_temp.groupby('outlier')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels        0       1        2       3       4
outlier                                         
0        8085.0  5772.0  10454.0  6553.0  2036.0
1         141.0   157.0      1.0     0.0    33.0


In [122]:
df_temp = df[['type', 'labels']]
counts = df_temp.groupby('type')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels       0       1       2       3       4
type                                          
dws     2114.0   367.0  2056.0  1022.0   193.0
jog     1791.0   179.0  1358.0  1056.0  1144.0
sit        0.0   157.0     0.0     0.0     0.0
std      141.0     0.0     1.0     0.0    33.0
ups     1588.0   980.0  2341.0  1433.0   377.0
wlk     2592.0  4246.0  4699.0  3042.0   322.0


### Taking motion (downstairs, upstairs, run, walk) or no motion (sit and stand) as the only categories so 2 clusters

In [123]:
Z5 = AgglomerativeClustering(n_clusters=2, linkage='complete')
Z5.fit_predict(datao[columns_to_categorize])
print(Z5.labels_)

[0 0 0 ... 0 0 0]


In [124]:
df['labels'] = Z5.labels_

In [125]:
df.head()

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier,labels
0,mid,low,mid,mid,high,mid,low,mid,mid,mid,low,mid,dws,0,0,0
1,mid,low,mid,mid,high,mid,low,mid,mid,low,mid,low,dws,1,0,0
2,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,2,0,0
3,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,3,0,0
4,mid,low,mid,mid,high,mid,low,mid,mid,low,low,mid,dws,4,0,0


In [126]:
df_temp = df[['outlier', 'labels']]
counts = df_temp.groupby('outlier')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels       0     1
outlier             
0        27128  5772
1          175   157


In [127]:
df_temp = df[['type', 'labels']]
counts = df_temp.groupby('type')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels        0       1
type                   
dws      5385.0   367.0
jog      5349.0   179.0
sit         0.0   157.0
std       175.0     0.0
ups      5739.0   980.0
wlk     10655.0  4246.0


## Digging deep in average linkage with other distance metrics and hyperparameters

#### Manhattan distance (L1)

In [128]:
# Average linkage on Manhattan (L1) distances, 2 clusters.
# `affinity` was deprecated in scikit-learn 1.2 and removed in 1.4; `metric` is the
# replacement keyword for the distance used by the linkage.
Z5 = AgglomerativeClustering(n_clusters=2, linkage='average', metric='l1')
Z5.fit(datao[columns_to_categorize])
print(Z5.labels_)



[0 0 0 ... 0 0 0]


In [129]:
df['labels'] = Z5.labels_

In [130]:
df.head()

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier,labels
0,mid,low,mid,mid,high,mid,low,mid,mid,mid,low,mid,dws,0,0,0
1,mid,low,mid,mid,high,mid,low,mid,mid,low,mid,low,dws,1,0,0
2,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,2,0,0
3,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,3,0,0
4,mid,low,mid,mid,high,mid,low,mid,mid,low,low,mid,dws,4,0,0


In [131]:
df_temp = df[['outlier', 'labels']]
counts = df_temp.groupby('outlier')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels         0      1
outlier                
0        32900.0    0.0
1          175.0  157.0


In [132]:
df_temp = df[['type', 'labels']]
counts = df_temp.groupby('type')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels        0      1
type                  
dws      5752.0    0.0
jog      5528.0    0.0
sit         0.0  157.0
std       175.0    0.0
ups      6719.0    0.0
wlk     14901.0    0.0


#### Manhattan distance (L1) and connectivity of 2 neighbors

In [133]:
# k-nearest-neighbour connectivity graph over the feature columns with 2 neighbours
# per sample. Despite the `_clusters` suffix, the 2 is the neighbour count
# (n_neighbors), not a number of clusters.
connectivity_2_clusters = kneighbors_graph(datao[columns_to_categorize], n_neighbors=2, include_self=False)

In [134]:
connectivity_2_clusters.data

array([1., 1., 1., ..., 1., 1., 1.])

In [135]:
# Average linkage on Manhattan (L1) distances, 2 clusters, with merges restricted
# by the 2-neighbour connectivity graph.
# `affinity` was deprecated in scikit-learn 1.2 and removed in 1.4; `metric` is the
# replacement keyword for the distance used by the linkage.
Z5 = AgglomerativeClustering(n_clusters=2, linkage='average', metric='l1', connectivity=connectivity_2_clusters)
Z5.fit(datao[columns_to_categorize])
print(Z5.labels_)

  connectivity, n_connected_components = _fix_connectivity(


[0 0 0 ... 0 0 0]


In [136]:
df['labels'] = Z5.labels_

In [137]:
df.head()

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier,labels
0,mid,low,mid,mid,high,mid,low,mid,mid,mid,low,mid,dws,0,0,0
1,mid,low,mid,mid,high,mid,low,mid,mid,low,mid,low,dws,1,0,0
2,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,2,0,0
3,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,3,0,0
4,mid,low,mid,mid,high,mid,low,mid,mid,low,low,mid,dws,4,0,0


In [138]:
df_temp = df[['outlier', 'labels']]
counts = df_temp.groupby('outlier')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels         0      1
outlier                
0        32900.0    0.0
1          175.0  157.0


In [139]:
df_temp = df[['type', 'labels']]
counts = df_temp.groupby('type')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels        0      1
type                  
dws      5752.0    0.0
jog      5528.0    0.0
sit         0.0  157.0
std       175.0    0.0
ups      6719.0    0.0
wlk     14901.0    0.0


#### Manhattan distance (L1) and connectivity of 5 neighbors

In [140]:
# k-nearest-neighbour connectivity graph over the feature columns with 5 neighbours
# per sample. Despite the `_clusters` suffix, the 5 is the neighbour count
# (n_neighbors), not a number of clusters.
connectivity_5_clusters = kneighbors_graph(datao[columns_to_categorize], n_neighbors=5, include_self=False)

In [141]:
connectivity_5_clusters.data

array([1., 1., 1., ..., 1., 1., 1.])

In [142]:
# Average linkage on Manhattan (L1) distances, 2 clusters, with merges restricted
# by the 5-neighbour connectivity graph.
# `affinity` was deprecated in scikit-learn 1.2 and removed in 1.4; `metric` is the
# replacement keyword for the distance used by the linkage.
Z5 = AgglomerativeClustering(n_clusters=2, linkage='average', metric='l1', connectivity=connectivity_5_clusters)
Z5.fit(datao[columns_to_categorize])
print(Z5.labels_)

  connectivity, n_connected_components = _fix_connectivity(


[0 0 0 ... 0 0 0]


In [143]:
df['labels'] = Z5.labels_

In [144]:
df.head()

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier,labels
0,mid,low,mid,mid,high,mid,low,mid,mid,mid,low,mid,dws,0,0,0
1,mid,low,mid,mid,high,mid,low,mid,mid,low,mid,low,dws,1,0,0
2,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,2,0,0
3,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,3,0,0
4,mid,low,mid,mid,high,mid,low,mid,mid,low,low,mid,dws,4,0,0


In [145]:
df_temp = df[['outlier', 'labels']]
counts = df_temp.groupby('outlier')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels         0      1
outlier                
0        32900.0    0.0
1          175.0  157.0


In [146]:
df_temp = df[['type', 'labels']]
counts = df_temp.groupby('type')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels        0      1
type                  
dws      5752.0    0.0
jog      5528.0    0.0
sit         0.0  157.0
std       175.0    0.0
ups      6719.0    0.0
wlk     14901.0    0.0


### 5 clusters in agglomerative clustering with a 5-neighbor connectivity graph

In [147]:
# Average linkage on Manhattan (L1) distances, 5 clusters, with merges restricted
# by the 5-neighbour connectivity graph.
# `affinity` was deprecated in scikit-learn 1.2 and removed in 1.4; `metric` is the
# replacement keyword for the distance used by the linkage.
Z5 = AgglomerativeClustering(n_clusters=5, linkage='average', metric='l1', connectivity=connectivity_5_clusters)
Z5.fit(datao[columns_to_categorize])
print(Z5.labels_)

  connectivity, n_connected_components = _fix_connectivity(


[0 0 0 ... 0 0 0]


In [148]:
df['labels'] = Z5.labels_

In [149]:
df.head()

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier,labels
0,mid,low,mid,mid,high,mid,low,mid,mid,mid,low,mid,dws,0,0,0
1,mid,low,mid,mid,high,mid,low,mid,mid,low,mid,low,dws,1,0,0
2,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,2,0,0
3,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,3,0,0
4,mid,low,mid,mid,high,mid,low,mid,mid,low,low,mid,dws,4,0,0


In [150]:
df_temp = df[['outlier', 'labels']]
counts = df_temp.groupby('outlier')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels         0      1    2    3    4
outlier                               
0        32896.0    0.0  1.0  1.0  2.0
1          175.0  157.0  0.0  0.0  0.0


In [151]:
df_temp = df[['type', 'labels']]
counts = df_temp.groupby('type')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels        0      1    2    3    4
type                                 
dws      5752.0    0.0  0.0  0.0  0.0
jog      5524.0    0.0  1.0  1.0  2.0
sit         0.0  157.0  0.0  0.0  0.0
std       175.0    0.0  0.0  0.0  0.0
ups      6719.0    0.0  0.0  0.0  0.0
wlk     14901.0    0.0  0.0  0.0  0.0


### 5 clusters in agglomerative clustering with a 2-neighbor connectivity graph

In [152]:
# Average linkage on Manhattan (L1) distances, 5 clusters, with merges restricted
# by the 2-neighbour connectivity graph.
# `affinity` was deprecated in scikit-learn 1.2 and removed in 1.4; `metric` is the
# replacement keyword for the distance used by the linkage.
Z5 = AgglomerativeClustering(n_clusters=5, linkage='average', metric='l1', connectivity=connectivity_2_clusters)
Z5.fit(datao[columns_to_categorize])
print(Z5.labels_)

  connectivity, n_connected_components = _fix_connectivity(


[0 2 0 ... 1 1 1]


In [153]:
df['labels'] = Z5.labels_

In [154]:
df.head()

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier,labels
0,mid,low,mid,mid,high,mid,low,mid,mid,mid,low,mid,dws,0,0,0
1,mid,low,mid,mid,high,mid,low,mid,mid,low,mid,low,dws,1,0,2
2,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,2,0,0
3,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,3,0,0
4,mid,low,mid,mid,high,mid,low,mid,mid,low,low,mid,dws,4,0,0


In [155]:
df_temp = df[['outlier', 'labels']]
counts = df_temp.groupby('outlier')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels         0       1        2    3      4
outlier                                      
0        17210.0  4095.0  11591.0  4.0    0.0
1            1.0   174.0      0.0  0.0  157.0


In [156]:
df_temp = df[['type', 'labels']]
counts = df_temp.groupby('type')['labels'].value_counts().unstack().fillna(0)
print(counts)


labels       0       1       2    3      4
type                                      
dws     3812.0   163.0  1777.0  0.0    0.0
jog     2398.0  1485.0  1641.0  4.0    0.0
sit        0.0     0.0     0.0  0.0  157.0
std        1.0   174.0     0.0  0.0    0.0
ups     3355.0   723.0  2641.0  0.0    0.0
wlk     7645.0  1724.0  5532.0  0.0    0.0


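This is getting closer: nearly all standing (`std`) samples (174 of 175) now fall in cluster 1 rather than cluster 0, although cluster 1 still contains samples from other activities.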

## Further refinement

#### connectivity = 2, 5 clusters with L2

In [157]:
# Average-linkage agglomerative clustering into 5 clusters with L2 distance,
# constrained by the `connectivity_2_clusters` connectivity structure.
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric` —
# confirm the installed version before upgrading.
Z5 = AgglomerativeClustering(
    n_clusters=5,
    linkage='average',
    affinity='l2',
    connectivity=connectivity_2_clusters,
)
# Only the fitted `labels_` attribute is read below, so a plain fit() is enough.
Z5.fit(datao[columns_to_categorize])
print(Z5.labels_)

  connectivity, n_connected_components = _fix_connectivity(


[0 0 0 ... 2 2 2]


In [158]:
# Overwrite df['labels'] with the cluster labels from the latest fit.
# NOTE(review): assumes df rows are in the same order as datao[columns_to_categorize] — confirm.
df['labels'] = Z5.labels_

In [159]:
# Preview the first five rows with the updated `labels` column.
df.head(n=5)

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier,labels
0,mid,low,mid,mid,high,mid,low,mid,mid,mid,low,mid,dws,0,0,0
1,mid,low,mid,mid,high,mid,low,mid,mid,low,mid,low,dws,1,0,0
2,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,2,0,0
3,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,3,0,0
4,mid,low,mid,mid,high,mid,low,mid,mid,low,low,mid,dws,4,0,0


In [160]:
# Cross-tabulate cluster labels per `outlier` value: rows are outlier flags, columns are cluster labels.
df_temp = df.loc[:, ['outlier', 'labels']]
counts = (
    df_temp.groupby('outlier')['labels']
    .value_counts()
    .unstack()
    .fillna(0)  # (outlier, label) pairs that never occur become 0 instead of NaN
)
print(counts)


labels         0      1       2       3      4
outlier                                       
0        25002.0    0.0  3318.0  4450.0  130.0
1            1.0  157.0   174.0     0.0    0.0


In [161]:
# Cross-tabulate cluster labels per `type`: rows are types, columns are cluster labels.
df_temp = df.loc[:, ['type', 'labels']]
counts = (
    df_temp.groupby('type')['labels']
    .value_counts()
    .unstack()
    .fillna(0)  # (type, label) pairs that never occur become 0 instead of NaN
)
print(counts)


labels        0      1       2       3      4
type                                         
dws      3901.0    0.0   267.0  1583.0    1.0
jog      2208.0    0.0  1389.0  1829.0  102.0
sit         0.0  157.0     0.0     0.0    0.0
std         1.0    0.0   174.0     0.0    0.0
ups      5025.0    0.0   648.0  1036.0   10.0
wlk     13868.0    0.0  1014.0     2.0   17.0


#### connectivity = 2, 5 clusters with cosine

In [162]:
# Average-linkage agglomerative clustering into 5 clusters with cosine distance,
# constrained by the `connectivity_2_clusters` connectivity structure.
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric` —
# confirm the installed version before upgrading.
Z5 = AgglomerativeClustering(
    n_clusters=5,
    linkage='average',
    affinity='cosine',
    connectivity=connectivity_2_clusters,
)
# Only the fitted `labels_` attribute is read below, so a plain fit() is enough.
Z5.fit(datao[columns_to_categorize])
print(Z5.labels_)

  connectivity, n_connected_components = _fix_connectivity(


[0 0 0 ... 4 4 4]


In [163]:
# Overwrite df['labels'] with the cluster labels from the latest fit.
# NOTE(review): assumes df rows are in the same order as datao[columns_to_categorize] — confirm.
df['labels'] = Z5.labels_

In [164]:
# Preview the first five rows with the updated `labels` column.
df.head(n=5)

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier,labels
0,mid,low,mid,mid,high,mid,low,mid,mid,mid,low,mid,dws,0,0,0
1,mid,low,mid,mid,high,mid,low,mid,mid,low,mid,low,dws,1,0,0
2,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,2,0,0
3,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,3,0,0
4,mid,low,mid,mid,high,mid,low,mid,mid,low,low,mid,dws,4,0,0


In [165]:
# Cross-tabulate cluster labels per `outlier` value: rows are outlier flags, columns are cluster labels.
df_temp = df.loc[:, ['outlier', 'labels']]
counts = (
    df_temp.groupby('outlier')['labels']
    .value_counts()
    .unstack()
    .fillna(0)  # (outlier, label) pairs that never occur become 0 instead of NaN
)
print(counts)


labels         0      1       2       3       4
outlier                                        
0        22762.0    0.0  2451.0  3795.0  3892.0
1            1.0  157.0     0.0     0.0   174.0


In [166]:
# Cross-tabulate cluster labels per `type`: rows are types, columns are cluster labels.
df_temp = df.loc[:, ['type', 'labels']]
counts = (
    df_temp.groupby('type')['labels']
    .value_counts()
    .unstack()
    .fillna(0)  # (type, label) pairs that never occur become 0 instead of NaN
)
print(counts)


labels        0      1       2       3       4
type                                          
dws      3666.0    0.0   138.0  1650.0   298.0
jog      3989.0    0.0     1.0   139.0  1399.0
sit         0.0  157.0     0.0     0.0     0.0
std         1.0    0.0     0.0     0.0   174.0
ups      3280.0    0.0   634.0  1849.0   956.0
wlk     11827.0    0.0  1678.0   157.0  1239.0


#### connectivity = 2, 5 clusters with Manhattan (L1)

In [167]:
# Average-linkage agglomerative clustering into 5 clusters with L1 (Manhattan) distance,
# constrained by the `connectivity_2_clusters` connectivity structure.
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric` —
# confirm the installed version before upgrading.
Z5 = AgglomerativeClustering(
    n_clusters=5,
    linkage='average',
    affinity='l1',
    connectivity=connectivity_2_clusters,
)
# Only the fitted `labels_` attribute is read below, so a plain fit() is enough.
Z5.fit(datao[columns_to_categorize])
print(Z5.labels_)

  connectivity, n_connected_components = _fix_connectivity(


[0 2 0 ... 1 1 1]


In [168]:
# Overwrite df['labels'] with the cluster labels from the latest fit.
# NOTE(review): assumes df rows are in the same order as datao[columns_to_categorize] — confirm.
df['labels'] = Z5.labels_

In [169]:
# Preview the first five rows with the updated `labels` column.
df.head(n=5)

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier,labels
0,mid,low,mid,mid,high,mid,low,mid,mid,mid,low,mid,dws,0,0,0
1,mid,low,mid,mid,high,mid,low,mid,mid,low,mid,low,dws,1,0,2
2,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,2,0,0
3,mid,low,mid,mid,high,mid,mid,mid,mid,low,low,mid,dws,3,0,0
4,mid,low,mid,mid,high,mid,low,mid,mid,low,low,mid,dws,4,0,0


In [170]:
# Cross-tabulate cluster labels per `outlier` value: rows are outlier flags, columns are cluster labels.
df_temp = df.loc[:, ['outlier', 'labels']]
counts = (
    df_temp.groupby('outlier')['labels']
    .value_counts()
    .unstack()
    .fillna(0)  # (outlier, label) pairs that never occur become 0 instead of NaN
)
print(counts)


labels         0       1        2    3      4
outlier                                      
0        17210.0  4095.0  11591.0  4.0    0.0
1            1.0   174.0      0.0  0.0  157.0


In [171]:
# Cross-tabulate cluster labels per `type`: rows are types, columns are cluster labels.
df_temp = df.loc[:, ['type', 'labels']]
counts = (
    df_temp.groupby('type')['labels']
    .value_counts()
    .unstack()
    .fillna(0)  # (type, label) pairs that never occur become 0 instead of NaN
)
print(counts)


labels       0       1       2    3      4
type                                      
dws     3812.0   163.0  1777.0  0.0    0.0
jog     2398.0  1485.0  1641.0  4.0    0.0
sit        0.0     0.0     0.0  0.0  157.0
std        1.0   174.0     0.0  0.0    0.0
ups     3355.0   723.0  2641.0  0.0    0.0
wlk     7645.0  1724.0  5532.0  0.0    0.0


### Winner: connectivity = 2, 5 clusters with L2

### Plot Clusters

In [172]:
# NOTE(review): this plotting cell is entirely commented out and looks like a leftover
# template. It reads 'latitude', 'longitude' and 'name' columns, none of which appear in
# the df.head() output above, and it appends to a `labels` list that is not created in
# this cell. Also, Z5 at this point holds the most recent (affinity='l1') fit, not the
# L2 configuration named as the winner — refit before plotting if this cell is revived.
# fig, ax = plt.subplots(figsize=(8, 8))

# scatter = ax.scatter(df['latitude'], df['longitude'], c=Z5.labels_, cmap='rainbow')

# legend = ax.legend(*scatter.legend_elements(), title="Clusters", bbox_to_anchor=(1, 1))
# ax.add_artist(legend)

# for index, row in df.iterrows():
#     plt.annotate(str(row['name']), xy=(row['latitude'], row['longitude']), xytext=(1, 1), textcoords='offset points')
#     labels.append(row['name'])

# plt.xlabel('Latitude')
# plt.ylabel('Longitude')
# plt.title('Countries')
# plt.locator_params(nbins=10)
# plt.grid()

# plt.show()