-
Notifications
You must be signed in to change notification settings - Fork 132
/
crowdsignals_ch5.py
127 lines (97 loc) · 4.73 KB
/
crowdsignals_ch5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
##############################################################
# #
# Mark Hoogendoorn and Burkhardt Funk (2017) #
# Machine Learning for the Quantified Self #
# Springer #
# Chapter 5 #
# #
##############################################################
from util.VisualizeDataset import VisualizeDataset
from Chapter5.DistanceMetrics import InstanceDistanceMetrics
from Chapter5.DistanceMetrics import PersonDistanceMetricsNoOrdering
from Chapter5.DistanceMetrics import PersonDistanceMetricsOrdering
from Chapter5.Clustering import NonHierarchicalClustering
from Chapter5.Clustering import HierarchicalClustering
import copy
import pandas as pd
import matplotlib.pyplot as plot
import util.util as util
# As in Chapter 3, create the plotting helper and load the dataset that the
# previous chapter's script wrote to disk.
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of the type datetime.
dataset_path = './intermediate_datafiles/'
try:
    dataset = pd.read_csv(dataset_path + 'chapter4_result.csv', index_col=0)
except IOError:
    print('File not found, try to run previous crowdsignals scripts first!')
    # A bare raise re-raises the active exception with its original traceback.
    raise

# Index.to_datetime() was deprecated and later removed from pandas;
# pd.to_datetime() performs the same conversion on both old and new releases.
dataset.index = pd.to_datetime(dataset.index)
# First let us use non hierarchical clustering.
clusteringNH = NonHierarchicalClustering()

# Let us look at k-means first.
k_values = list(range(2, 10))
silhouette_values = []

# Do some initial runs to determine the right number for k.
#
# Each print below passes ONE pre-formatted string in parentheses, which
# behaves the same under Python 2 and Python 3 (the former print statements
# are a syntax error on Python 3). The '{} {}' template reproduces the single
# space the print statement inserted between its items, so the output text
# is unchanged.
print('===== kmeans clustering =====')
for k in k_values:
    print('{} {}'.format('k = ', k))
    # A deep copy is handed over on every run so the loaded dataset itself is
    # never modified (presumably the clustering adds columns to the frame it
    # receives -- confirm in Chapter5.Clustering).
    # NOTE(review): the trailing positional arguments ('default', 20, 10) are
    # defined by k_means_over_instances; check its signature for their meaning.
    dataset_cluster = clusteringNH.k_means_over_instances(
        copy.deepcopy(dataset),
        ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
        k, 'default', 20, 10)
    # Average the per-row 'silhouette' column into one score for this k.
    silhouette_score = dataset_cluster['silhouette'].mean()
    print('{} {}'.format('silhouette = ', silhouette_score))
    silhouette_values.append(silhouette_score)

# Plot the mean silhouette score against k.
plot.plot(k_values, silhouette_values, 'b-')
plot.xlabel('k')
plot.ylabel('silhouette score')
plot.ylim([0,1])
plot.show()

# And run k-means (not knn, despite the variable name) once more with the
# selected number of clusters. k is hard-coded to the value the authors picked
# as having the highest silhouette score; it is not derived from
# silhouette_values.
k = 6
dataset_knn = clusteringNH.k_means_over_instances(
    copy.deepcopy(dataset),
    ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
    k, 'default', 50, 50)

DataViz.plot_clusters_3d(dataset_knn, ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], 'cluster', ['label'])
DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette')
util.print_latex_statistics_clusters(dataset_knn, 'cluster', ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], 'label')

# Drop the per-row silhouette column again; only the remaining columns of
# dataset_knn are kept for later use.
del dataset_knn['silhouette']
# Now repeat the same procedure with k-medoids.
k_values = list(range(2, 10))
silhouette_values = []

# Do some initial runs to determine the right number for k.
#
# Each print below passes ONE pre-formatted string in parentheses, which
# behaves the same under Python 2 and Python 3 (the former print statements
# are a syntax error on Python 3). The '{} {}' template reproduces the single
# space the print statement inserted between its items, so the output text
# is unchanged.
print('===== k medoids clustering =====')
for k in k_values:
    print('{} {}'.format('k = ', k))
    # A deep copy is handed over on every run so the loaded dataset itself is
    # never modified (presumably the clustering adds columns to the frame it
    # receives -- confirm in Chapter5.Clustering).
    # NOTE(review): the positional arguments ('default', 20) are defined by
    # k_medoids_over_instances; check its signature for their meaning.
    dataset_cluster = clusteringNH.k_medoids_over_instances(
        copy.deepcopy(dataset),
        ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
        k, 'default', 20, n_inits=10)
    # Average the per-row 'silhouette' column into one score for this k.
    silhouette_score = dataset_cluster['silhouette'].mean()
    print('{} {}'.format('silhouette = ', silhouette_score))
    silhouette_values.append(silhouette_score)

# Plot the mean silhouette score against k.
plot.plot(k_values, silhouette_values, 'b-')
plot.ylim([0,1])
plot.xlabel('k')
plot.ylabel('silhouette score')
plot.show()

# And run k medoids once more with the selected number of clusters. k is
# hard-coded to the value the authors picked as having the highest silhouette
# score; it is not derived from silhouette_values.
k = 6
dataset_kmed = clusteringNH.k_medoids_over_instances(
    copy.deepcopy(dataset),
    ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
    k, 'default', 20, n_inits=50)

DataViz.plot_clusters_3d(dataset_kmed, ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], 'cluster', ['label'])
DataViz.plot_silhouette(dataset_kmed, 'cluster', 'silhouette')
util.print_latex_statistics_clusters(dataset_kmed, 'cluster', ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], 'label')
# And the hierarchical clustering is the last one we try
clusteringH = HierarchicalClustering()

k_values = list(range(2, 10))
silhouette_values = []

# Do some initial runs to determine the right number for the maximum number of clusters.
#
# Each print below passes ONE pre-formatted string in parentheses, which
# behaves the same under Python 2 and Python 3 (the former print statements
# are a syntax error on Python 3). The '{} {}' template reproduces the single
# space the print statement inserted between its items, so the output text
# is unchanged.
print('===== agglomaritive clustering =====')
for k in k_values:
    print('{} {}'.format('k = ', k))
    # The call returns the clustered dataset together with a second value that
    # is later passed to the dendrogram plot.
    # NOTE(review): use_prev_linkage=True presumably lets later iterations
    # reuse the linkage computed earlier -- confirm in Chapter5.Clustering.
    dataset_cluster, linkage = clusteringH.agglomerative_over_instances(
        copy.deepcopy(dataset),
        ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
        k, 'euclidean', use_prev_linkage=True, link_function='ward')
    # Average the per-row 'silhouette' column into one score for this k.
    silhouette_score = dataset_cluster['silhouette'].mean()
    print('{} {}'.format('silhouette = ', silhouette_score))
    silhouette_values.append(silhouette_score)
    # Draw the dendrogram only once, during the first iteration.
    if k == k_values[0]:
        DataViz.plot_dendrogram(dataset_cluster, linkage)

# Plot the mean silhouette score against the maximum number of clusters.
plot.plot(k_values, silhouette_values, 'b-')
plot.ylim([0,1])
plot.xlabel('max number of clusters')
plot.ylabel('silhouette score')
plot.show()
# The k-means outcome (stored in dataset_knn, despite the name) is the one we
# keep: write it out as this chapter's result file.
chapter5_result_file = dataset_path + 'chapter5_result.csv'
dataset_knn.to_csv(chapter5_result_file)