-
Notifications
You must be signed in to change notification settings - Fork 4
/
weighted_kmeans.R
65 lines (64 loc) · 2.35 KB
/
weighted_kmeans.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#weighted_kmeans function
weighted_kmeans = function(df, K) {
#df is a dataframe with columns: city name, longitude, latitude, population (as weight)
#K is number of clusters desired
#initialize containers
list_cluster = list()
vect_sse = vector()
#iterate algorithm 3 times (due to local minima)
for (iter in c(1:3)) {
#initialization
init_centroids_index = sample(nrow(df),K)
distance_matrix = matrix(data = NA, nrow = nrow(df), ncol = K)
cluster = vector()
centroid_long = vector()
centroid_lat = vector()
#compute distance between cities and initial centroids (haversine distance)
for (k in c(1:K)) {
for (i in c(1:nrow(df))) {
city_i = as.numeric(df[i,2:3])
centroid_k = as.numeric(df[init_centroids_index[k],2:3])
distance_matrix[i,k] = haversine_dist(city_i,centroid_k)
}
}
#initial cluster assignment for each city
for (i in c(1:nrow(df))) {
cluster[i] = which.min(distance_matrix[i,])
}
#iteration baseline
old_cluster = vector(length = length(cluster))
new_cluster = cluster
#iterations
while (!all(old_cluster == new_cluster)) {
#update old cluster assignment
old_cluster = new_cluster
#calculate centroids (weighted average)
for (k in c(1:K)) {
cluster_k = which(old_cluster == k) #city index of cluster k
centroid_long[k] = weighted.mean(df$longitude[cluster_k], df$population[cluster_k])
centroid_lat[k] = weighted.mean(df$latitude[cluster_k], df$population[cluster_k])
}
df_centroid = as.data.frame(cbind(centroid_long, centroid_lat))
#compute distance between cities and centroids
for (k in c(1:K)) {
for (i in c(1:nrow(df))) {
city_i = as.numeric(df[i,2:3])
centroid_k = as.numeric(df_centroid[k,])
distance_matrix[i,k] = haversine_dist(city_i,centroid_k)
}
}
#update cluster assignment for each city
for (i in c(1:nrow(df))) {
cluster[i] = which.min(distance_matrix[i,])
}
#update new_cluster
new_cluster = cluster
}
#algor iterations result
list_cluster[[iter]] = cluster #cluster assignment
vect_sse[iter] = SSE(df, cluster) #sum of squares error
}
#function result
best_iter = which.min(vect_sse)
return(list_cluster[[best_iter]]) #best cluster assignment (smallest SSE)
}