In [1]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import numpy as np
import queue
import threading
import random
import pandas as pd
from importAirbnb import *
from KMeans import *

# Import dataset Airbnb

In [2]:
X = importAirbnb('./data/AB_NYC_2019.csv')

In [3]:
n, d = X.shape
# chosen
n_clusters = 4

# k-Means implementation

In [4]:
kmeans = KMeans(X, n_clusters=n_clusters)

In [5]:
kmeans_speculation = KMeans_Speculation(X, n_clusters)

In [None]:
kmeans_speculation.fit()

# Fit k-Means speculated

In [7]:
# kmeans_s = KMeans_Speculation(n_clusters)
# kmeans_s.fit(X)

In [8]:
# for cluster in range(n_clusters):
#         plt.scatter(X[kmeans_s.labels == cluster, 0], X[kmeans_s.labels == cluster, 1], s=10, label=f"Cluster{cluster}")  
# for center in kmeans_s.centroids:
#     plt.scatter(center[0], center[1], s=50)

# TEST

## Initialize both methods

In both cases:
- start with the same initial centroids
- do a first assignment, this way they start with the same assignments

In [9]:
kmeans = KMeans(n_clusters)

In [10]:
kmeans_speculation = KMeans_Speculation(n_clusters)

---
#### Same initial centroids

In [11]:
kmeans.centroids

array([[ 40.65587, -73.94187,  45.     ,   3.     ,   0.     ,   4.     ,
        332.     ],
       [ 40.69315, -73.9565 , 180.     ,   3.     ,   1.     ,   1.     ,
          0.     ],
       [ 40.68171, -73.92085, 110.     ,  10.     ,  58.     ,   1.     ,
        282.     ],
       [ 40.71527, -73.89829,  70.     ,   1.     ,   1.     ,   1.     ,
        364.     ]])

In [12]:
kmeans_speculation.centroids

array([[ 40.80564, -73.94198,  65.     ,   2.     ,  15.     ,   2.     ,
        292.     ],
       [ 40.71425, -73.84503, 100.     ,  14.     ,   0.     ,   1.     ,
          0.     ],
       [ 40.76097, -73.99959, 200.     ,   4.     ,  38.     ,   1.     ,
        159.     ],
       [ 40.68282, -73.92934, 335.     ,   2.     ,  21.     ,   1.     ,
        304.     ]])

In [13]:
kmeans_speculation.centroids = kmeans.centroids 

---
#### Perform assignment on both

In [14]:
kmeans.labels = kmeans.predict(X)

In [15]:
kmeans_speculation.labels, kmeans_speculation.e = kmeans_speculation.predict(X)

In [16]:
(kmeans.labels != kmeans_speculation.labels).sum()

0

---

#### Normal k-means
- update centroids
- perform assignment

In [17]:
%%time
kmeans.update_centroid(X)


Wall time: 8.01 ms


In [18]:
%%time
kmeans.labels = kmeans.predict(X)

Wall time: 616 ms


---

#### Speculated k-means
- speculate centroids
- do assignment on speculated centroids
- compute correct centroids
- correct assignments

##### Speculate centroids

In [19]:
kmeans_speculation.mask = np.random.choice([True, False], size=X.shape[0], p=[kmeans_speculation.subsample_size, (1-kmeans_speculation.subsample_size)])

In [20]:
%%time
kmeans_speculation.speculate_centroid(X)

Wall time: 3.03 ms


In [21]:
kmeans_speculation.centroids

array([[ 40.72111199, -73.93310276,  49.0255102 ,  14.77040816,
         14.37244898,  12.48469388, 319.10204082],
       [ 40.73073732, -73.95594933, 151.35891089,   5.29733911,
         15.9897896 ,   2.35086634,  26.57116337],
       [ 40.72850502, -73.95084804, 173.68853821,   9.8961794 ,
         45.64700997,  19.55398671, 250.55730897],
       [ 40.7282793 , -73.94355656, 101.81111111,   8.70740741,
          9.70740741,  12.71851852, 355.59259259]])

##### Do assignment on speculated centroids

In [22]:
kmeans_speculation.prev_labels = kmeans_speculation.labels

In [23]:
%%time
kmeans_speculation.labels, kmeans_speculation.e = kmeans_speculation.predict(X)

Wall time: 984 ms


##### Compute correct centroids

In [24]:
kmeans_speculation.speculated_centroids = kmeans_speculation.centroids

In [25]:
%%time
kmeans_speculation.update_centroid(X, kmeans_speculation.prev_labels)

Wall time: 11 ms


In [26]:
print("Difference between speculated centroids and correct centroids in norm-2:")
np.linalg.norm(kmeans_speculation.speculated_centroids - kmeans_speculation.centroids, 2, 1)

Difference between speculated centroids and correct centroids in norm-2:


array([4.64093004, 6.14175291, 6.15399589, 3.56358869])

##### Correct assignments

In [27]:
kmeans_speculation.previous_labels = kmeans_speculation.labels

In [28]:
%%time
kmeans_speculation.correct(X)

Wall time: 114 ms


In [29]:
print("Number of datapoints that changed assignment after correction:")
(kmeans_speculation.labels != kmeans_speculation.previous_labels).sum()

Number of datapoints that changed assignment after correction:


0

---

#### Check both methods finally give same assignments

In [30]:
print("Number of different assignmented between the 2 methods:")
(kmeans.labels != kmeans_speculation.labels).sum()

Number of different assignmented between the 2 methods:


0

# Plot

In [31]:
for cluster in range(n_clusters):
        plt.scatter(X[y == cluster, 0], X[y == cluster, 1], s=10, label=f"Cluster{cluster}")  
for center in kmeans.centroids:
    plt.scatter(center[0], center[1], s=50)

NameError: name 'y' is not defined

# Artificial data generation

In [None]:
# create artificial clusters

def generate_clusters(n_clusters=3, d=2, n=100):
    centers = np.random.rand(n_clusters,d)*15
    cluster_std = np.random.normal(1, 0.5, n_clusters)
    X, y = make_blobs(n_samples=n, cluster_std=cluster_std, centers=centers, n_features=d, random_state=1)
    
    print("2d plot")
    for cluster in range(n_clusters):
        plt.scatter(X[y == cluster, 0], X[y == cluster, 1], s=10, label=f"Cluster{cluster}")  
    return centers, cluster_std, X, y

In [None]:
n_clusters, d, n = 4, 3, 1000

In [None]:
centers, cluster_std, X, y = generate_clusters(n_clusters, d, n)

# Fit k-Means speculated

In [None]:
kmeans_s = KMeans_Speculation(n_clusters)

In [None]:
kmeans_s.fit(X)

In [None]:
for cluster in range(n_clusters):
        plt.scatter(X[kmeans_s.labels == cluster, 0], X[kmeans_s.labels == cluster, 1], s=10, label=f"Cluster{cluster}")  
for center in kmeans_s.centroids:
    plt.scatter(center[0], center[1], s=50)