<a href="https://colab.research.google.com/github/monindew/Hands-on-ML/blob/main/Hands_on_08.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

blob_centers = np.array([[ 0.2,  2.3], [-1.5 ,  2.3], [-2.8,  1.8],
                         [-2.8,  2.8], [-2.8,  1.3]])
blob_std = np.array([0.4, 0.3, 0.1, 0.1, 0.1])
X, y = make_blobs(n_samples=2000, centers=blob_centers, cluster_std=blob_std,
                  random_state=7)

k = 5
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
y_pred = kmeans.fit_predict(X)

In [2]:
y_pred

array([0, 0, 4, ..., 3, 1, 0], dtype=int32)

In [3]:
y_pred is kmeans.labels_

True

In [4]:
kmeans.cluster_centers_

array([[-2.80214068,  1.55162671],
       [ 0.08703534,  2.58438091],
       [-1.46869323,  2.28214236],
       [-2.79290307,  2.79641063],
       [ 0.31332823,  1.96822352]])

In [5]:
import numpy as np
X_new = np.array([[0, 2], [3, 2], [-3, 3], [-3, 2.5]])
kmeans.predict(X_new)

array([4, 4, 3, 3], dtype=int32)

In [6]:
kmeans.transform(X_new).round(2)

array([[2.84, 0.59, 1.5 , 2.9 , 0.31],
       [5.82, 2.97, 4.48, 5.85, 2.69],
       [1.46, 3.11, 1.69, 0.29, 3.47],
       [0.97, 3.09, 1.55, 0.36, 3.36]])

In [7]:
good_init = np.array([[-3, 3], [-3, 2], [-3, 1], [-1, 2], [0, 2]])
kmeans = KMeans(n_clusters=5, init=good_init, n_init=1, random_state=42)
kmeans.fit(X)

In [8]:
kmeans.inertia_

211.59853725816828

In [9]:
kmeans.score(X)

-211.59853725816828

In [10]:
from sklearn.cluster import MiniBatchKMeans

minibatch_kmeans = MiniBatchKMeans(n_clusters=5, random_state=42)
minibatch_kmeans.fit(X)

  super()._check_params_vs_input(X, default_n_init=3)


In [11]:
from sklearn.metrics import silhouette_score
silhouette_score(X, kmeans.labels_)

0.655517642572828

In [12]:
import requests
from PIL import Image
import numpy as np
from io import BytesIO

url = "https://github.com/ageron/handson-ml3/blob/main/images/unsupervised_learning/ladybug.png?raw=true"
response = requests.get(url)
img = Image.open(BytesIO(response.content))
image = np.asarray(img)
image.shape


(533, 800, 3)

In [13]:
X = image.reshape(-1, 3)
kmeans = KMeans(n_clusters=8, random_state=42).fit(X)
segmented_img = kmeans.cluster_centers_[kmeans.labels_]
segmented_img = segmented_img.reshape(image.shape)

  super()._check_params_vs_input(X, default_n_init=10)


In [14]:
kmeans.cluster_centers_

array([[  5.6105612 ,  27.61743918,   1.44857047],
       [250.68280046, 237.8319458 ,   6.51237711],
       [ 54.37997208,  97.09151272,  14.07131629],
       [155.1122467 , 160.29920705,  98.37997063],
       [ 93.53208815, 132.47927103,  39.06947451],
       [ 24.39502109,  63.41611633,   4.09118396],
       [195.12454743,  50.81607531,  11.20057929],
       [223.40683449, 182.94424185,   9.39848253]])

In [15]:
X.shape

(426400, 3)

In [16]:
from sklearn.datasets import load_digits

X_digits, y_digits = load_digits(return_X_y=True)
X_train, y_train = X_digits[:1400], y_digits[:1400]
X_test, y_test = X_digits[1400:], y_digits[1400:]

In [17]:
from sklearn.linear_model import LogisticRegression

n_labeled = 50
log_reg = LogisticRegression(max_iter=10000)
log_reg.fit(X_train[:n_labeled], y_train[:n_labeled])

In [18]:
log_reg.score(X_test, y_test)

0.7481108312342569

In [19]:
k = 50
kmeans = KMeans(n_clusters=k, random_state=42)
X_digits_dist = kmeans.fit_transform(X_train)
representative_digit_idx = np.argmin(X_digits_dist, axis=0)
X_representative_digits = X_train[representative_digit_idx]

  super()._check_params_vs_input(X, default_n_init=10)


In [20]:
y_representative_digits = np.array([
    1, 3, 6, 0, 7, 9, 2, 4, 8, 9,
    5, 4, 7, 1, 2, 6, 1, 2, 5, 1,
    4, 1, 3, 3, 8, 8, 2, 5, 6, 9,
    1, 4, 0, 6, 8, 3, 4, 6, 7, 2,
    4, 1, 0, 7, 5, 1, 9, 9, 3, 7
])

In [21]:
log_reg = LogisticRegression(max_iter=10000)
log_reg.fit(X_representative_digits, y_representative_digits)
log_reg.score(X_test, y_test)

0.06297229219143577

In [22]:
y_train_propagated = np.empty(len(X_train), dtype=np.int64)
for i in range(k):
  y_train_propagated[kmeans.labels_ == i] = y_representative_digits[i]

In [23]:
log_reg = LogisticRegression(max_iter=10000)
log_reg.fit(X_train, y_train_propagated)
log_reg.score(X_test, y_test)

0.07808564231738035

In [24]:
percentile_closet = 90

X_cluster_dist = X_digits_dist[np.arange(len(X_train)), kmeans.labels_]
for i in range(k):
  in_cluster = (kmeans.labels_ == i)
  cluster_dist = X_cluster_dist[in_cluster]
  cutoff_distance = np.percentile(cluster_dist, percentile_closet)
  above_cutoff = (X_cluster_dist > cutoff_distance)
  X_cluster_dist[in_cluster & above_cutoff] = -1

partially_propagated = (X_cluster_dist != -1)
X_train_partially_propagated = X_train[partially_propagated]
y_train_partially_propagated = y_train_propagated[partially_propagated]

In [25]:
log_reg = LogisticRegression(max_iter=10000)
log_reg.fit(X_train_partially_propagated, y_train_partially_propagated)
log_reg.score(X_test, y_test)

0.07304785894206549

In [26]:
(y_train_partially_propagated == y_train[partially_propagated]).mean()

0.09646302250803858

In [27]:
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=1000, noise=0.05)
dbscan = DBSCAN(eps=0.05, min_samples=5)
dbscan.fit(X)

In [28]:
dbscan.labels_

array([ 0,  1,  2,  0,  3,  0, -1, -1,  0,  4,  4,  4,  4,  4,  3,  4,  3,
        4,  0,  3,  3,  1,  4,  7,  3,  4,  4,  2,  0, -1,  3,  4,  0,  1,
        0,  0,  4,  5,  1,  4,  4,  2,  4,  0,  0,  0,  0,  0,  0,  4,  4,
       -1,  1,  0,  4,  0,  2,  4,  4,  0,  1,  2,  0,  4,  2,  5,  2,  2,
        4,  4,  4,  0,  1,  3,  1,  2,  4,  0, -1,  4,  3,  1,  1,  4,  4,
        4,  2,  0,  2,  6,  4,  0,  2,  2,  3,  1, -1,  0,  0,  3, -1,  5,
        4,  2,  2,  1,  5,  4,  3,  4,  3,  7,  0,  0,  0,  3,  2,  4,  6,
        0,  2,  3,  0, -1,  4,  4,  3,  0,  0,  0,  4,  0,  4,  0,  3,  0,
        4,  3,  0, -1,  6,  4,  5,  3,  0,  3,  2,  0,  6,  4,  5,  0, -1,
        0, -1,  0, -1,  2,  0,  4,  2,  5,  3,  3,  1,  2,  4,  5,  5, -1,
        0,  4, -1,  0,  2,  1,  3,  4,  2,  0,  3,  0,  2,  2,  0,  0,  1,
        4,  4,  0,  3,  1,  4,  4,  4,  4,  1,  2,  2,  4,  4,  2,  0,  1,
        0,  4, -1,  1,  3,  2,  3,  3,  3,  4,  0,  3,  0,  0,  4,  4,  0,
        0,  0,  5,  2,  0

In [29]:
len(dbscan.core_sample_indices_)

799

In [30]:
dbscan.core_sample_indices_

array([  0,   1,   2,   3,   4,   5,   8,   9,  10,  12,  14,  15,  16,
        17,  20,  21,  22,  24,  25,  27,  28,  30,  31,  32,  33,  34,
        35,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
        49,  50,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,
        63,  64,  65,  66,  67,  68,  69,  71,  72,  73,  74,  75,  76,
        77,  79,  80,  82,  83,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  97,  98,  99, 101, 103, 106, 107, 108, 109, 110,
       112, 114, 115, 116, 117, 119, 120, 122, 125, 126, 127, 128, 129,
       130, 132, 134, 135, 136, 137, 138, 141, 142, 143, 144, 145, 146,
       147, 148, 149, 150, 151, 153, 155, 158, 159, 160, 161, 162, 163,
       164, 165, 166, 167, 168, 170, 171, 173, 174, 175, 176, 177, 179,
       180, 181, 184, 185, 186, 187, 188, 189, 190, 191, 192, 194, 195,
       196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 207, 209, 210,
       211, 212, 214, 215, 216, 217, 218, 220, 221, 222, 223, 22

In [31]:
dbscan.components_

array([[ 0.292021  , -0.22491662],
       [ 1.61175235, -0.36465843],
       [ 1.9083362 ,  0.11937692],
       ...,
       [ 0.59152733, -0.42612988],
       [ 0.47959728, -0.22567984],
       [ 0.81300211,  0.64439349]])

In [32]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(dbscan.components_, dbscan.labels_[dbscan.core_sample_indices_])

In [33]:
X_new = np.array([[-0.5, 0], [0, 0.5], [1, -0.1], [2, 1]])
print(knn.predict(X_new))
print(knn.predict_proba(X_new))

[5 0 3 2]
[[0.12 0.   0.   0.   0.02 0.78 0.08 0.   0.  ]
 [0.94 0.   0.   0.   0.04 0.   0.   0.   0.02]
 [0.28 0.   0.   0.72 0.   0.   0.   0.   0.  ]
 [0.   0.   1.   0.   0.   0.   0.   0.   0.  ]]


In [34]:
y_dist, y_pred_idx = knn.kneighbors(X_new, n_neighbors=1)
y_pred = dbscan.labels_[dbscan.core_sample_indices_][y_pred_idx]
y_pred[y_dist > 0.2] = -1
y_pred.ravel()

array([-1,  8,  3, -1])