<a href="https://colab.research.google.com/github/mjfadaei/PythonML/blob/main/Python7Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

# Ten 2-D sample points; the single 10.0 literal makes the whole array float.
X = np.array([
    [5, 3], [10.0, 15], [15, 12], [24, 10], [30, 30],
    [85, 70], [71, 80], [60, 78], [70, 55], [80, 91],
])
# Reference class of each point as a (10, 1) column: three 0s, then seven 1s.
Y = np.array([[0]] * 3 + [[1]] * 7)

In [None]:
import matplotlib.pyplot as plt

# Scatter the raw points and tag each one with its row index.
labels = range(10)
plt.figure(figsize=(10, 7))
plt.subplots_adjust(bottom=0.1)
plt.scatter(X[:, 0], X[:, 1], label='True Position')

for label, (x, y) in zip(labels, X):
    # Place the index slightly up and to the left of the marker.
    plt.annotate(label, xy=(x, y), xytext=(-3, 3),
                 textcoords='offset points', ha='right', va='bottom')
plt.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn import metrics
import pandas as pd
import scipy
# Fit a 2-cluster K-Means model (at most 50 iterations) and label every point.
km = KMeans(n_clusters=2, max_iter=50)
km.fit(X)
predicted = km.predict(X)
print(predicted)
print('SSE=', km.inertia_)  # inertia_ holds the SSE value of the fitted model

# Draw cluster 0 in blue and cluster 1 in red, then tag points with their index.
labels = range(10)
for cluster_id, colour in ((0, 'b'), (1, 'r')):
    members = X[predicted == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], c=colour, label='True Position')
for label, (x, y) in zip(labels, X):
    plt.annotate(label, xy=(x, y), xytext=(-3, 3),
                 textcoords='offset points', ha='right', va='bottom')
plt.show()


# ------------------------------------------------------------Evaluation
## -----------------------------------Contingency_matrix
# Cross-tabulate the reference labels Y against the K-Means assignment and
# print the result as a labelled table.
contingency_matrix = metrics.cluster.contingency_matrix(Y, predicted)
col = ["Predicted 0 ", "Predicted 1"]
ind = ['0', '1']
print(pd.DataFrame(data=contingency_matrix, columns=col, index=ind))
## -----------------------------------------Purity
def purity_score(y_true, y_pred):
    """Return the purity of the labelling ``y_pred`` with respect to ``y_true``.

    For every column of the contingency matrix the largest count is taken;
    purity is the sum of those maxima divided by the total count.
    """
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
    dominant_counts = np.amax(table, axis=0)
    return np.sum(dominant_counts) / np.sum(table)
    
# Purity of the K-Means assignment measured against the reference labels.
purity = purity_score(Y, predicted)
print("purity=", purity)
## --------------------------------------Entropy
def Entropy_score(y_true, y_pred):
    """Return the size-weighted average entropy of the clusters in ``y_pred``.

    The contingency matrix of ``y_true`` against ``y_pred`` is computed; each
    column is treated as one cluster.  For a cluster of size ``m_i`` whose
    column holds counts ``m_ij``, the cluster entropy is
    ``-sum_j p_ij * ln(p_ij)`` with ``p_ij = m_ij / m_i`` (zero counts are
    skipped, natural logarithm).  The result is
    ``sum_i m_i * e_i / sum_i m_i``.

    Parameters
    ----------
    y_true : array-like
        Reference class labels.
    y_pred : array-like
        Cluster labels to evaluate.

    Returns
    -------
    float
        Weighted entropy; 0 when every cluster contains a single class.
    """
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    cluster_sizes = contingency_matrix.sum(axis=0)

    Entropy = 0.0
    for i, size in enumerate(cluster_sizes):
        counts = contingency_matrix[:, i]
        # Skip empty cells: they contribute nothing and log(0) is undefined.
        probs = counts[counts != 0] / size
        # Sum over ALL classes in the cluster; the previous code overwrote the
        # running value and kept only the last non-zero class term.
        cluster_entropy = -np.sum(probs * np.log(probs))
        Entropy += cluster_entropy * size
    return Entropy / cluster_sizes.sum()

# Entropy of the K-Means assignment measured against the reference labels.
Entropy = Entropy_score(Y, predicted)
print("Entropy=", Entropy)

## --------------------------------------
# Correlate pairwise distances with cluster co-membership for every unordered
# pair of points.
proximitymatrix = metrics.pairwise_distances(X)

# incidencematrix[i, j] is 1 when points i and j share a cluster, else 0.
_assigned = np.asarray(predicted)
incidencematrix = (_assigned[:, None] == _assigned[None, :]).astype(float)

# Take each pair once (strict upper triangle, same row-major order as a nested
# i < j loop) instead of growing the array with np.append on every iteration.
_rows, _cols = np.triu_indices(len(_assigned), k=1)
corrmat = np.column_stack(
    (proximitymatrix[_rows, _cols], incidencematrix[_rows, _cols])
)

# corr is the 2x2 correlation matrix of (distance, incidence); the
# off-diagonal entry is the Pearson coefficient between the two.
corr = np.corrcoef(np.transpose(corrmat))
# Apply the format to the scalar coefficient; previously the format string
# and the whole matrix were printed as two separate arguments.
print('Pearsons correlation: %.3f' % corr[0, 1])

In [None]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np
from scipy.cluster.hierarchy import dendrogram
# ---
def plot_dendrogram(model, **kwargs):
    """Build a linkage matrix from ``model`` and hand it to ``dendrogram``.

    ``model`` must expose ``children_``, ``distances_`` and ``labels_``.
    Extra keyword arguments are forwarded unchanged to ``dendrogram``.
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # Number of original samples contained in the node created by each merge.
    subtree_sizes = np.zeros(merges.shape[0])
    for step, children in enumerate(merges):
        # An index below n_leaves is a single sample; anything else refers to
        # the node produced by an earlier merge, whose size is already known.
        subtree_sizes[step] = sum(
            1 if child < n_leaves else subtree_sizes[child - n_leaves]
            for child in children
        )

    # Columns: the two merged children, the merge distance, the node size.
    linkage_matrix = np.column_stack(
        [merges, model.distances_, subtree_sizes]
    ).astype(float)

    dendrogram(linkage_matrix, **kwargs)
# ---
# distance_threshold=0 with n_clusters=None is passed so the fitted models
# carry the distances_ attribute that plot_dendrogram reads.

# Single linkage.
modelsingle = AgglomerativeClustering(
    linkage="single", n_clusters=None, distance_threshold=0
).fit(X)
print('linkage:single=', modelsingle.labels_)
plot_dendrogram(modelsingle)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

# Complete linkage.
modelcomplete = AgglomerativeClustering(
    linkage="complete", n_clusters=None, distance_threshold=0
).fit(X)
print('linkage:complete=', modelcomplete.labels_)
plot_dendrogram(modelcomplete)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()