# Setting up

In [None]:
# Clone the course repository that ships the images and datasets.
!git clone https://github.com/mciprian/ml_class_content.git
# Move the image and data folders into the working directory.
!mv ml_class_content/notebooks/img img
!mv ml_class_content/notebooks/data data
# Extract creditcard.zip into data/ and delete the archive afterwards.
!unzip data/creditcard.zip -d data/
!rm data/creditcard.zip
# Remove the rest of the cloned repository.
!rm -fr ml_class_content/


# Methods. Unsupervised. Kmeans

## Working with unlabeled data

Unlabeled data is a central problem in many applications of machine learning. The main goal of unsupervised machine learning methods is to find or infer "labels" from data that has none.

K-means is a simple but powerful (and highly parallelizable) method to discover groups in a dataset and, from those groups, propose a label for every point.

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [None]:
from sklearn.datasets import make_blobs

# Generate 50 points spread around 3 centers, with one standard deviation per center.
blob_settings = dict(
    n_samples=50, centers=3, cluster_std=[2.5, 1.5, 2.8], random_state=42
)
X, y = make_blobs(**blob_settings)

# Show the raw points without any label information.
x_coords, y_coords = X[:, 0], X[:, 1]
plt.scatter(x_coords, y_coords)
plt.show()

 <span style="color:blue">Discuss the shapes/"groups" visible in the figure. Is there a label?</span>

  <span style="color:blue">Propose a method to "label" the groups </span>

#### One center, one group

We use the following error function: $\frac{1}{n} \sum_{i=1}^{n} (c_k - x_i)^2$, where $c_k$ is the $k$-th center, $x_i$ is the $i$-th point and $n$ is the number of points.

In [None]:
# Candidate position for the single center.
center_A = np.asarray([-12, -3])

# Mean of the squared coordinate differences between the center and every point.
error_A = np.mean(np.power(center_A - X, 2))

# Data points first, then the candidate center drawn as an orange cross.
plt.scatter(X[:, 0], X[:, 1])
plt.scatter(
    center_A[0], center_A[1], label="center A", c="orange", marker="x", s=150
)
plt.legend()
plt.show()

print("Center A error %f" % error_A)

<span style="color:blue">Propose the best center (the lowest error) </span>

#### Two centers, two groups

In [None]:
# Candidate positions for the two centers.
center_A = np.asarray([-5, -3])
center_B = np.asarray([5, 3])

# Error of each center measured against the whole dataset (no membership yet).
error_A = np.mean(np.power(center_A - X, 2))
error_B = np.mean(np.power(center_B - X, 2))

# Data points first, then both candidate centers drawn as crosses.
cross_style = {"marker": "x", "s": 150}
plt.scatter(X[:, 0], X[:, 1])
plt.scatter(center_A[0], center_A[1], label="center A", c="orange", **cross_style)
plt.scatter(center_B[0], center_B[1], label="center B", c="red", **cross_style)
plt.legend()
plt.show()

print("Center A error %f" % error_A)
print("Center B error %f" % error_B)

<span style="color:blue">Taking into account the "visible groups", do these metrics make sense?</span>

<span style="color:blue">Propose a method to assign each point to one cluster </span>

#### Group assignment of every point

In [None]:
def plot_points_centers(X, center_A, center_B, assignemt, title=None):
    """
    Scatter the points colored by membership together with both centers.

    Points whose `assignemt` entry equals True are drawn in orange (the color
    of center A); every other point is drawn in red (the color of center B).
    Both centers are drawn as crosses and listed in the legend. When `title`
    is truthy it is used as the figure title. The figure is shown immediately
    and nothing is returned.
    """
    point_colors = []
    for belongs_to_a in assignemt:
        point_colors.append("orange" if belongs_to_a == True else "red")
    plt.scatter(X[:, 0], X[:, 1], c=point_colors)

    cross_style = {"marker": "x", "s": 150}
    plt.scatter(center_A[0], center_A[1], label="center A", c="orange", **cross_style)
    plt.scatter(center_B[0], center_B[1], label="center B", c="red", **cross_style)
    plt.legend()

    if title:
        plt.title(title)
    plt.show()


# Starting guess for the two centers
center_A, center_B = np.asarray([-5, -3]), np.asarray([5, 3])

# Per-point error against each center (mean squared coordinate difference)
errors_center_a = np.mean(np.power(X - center_A, 2), axis=1)
errors_center_b = np.mean(np.power(X - center_B, 2), axis=1)
# True -> the point is strictly closer to A, False -> it belongs to B
assignemt = errors_center_a < errors_center_b

# Draw the points colored by membership together with both centers
plot_points_centers(X, center_A, center_B, assignemt)

# Error of each center measured only on the points assigned to it
members_a, members_b = X[assignemt], X[~assignemt]
print("Center A error %f" % np.mean(np.power(center_A - members_a, 2)))
print("Center B error %f" % np.mean(np.power(center_B - members_b, 2)))

<span style="color:blue">Propose new values for the two centers, trying to minimize errors</span>

## K-means

K-means is a clustering method (vector quantization i.e.: modeling of probability density functions by the distribution of prototype vectors), that aims to partition $n$ observations into $k$ clusters.

The following objective function guides the search for the best position of every center:

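$$\underset{S}{\arg\min}\ \sum_{k=1}^{C} \sum_{x_i \in S_k} \lVert x_i - c_k \rVert^2$$

where $S_k$ is the set of points assigned to the center $c_k$ and $C$ is the number of centers.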

In [None]:
# Initial centers
center_A = np.asarray([-5, -3])
center_B = np.asarray([5, 3])

# Per-point error to each center, then membership: True -> A, False -> B
errors_center_a, errors_center_b = (
    np.mean(np.power(X - center, 2), axis=1) for center in (center_A, center_B)
)
assignemt = errors_center_a < errors_center_b

plot_points_centers(X, center_A, center_B, assignemt)

# Error of each center restricted to its own members
in_cluster_a = assignemt == True
print("Center A error %f" % np.mean(np.power(center_A - X[in_cluster_a], 2)))
print("Center B error %f" % np.mean(np.power(center_B - X[~in_cluster_a], 2)))

#### Update centers, by using average values on every membership category.

In [None]:
def my_two_kmeans(X, center_A, center_B, n_its=2, debug=True):
    """
    Run a fixed number of K-means iterations with exactly two centers.

    Every iteration assigns each point to the center with the smaller error
    (mean squared coordinate difference; ties go to center B) and then moves
    each center to the mean of the points assigned to it. A center that
    receives no points keeps its previous position instead of becoming NaN.

    Parameters
    ----------
    X : array-like of shape (n_points, n_features)
        Dataset to cluster. Plotting (debug=True) uses the first two columns.
    center_A, center_B : array-like of shape (n_features,)
        Initial positions of the two centers. The inputs are not modified.
    n_its : int, default 2
        Number of assignment/update iterations.
    debug : bool, default True
        When True, plot the points and centers and print the per-center error
        for the initial state ("Episode 0") and after every iteration.

    Returns
    -------
    float
        Inertia, i.e. the sum of squared coordinate differences between every
        point and its assigned center, computed with the centers of the last
        iteration and the membership used to update them.

    Notes
    -----
    The inertia is printed for the initial state and after every iteration
    regardless of `debug`.
    """
    # Work on float copies so integer inputs are neither truncated nor mutated.
    X = np.asarray(X, dtype=float)
    center_A = np.asarray(center_A, dtype=float)
    center_B = np.asarray(center_B, dtype=float)

    def calculate_centroid_membership(center_A, center_B):
        """
        Assign every point to its closest center: True -> A, False -> B.
        """
        errors_center_a = np.mean(np.power(X - center_A, 2), axis=1)
        errors_center_b = np.mean(np.power(X - center_B, 2), axis=1)
        return errors_center_a < errors_center_b

    def calculate_inertia(assignemt, center_A, center_B):
        """
        Sum of squared differences between each point and its assigned center.
        """
        return float(
            np.sum(np.power(X[assignemt] - center_A, 2))
            + np.sum(np.power(X[~assignemt] - center_B, 2))
        )

    def calculate_distortion(center, members):
        """
        Mean squared difference between a center and its members (NaN if none).
        """
        if len(members) == 0:
            return float("nan")
        return np.mean(np.power(center - members, 2))

    def report(title, assignemt, center_A, center_B):
        """
        Plot and print the per-center error when debugging; always print inertia.
        """
        if debug:
            plot_points_centers(X, center_A, center_B, assignemt, title)
            print("Center A error %f" % calculate_distortion(center_A, X[assignemt]))
            print("Center B error %f" % calculate_distortion(center_B, X[~assignemt]))
        print("Inertia: %f" % calculate_inertia(assignemt, center_A, center_B))

    # Initial state, before any center has been moved
    assignemt = calculate_centroid_membership(center_A, center_B)
    report("Episode 0", assignemt, center_A, center_B)

    # Adjust centers
    for i in range(n_its):
        assignemt = calculate_centroid_membership(center_A, center_B)

        # Move each center to the mean of its members. An empty cluster keeps
        # its previous center: the mean of an empty slice is NaN, and a NaN
        # center would corrupt every later assignment.
        if assignemt.any():
            center_A = np.mean(X[assignemt], axis=0)
        if not assignemt.all():
            center_B = np.mean(X[~assignemt], axis=0)

        report("Episode %s" % str(i + 1), assignemt, center_A, center_B)

    return calculate_inertia(assignemt, center_A, center_B)


# Initial positions of the two centers
center_A, center_B = np.asarray([-5, -3]), np.asarray([5, 3])

# Run three iterations and keep the value returned by my_two_kmeans
distortion = my_two_kmeans(X, center_A=center_A, center_B=center_B, n_its=3)

#### Check the results by using the sklearn library

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Fit scikit-learn's K-means with two clusters, random initialization and a
# single run so the result can be compared with the hand-written version.
kmeans = KMeans(n_clusters=2, init="random", random_state=0, n_init=1, verbose=2).fit(X)

# Only the last expression of a cell is displayed, so the intermediate
# results are printed explicitly instead of being silently discarded.
print("Labels:", kmeans.labels_)
print("Cluster centers:\n", kmeans.cluster_centers_)
kmeans.inertia_

In [None]:
# Color every point by its predicted cluster: label 1 -> orange, label 0 -> red,
# matching the colors used for the two centers below.
point_colors = ["orange" if label == 1 else "red" for label in kmeans.labels_]
plt.scatter(X[:, 0], X[:, 1], c=point_colors)

# Each row of cluster_centers_ holds the coordinates of one center.
center_a_x, center_a_y = kmeans.cluster_centers_[1]
center_b_x, center_b_y = kmeans.cluster_centers_[0]

plt.scatter(
    center_a_x,
    center_a_y,
    label="center A",
    c="orange",
    marker="x",
    s=150,
)
# The y coordinate of center B was previously read from index [0][0] (its x
# coordinate), which drew the cross in the wrong place.
plt.scatter(
    center_b_x,
    center_b_y,
    label="center B",
    c="red",
    marker="x",
    s=150,
)
plt.legend()
plt.show()

 <span style="color:blue">Discuss the number of centroids, the distortion and the structure of the dataset</span>


 <span style="color:blue">Is the number of centroids valid for the structure given by the data?</span>


# Understanding Clustering metrics

In [None]:
from IPython.display import YouTubeVideo
# Embed the video https://www.youtube.com/watch?v=AtxQ0rvdQIA in the cell output
YouTubeVideo('AtxQ0rvdQIA',width=800, height=600)

#### How many centroids ?

The elbow method can be used to guess the best number of groups. It tracks the distortion, i.e. the sum of squared differences between each example and its corresponding representative centroid, while incrementally increasing the number of centroids used.

In [None]:
n_centroids = 10
# Candidate numbers of clusters: 2 .. n_centroids (inclusive)
k_values = range(2, n_centroids + 1)

# One distortion (inertia) and one silhouette score per candidate k
distortions, silhouettes = [], []
for n in k_values:
    k_model = KMeans(n_clusters=n, init="random", random_state=0, n_init=1, verbose=0)
    k_model.fit(X)
    distortions.append(k_model.inertia_)
    silhouettes.append(silhouette_score(X, k_model.predict(X)))

# Distortion curve
plt.plot(k_values, distortions)
plt.xlabel("number of centroids")
plt.ylabel("distortion")
plt.title("Distortions by k")
plt.show()

# Silhouette curve
plt.plot(k_values, silhouettes)
plt.xlabel("number of centroids")
plt.ylabel("silhouettes")
plt.title("Silhouettes by k")
plt.show()

  <span style="color:blue">Propose the best number of clusters </span>
  
  <span style="color:blue">Plot your solution and compare with other colleagues </span>

## K-means exercise with financial data

In [None]:
# Load the credit-card CSV from the data/ folder
df_credit = pd.read_csv("data/creditcard.csv", sep=",", quotechar='"')
# Preview the first rows
df_credit.head()

 <span style="color:blue">Calculate the best number of clusters in the data</span>

In [None]:
n_centroids = 40
use_silhouette = False  # when False, no silhouette score is computed or plotted
# NOTE(review): range(27) selects columns V1..V27 only — confirm that no further
# V column of the dataset should be part of the training features.
train_data = df_credit[["V%s" % str(i + 1) for i in range(27)]]
# Candidate numbers of clusters: 2 .. n_centroids (inclusive)
k_values = range(2, n_centroids + 1)

# Keep every fitted model together with its distortion (inertia)
distortions, silhouettes, k_models = [], [], []
for n in k_values:
    k_model = KMeans(n_clusters=n, init="random", random_state=0, n_init=1, verbose=0)
    k_model.fit(train_data)
    k_models.append(k_model)
    distortions.append(k_model.inertia_)
    if not use_silhouette:
        continue
    silhouettes.append(silhouette_score(train_data, k_model.predict(train_data)))

# Distortion curve
plt.plot(k_values, distortions)
plt.xlabel("number of centroids")
plt.ylabel("distortion")
plt.title("Distortions by k")
plt.show()

# Silhouette curve, only when the scores were computed
if use_silhouette:
    plt.plot(k_values, silhouettes)
    plt.xlabel("number of centroids")
    plt.ylabel("silhouettes")
    plt.title("Silhouettes by k")
    plt.show()

In [None]:
# Columns V1..V27 of the credit-card frame are used as features
feature_columns = ["V%s" % str(i + 1) for i in range(27)]

# Single K-means run with 15 clusters and a fixed random state
model = KMeans(n_clusters=15, init="random", random_state=0, n_init=1, verbose=0)
model.fit(df_credit[feature_columns])

In [None]:
# groups_counts: cluster label -> sum of the "Class" column over the cluster's rows
# groups_size:   cluster label -> number of rows assigned to the cluster
groups_counts = {}
groups_size = {}
for cluster_label, class_value in zip(model.labels_, df_credit["Class"]):
    # Both tallies start from zero. Seeding the sum with the first row's Class
    # value and then adding that value again counted the first row of every
    # cluster twice.
    groups_counts[cluster_label] = groups_counts.get(cluster_label, 0) + class_value
    groups_size[cluster_label] = groups_size.get(cluster_label, 0) + 1

In [None]:
# Sum of the Class column per cluster
cluster_labels = list(groups_counts.keys())
plt.bar(cluster_labels, list(groups_counts.values()), width=1, color="black")
plt.title("Counts of Class")
plt.show()

# Number of rows per cluster
plt.bar(list(groups_size.keys()), list(groups_size.values()), width=1, color="green")
plt.title("Size of each cluster")
plt.show()

In [None]:
# Per cluster: summed Class value divided by the number of rows in the cluster
ratio_per_cluster = [
    groups_counts[label] / groups_size[label] for label in groups_size
]
plt.bar(list(groups_size.keys()), ratio_per_cluster, width=1, color="green")
plt.title("Fraud ratio per cluster")
plt.show()

 <span style="color:blue">Discuss the results</span>
    
<span style="color:blue">Is the clustering valid to detect fraud?</span>

# Beyond K-means

There are plenty of clustering algorithms, let's see some:



## DBScan

In [None]:
from IPython.display import YouTubeVideo
# https://www.youtube.com/watch?v=RDZUdRSDOok
YouTubeVideo('RDZUdRSDOok',width=800, height=600)

In [None]:
from sklearn.cluster import DBSCAN

# Cluster the blob dataset with DBSCAN (eps=3, min_samples=4)
dbscan_model = DBSCAN(eps=3, min_samples=4)
dbscan_model.fit(X)

In [None]:
# Color every point by the label DBSCAN gave it
plt.scatter(X[:, 0], X[:, 1], c=dbscan_model.labels_)

See also: [HDBScan](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.HDBSCAN.html#sklearn.cluster.HDBSCAN)

# Agglomerative Clustering

In [None]:
# from https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html

import numpy as np
from scipy.cluster.hierarchy import dendrogram

from sklearn.cluster import AgglomerativeClustering

def plot_dendrogram(model, **kwargs):
    """
    Build a linkage matrix from a fitted model and hand it to `dendrogram`.

    The model must expose `children_` (one row of two child indices per merge),
    `distances_` (one value per merge) and `labels_` (one entry per sample).
    Extra keyword arguments are forwarded unchanged to `dendrogram`.
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # Number of original samples under each merge node. A child index below
    # n_leaves is a single sample; any other index refers to an earlier merge,
    # whose size has already been computed.
    node_sizes = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        node_sizes[row] = sum(
            1 if child < n_leaves else node_sizes[child - n_leaves]
            for child in pair
        )

    # Columns: the two children, the merge distance, the node size
    linkage_columns = [merges, model.distances_, node_sizes]
    linkage_matrix = np.column_stack(linkage_columns).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

# Fit the hierarchical model; plot_dendrogram reads its children_, distances_
# and labels_ attributes afterwards.
agg_model = AgglomerativeClustering(n_clusters=None, distance_threshold=0)
agg_model.fit(X)

# Only the top three levels of the dendrogram are drawn
dendrogram_options = {"truncate_mode": "level", "p": 3}

plt.title("Hierarchical Clustering Dendrogram")
plot_dendrogram(agg_model, **dendrogram_options)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()
