In [None]:
# Uncomment to install the dependencies into the kernel's environment.
# matplotlib is listed because the visualization section imports it.
# %pip install datasets sentence-transformers scikit-learn matplotlib

# Task: Text Clustering with Custom KMeans

Using `sentence-transformers`, convert a set of texts into embeddings and cluster them with a custom implementation of KMeans. The goal is to group similar texts under different distance metrics (such as cosine, Euclidean, correlation, or Mahalanobis distance) implemented in the KMeans algorithm.

## 1. Data loading

Load the GO Emotions dataset from Hugging Face to obtain the texts that will be clustered. If you prefer another dataset, you may use it instead.

In [None]:
from datasets import load_dataset

# Fetch the training split of GO Emotions from the Hugging Face Hub.
dataset = load_dataset(
    'google-research-datasets/go_emotions',
    split='train',
)


Print some examples

In [None]:
# Preview the first five texts, numbered from 1.
for number in range(1, 6):
    print(f"Text {number}: {dataset[number - 1]['text']}")

## 2. Convert texts to embeddings

Use the sentence-transformers library to transform the texts into embeddings (numerical vectors) for clustering.

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# The custom KMeans in section 3 assigns labels with a Python loop over every
# sample, so clustering the full split is slow. Embed only the first N_SAMPLES
# texts by default; set N_SAMPLES = None to use the whole split.
N_SAMPLES = 5_000
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'

subset = dataset if N_SAMPLES is None else dataset.select(range(min(N_SAMPLES, len(dataset))))
# `texts` is reused later to print example members of each cluster.
texts = list(subset['text'])

model = SentenceTransformer(EMBEDDING_MODEL)
# One row per text; returned as a NumPy array of shape (n_texts, embedding_dim).
embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)

assert embeddings.shape[0] == len(texts), "expected exactly one embedding per text"
print(f"Shape of embeddings: {embeddings.shape}")


## 3. Implement custom KMeans

Create a custom implementation of KMeans that supports at least four different distance metrics (Correlation, Euclidean, Cosine, Mahalanobis). Do not use any specialized libraries; only NumPy is allowed.
Here are the formulas for correlation distance and Mahalanobis distance:

#### Correlation Distance
Correlation distance measures the difference in the direction of vectors rather than their magnitude. The formula for correlation distance is:

$$
d_{\text{corr}}(a, b) = 1 - \frac{\sum_{i=1}^{n}(a_i - \bar{a})(b_i - \bar{b})}{\sqrt{\sum_{i=1}^{n}(a_i - \bar{a})^2} \sqrt{\sum_{i=1}^{n}(b_i - \bar{b})^2}}
$$

where:
- $a$ and $b$ are two vectors,
- $\bar{a}$ and $\bar{b}$ are the mean values of the components of vectors $a$ and $b$, respectively.

or, equivalently, using `np.corrcoef`:

$$
d_{\text{corr}}(a, b) = 1 - \texttt{np.corrcoef}(a, b)[0, 1]
$$


#### Mahalanobis Distance
Mahalanobis distance accounts not only for the distance between points but also for the covariance between the features. The formula is:

$$
d_{\text{mahal}}(a, b) = \sqrt{(a - b)^T S^{-1} (a - b)}
$$

where:
- $a$ and $b$ are two vectors,
- $S$ is the covariance matrix of the features,
- $S^{-1}$ is the inverse covariance matrix.


In [None]:
import numpy as np

def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between two 1-D vectors.

    Args:
        a: First vector (array-like of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        float: ``sqrt(sum((a_i - b_i) ** 2))``.
    """
    delta = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    return float(np.sqrt(np.dot(delta, delta)))

def cosine_distance(a, b):
    """Return the cosine distance ``1 - cos(a, b)`` between two 1-D vectors.

    Args:
        a: First vector (array-like of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        float: A value in ``[0, 2]``; 0 for vectors pointing the same way,
        1 for orthogonal vectors, 2 for opposite vectors. If either vector has
        zero norm the direction is undefined and 1.0 is returned instead of NaN.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0.0:
        # Avoid 0/0 -> NaN, which would corrupt the argmin over centroids.
        return 1.0
    similarity = np.dot(a, b) / denominator
    # Rounding can push the ratio slightly outside [-1, 1]; clip so the
    # distance never becomes negative.
    return float(1.0 - np.clip(similarity, -1.0, 1.0))

def mahalanobis_distance(a, b, **kwargs):
    """Return the Mahalanobis distance ``sqrt((a - b)^T S^-1 (a - b))``.

    Args:
        a: First vector (array-like of numbers).
        b: Second vector, same length as ``a``.
        **kwargs: Must contain ``inv_cov_matrix``, the inverse covariance
            matrix ``S^-1`` of the features, with shape (n_features, n_features).

    Returns:
        float: The Mahalanobis distance between ``a`` and ``b``.

    Raises:
        ValueError: If ``inv_cov_matrix`` is not supplied.
    """
    inv_cov_matrix = kwargs.get('inv_cov_matrix')
    if inv_cov_matrix is None:
        raise ValueError(
            "mahalanobis_distance requires the 'inv_cov_matrix' keyword argument"
        )
    delta = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    squared = delta @ np.asarray(inv_cov_matrix, dtype=float) @ delta
    # A numerically imperfect inverse can yield a tiny negative quadratic form;
    # clamp it so the square root stays real.
    return float(np.sqrt(max(squared, 0.0)))

def correlation_distance(a, b):
    """Return the correlation distance ``1 - pearson_r(a, b)`` of two 1-D vectors.

    This is the cosine distance between the mean-centred vectors.

    Args:
        a: First vector (array-like of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        float: A value in ``[0, 2]``; 0 for perfectly correlated vectors and
        2 for perfectly anti-correlated ones. If either vector is constant the
        correlation is undefined and 1.0 is returned instead of NaN.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    a_centered = a - a.mean()
    b_centered = b - b.mean()
    denominator = np.linalg.norm(a_centered) * np.linalg.norm(b_centered)
    if denominator == 0.0:
        # Zero variance -> 0/0; return a neutral distance rather than NaN.
        return 1.0
    correlation = np.dot(a_centered, b_centered) / denominator
    # Clip to absorb rounding error so the distance stays within [0, 2].
    return float(1.0 - np.clip(correlation, -1.0, 1.0))

Implement custom K-Means class

In [None]:
class CustomKMeans:
    """K-Means clustering with a pluggable distance metric (NumPy only).

    Centroids are initialised from randomly chosen samples (NumPy's global
    random state) and updated as the arithmetic mean of their members. The
    mean is the exact minimiser only for squared Euclidean distance; for the
    other metrics it is used as a heuristic.

    Args:
        n_clusters: Number of clusters to form.
        max_iter: Maximum number of assign/update iterations.
        distance_metric: ``'euclidean'``, ``'cosine'``, ``'correlation'`` or
            ``'mahalanobis'`` (dispatched to the module-level
            ``<name>_distance`` functions), or a callable
            ``f(x, centroid, **kwargs) -> float``.
        **kwargs: Extra keyword arguments forwarded to the Mahalanobis metric
            or to a callable metric, e.g. the inverse covariance matrix.

    Attributes:
        centroids: Array of shape (n_clusters, n_features) after ``fit``.
        labels_: Array of shape (n_samples,) with the cluster index of each
            sample after ``fit``.
    """

    SUPPORTED_METRICS = ('euclidean', 'cosine', 'correlation', 'mahalanobis')

    def __init__(self, n_clusters=3, max_iter=100, distance_metric='euclidean',  **kwargs):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.distance_metric = distance_metric
        self.centroids = None
        self.labels_ = None
        self.kwargs = kwargs  # e.g. inverse covariance matrix for Mahalanobis distance

    def fit(self, X):
        """Cluster the rows of ``X`` and store ``centroids`` and ``labels_``.

        Args:
            X: Array-like of shape (n_samples, n_features).

        Returns:
            CustomKMeans: ``self``, so calls can be chained.

        Raises:
            ValueError: If ``X`` is not 2-D, if ``n_clusters`` is not in
                ``[1, n_samples]``, or if the metric name is unknown.
        """
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_samples, n_features), got shape {X.shape}")
        if not 0 < self.n_clusters <= len(X):
            raise ValueError(
                f"n_clusters must be between 1 and n_samples={len(X)}, got {self.n_clusters}"
            )
        if not np.issubdtype(X.dtype, np.floating):
            # Integer input would truncate the centroid means.
            X = X.astype(float)
        # Fail fast on an unknown metric instead of deep inside the loop.
        self._resolve_metric()

        random_indices = np.random.choice(len(X), self.n_clusters, replace=False)
        self.centroids = X[random_indices]

        for _ in range(self.max_iter):
            self.labels_ = np.array([self._assign_label(x) for x in X])

            # An empty cluster keeps its previous centroid.
            new_centroids = self.centroids.copy()
            for cluster in range(self.n_clusters):
                members = X[self.labels_ == cluster]
                if len(members) > 0:
                    new_centroids[cluster] = members.mean(axis=0)

            # Tolerance-based check; labels_ already matches self.centroids here.
            if np.allclose(self.centroids, new_centroids):
                break
            self.centroids = new_centroids

        return self

    def _resolve_metric(self):
        """Return ``(metric_function, extra_kwargs)`` for ``self.distance_metric``.

        Raises:
            ValueError: If the metric is a name outside ``SUPPORTED_METRICS``.
        """
        metric = self.distance_metric
        if callable(metric):
            return metric, self.kwargs
        if metric not in self.SUPPORTED_METRICS:
            raise ValueError(
                f"Unknown distance_metric {metric!r}; expected one of "
                f"{self.SUPPORTED_METRICS} or a callable"
            )
        registry = {
            'euclidean': (euclidean_distance, {}),
            'cosine': (cosine_distance, {}),
            'correlation': (correlation_distance, {}),
            # Only Mahalanobis consumes the extra keyword arguments.
            'mahalanobis': (mahalanobis_distance, self.kwargs),
        }
        return registry[metric]

    def _assign_label(self, x):
        """Return the index of the centroid nearest to ``x`` under the chosen metric.

        Args:
            x: A single sample of shape (n_features,).

        Returns:
            The index (cluster label) of the closest centroid, as returned by
            ``np.argmin``; ties resolve to the lowest index.
        """
        metric, extra = self._resolve_metric()
        distances = [metric(x, centroid, **extra) for centroid in self.centroids]
        return np.argmin(distances)


## 4. Fit the custom K-Means

Fit the custom KMeans model to the dataset using all of the distance metrics and obtain cluster labels.

In [None]:
n_clusters = 10

# CustomKMeans.fit picks its initial centroids with np.random.choice, so seed
# NumPy's global RNG to make the resulting labels reproducible across runs.
np.random.seed(42)

# Only needed for distance_metric='mahalanobis': the inverse covariance matrix
# is handed to CustomKMeans as an extra keyword argument (kept in self.kwargs).
#cov_matrix = np.cov(embeddings, rowvar=False)
#inv_cov_matrix = np.linalg.inv(cov_matrix)

# Use the n_clusters variable (not a second literal) so the cluster-printing
# cell further down always iterates over the same number of clusters.
custom_kmeans = CustomKMeans(n_clusters=n_clusters, distance_metric='correlation')
custom_kmeans.fit(embeddings)

labels = custom_kmeans.labels_

print("Custom KMeans labels for the first 10 texts:")
print(labels[:10])


## 5. Visualize the results

Visualize the clusters by reducing the dimensionality of the embeddings using PCA and plotting the clusters in 2D space.  
You should get something like this:

<a href="https://ibb.co/nRY9hQf"><img src="https://i.ibb.co/zNBpKPb/output.png" alt="output" border="0"></a>

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Project the embeddings onto their first two principal components so the
# clusters can be drawn in 2D. random_state keeps the projection stable if
# scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=2, random_state=42)
embeddings_2d = pca.fit_transform(embeddings)

fig, ax = plt.subplots(figsize=(10, 7))
scatter = ax.scatter(
    embeddings_2d[:, 0],
    embeddings_2d[:, 1],
    c=labels,
    cmap='tab10',
    s=10,
    alpha=0.7,
)
ax.set_title("Custom KMeans clusters (PCA projection of text embeddings)")
ax.set_xlabel("Principal component 1")
ax.set_ylabel("Principal component 2")
# num=None creates one legend entry per distinct cluster label.
ax.legend(*scatter.legend_elements(num=None), title="Cluster", loc="best")
plt.show()

Let's print examples of the text for each cluster

In [None]:
# Show up to five member texts of every cluster as a quick qualitative check.
for cluster in range(n_clusters):
    print(f"\nCluster {cluster}:")
    cluster_texts = [
        member
        for position, member in enumerate(texts)
        if labels[position] == cluster
    ]
    for sample in cluster_texts[:5]:
        print(f"- {sample}")

## 6. Report

Draw a conclusion and write a short report. What are the differences between the methods used? What are their limitations? When is each of them applicable?