# Labolatorium nr 4

## Import bibliotek

In [16]:
import numpy as np
from collections import Counter
from sklearn.cluster import DBSCAN
from time import time
import pandas as pd

## Zadanie 1 - implementacja metryk

In [17]:
def tokenize(text, n=2):
    """Create a 'vector' of n-grams from text"""
    return Counter([text[i:i + n] for i in range(len(text) - n + 1)])

def vector_norm(ngram):
    return np.sqrt(np.sum([value ** 2 for value in ngram.values()]))

### Metryka cosinusowa

$$
    \cos(\theta) = \frac{\mathbf{A} \cdot \mathbf{B}}{\|\mathbf{A}\| \|\mathbf{B}\|}= \frac{\sum\limits_{i=1}^{n} A_i B_i}{\sqrt{\sum\limits_{i=1}^{n} A_i^2} \sqrt{\sum\limits_{i=1}^{n} B_i^2}}
    \qquad\begin{aligned}
    &\text{where:} \\
    &\mathbf{A}\text{ and }\mathbf{B} \text{ are the two vectors being compared}\\
    &n \text{ is the dimensionality of the vectors}\\
    &\theta \text{ represents the angle between two vectors } \mathbf{A} \text{ and } \mathbf{B} \text{ in a high-dimensional space}
    \end{aligned}
$$

The dot product of $\mathbf{A}$ and $\mathbf{B}$ is divided by the product of their Euclidean lengths to normalize the result to a range of [-1, 1]. A value of 1 indicates that the two vectors are identical, while a value of -1 indicates that they are completely dissimilar.


In [18]:
def cosine_metric(x, y, n=3):
    x_vec = tokenize(x, n)
    y_vec = tokenize(y, n)

    result = sum([value * y_vec[key]  for key, value in x_vec.items() if key in y_vec])

    normalize_factor = vector_norm(x_vec) * vector_norm(y_vec)
    return 1 - result / normalize_factor if normalize_factor != 1 else 1

### Metryka Euklidesowa

$$
    d(x,y) = \sqrt{\sum_{i=1}^{n}(x_i-y_i)^2}
    \qquad\begin{aligned}
    &\text{where:} \\
    &d(x,y) \text{ is the Euclidean distance} \\
    &x_i, y_i \text{ are the values of the i-th dimension of vectors } x \text{ and } y \\
    &n \text{ is the number of dimensions in the vectors}
    \end{aligned}
$$

In [19]:
def euclidean_metric(x, y, n=3):
    x_vec = tokenize(x, n)
    y_vec = tokenize(y, n)

    return np.sqrt(sum([(x_vec[key] - y_vec[key]) ** 2 if key in y_vec else x_vec[key] ** 2  for key in x_vec ]) + sum([y_vec[key] ** 2 for key in y_vec if key not in x_vec]))

## Metryka DICE
$$
    \text{Dice}(A, B) = \frac{2 |A \cap B|}{|A| + |B|} 
    \qquad\begin{aligned}
    &\text{where:} \\
    &A \text{ and } B \text{ represent the two sets being compared} \\
    &|A| \text{ and } |B| \text{ represent the cardinality (number of elements) of the sets} \\
    &\text{and } |A \cap B| \text{ represents the size of the intersection of the two sets}
    \end{aligned}
$$


In [20]:
def dice_metric(x,y,n=3):
    x_vec = tokenize(x, n)
    y_vec = tokenize(y, n)

    x_size = sum(x_vec.values())
    y_size = sum(y_vec.values())
    intersection = sum([min(x_vec[key], y_vec[key]) for key in x_vec if key in y_vec])

    return 2 * intersection / (x_size + y_size)

### Metryka LCS - Longest Common Subsequence

In [21]:
def lcs_metric(x, y):
    n = len(x)
    m = len(y)

    D = [[0 for j in range(m + 1)] for i in range(n + 1)]
    
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            if x[i - 1] == y[j - 1]:
                D[i][j] = D[i - 1][j - 1] + 1
            else:
                D[i][j] = max(D[i - 1][j], D[i][j - 1])

    # Normalize return value.
    return 1 - D[-1][-1] / max(len(x), len(y))

## Zadanie 2 - implementacja indeksu Daviesa-Bouldina

In [22]:
def find_centroid(cluster, metric, *args):
    distances = [0 for _ in range(len(cluster))]
    for i in range(len(cluster)):
        for j in range(i):
            dist = metric(cluster[i], cluster[j], *args)
            distances[i] += dist
            distances[j] += dist

    return cluster[distances.index(min(distances))]

def average_distance(cluster, metric, *args):
    if len(cluster) < 2:
        return 0
    elif len(cluster) == 2:
        return metric(cluster[0], cluster[1], *args)

    return sum([metric(cluster[i], cluster[j], *args)  for i in range(len(cluster)) for j in range(i)]) / ((len(cluster) - 2) * (len(cluster) - 1))

def davies_bouldin_index(clusters, metric, *args):
    centroids = [find_centroid(c, metric, *args) for c in clusters]
    avg_distance = [average_distance(c, metric, *args) for c in clusters]
    calculations = [np.max([(avg_distance[i] + avg_distance[j]) / metric(centroids[i], centroids[j], *args)
                   for i in range(len(clusters)) if i != j]) for j in range(len(clusters))]
    return np.sum(calculations) / len(clusters)

## Zadanie 3 - stoplista

In [23]:
def stop_list(text, frequency):
    words = []
    for line in text:
        words += line.split()
    counter = Counter(words)
    common = {key for key, value in counter.items() if value >= frequency * len(words)}

    # Remove common words
    result = []
    for line in text:
        result.append(" ".join([w for w in line.split() if w not in common]))
    return result

## Zadanie 4 - klasteryzacja

In [24]:
def read_text(file, n):
    with open(file, "r", encoding="UTF-8") as f:
        text = f.read().splitlines()
    return text[:n]

In [25]:
def make_clustering(text, metric_func, eps, stop_list_freq=None, *args):
    working_text = stop_list(text, stop_list_freq) if stop_list_freq else text

    X = np.arange(len(working_text)).reshape(-1, 1)
    clustering = DBSCAN(metric=lambda x,y: metric_func(working_text[int(x[0])], working_text[int(y[0])], *args),
                        min_samples=1, 
                        eps=eps).fit_predict(X)
    
    clusters = [[] for _ in range(max(clustering) + 1)]
    for i in range(len(clustering)):
        clusters[clustering[i]].append(text[i])
        
    return clusters

## Zadanie 5 - porównanie wyników

In [26]:
def test(text, metric_list, eps, use_stop_list=False):
    df_results = {"Metric": [], "Epsilon": [], "Davies-Bouldin index": [],  "Time [s]": []}
    result = []

    frequency = None
    if use_stop_list:
        frequency = 0.01

    for i in range(len(metric_list)):
        start_timer = time()   
        clusters = make_clustering(text, metric_list[i][1], eps[i], frequency)
        time_elapsed = time() - start_timer

        df_results["Metric"].append(metric_list[i][0])
        df_results["Epsilon"].append(eps[i])
        df_results["Time [s]"].append(time_elapsed)
        df_results["Davies-Bouldin index"].append(davies_bouldin_index(clusters, metric_list[i][1]))

        result.append(clusters)

    return pd.DataFrame(df_results), result

In [27]:
metric_test = [("Cosine metric", cosine_metric), 
               ("Euclidean metric", euclidean_metric), 
               ("LCS metric", lcs_metric)]

eps = [0.4, 0.6, 0.8]

print("Test with stop list\n")
df_results, result_stop_list = test(read_text("lines.txt", 100), metric_test, eps, True)
print(df_results)

print("\nTest without stop list\n")
df_results, result_no_stop_list = test(read_text("lines.txt", 100), metric_test, eps, False)
print(df_results)

Test with stop list

             Metric  Epsilon  Davies-Bouldin index   Time [s]
0     Cosine metric      0.4              0.555522   0.711588
1  Euclidean metric      0.6              0.134085   0.557655
2        LCS metric      0.8              0.377818  16.565760

Test without stop list

             Metric  Epsilon  Davies-Bouldin index   Time [s]
0     Cosine metric      0.4              0.555522   0.733831
1  Euclidean metric      0.6              0.000000   0.590437
2        LCS metric      0.8              0.377818  18.526075


Do analizy skorzystano z krótkiego pliku zawierającego pierwsze 100 linii tekstu, aby ograniczyć czas obliczeń. Wartości epsilon zostały dobrane tak, aby klastry jak najlepiej odpowiadały plikom dostarczonym w zadaniu. Użycie stop listy nie wpłynęło istotnie na podział na klastry, ale zmieniło wartości indeksów i minimalnie różniło czas wykonania.

Najszybsza metryka to Euklidesowa, następnie cosinusowa, a najwolniejsza to LCS.

In [28]:
def print_all_clusters(clusters, n):
    j = 0
    for i in range(len(clusters)):
        if len(clusters[i]) != -1:
            for line in clusters[i]:
                print(line)
            print("===============\n")
            j += 1

        if j == n:
            break

In [29]:
print("Stop list clusters\n")
print_all_clusters(result_stop_list, 5)

Stop list clusters

['/11692589 RD TUNA CANNERS, LTD. PORTION 1004, SIAR NORTH COAST ROAD, P.O.BOX 2113, MADANG, PAPUA NEW GUINEA']
["''PA INTERIOR'' LTD BOLSHAYA LUBYANKA STREET, 16/4 MOSCOW, 101000, RUSSIA INN/KPP 7704550148//770801001 495-984-8611"]
["''SSONTEX''  Sp.ZO.O.IMPORT-EXPORTUL:PRZECLAWSKA 5 03-879 WARSZAWA,POLAND NIP 113-01-17-669", "''SSONTEX''SP.ZO.O.IMPORT-EXPORT UL:PRZECLAWSKA 5 03-879 WARSZAWA,POLAND NIP 113-01-17-669 TEL./FAX.:0048(022)217 6532--", '"SSONTEX" SP.ZO.O IMPORT-EXPORT 03-879 WARSZAWA UL PRZECLAWSKA 5 NIP:113-01-17-669']
["''TOPEX SP. Z O.O.'' SPOLKA KOMANDYTOWA UL. POGRANICZNA 2/4  02-285 WARSZAWA POLAND"]
["'MASTER PLUS CO.,LTD.' 143000,RUSSIA,MO,ODINSOVO, MOJAISKOE, SHOSSE,153G TEL:+7495 7273939"]
['"2TIGERS GROUP LIMITED"  ROOM 504 JINSHAZHOU SHANGSHUI ROAD,  GUANGZHOU 510160']
['"ALDETRANS" LLC, 105066, MOSCOW, RUSSIA, TOKMAKOV LANE, 11. TEL:+7(495)641-03-89']
['"A-LIFT",JSC 1 PROSPEKT MARSHALA ZHUKOVA,MOSCOW 123308,RUSSIA  T: +7(495)784-7961']
['"A

In [30]:
print("No stop list cluster\n")
print_all_clusters(result_no_stop_list, 5)

No stop list cluster

['/11692589 RD TUNA CANNERS, LTD. PORTION 1004, SIAR NORTH COAST ROAD, P.O.BOX 2113, MADANG, PAPUA NEW GUINEA']
["''PA INTERIOR'' LTD BOLSHAYA LUBYANKA STREET, 16/4 MOSCOW, 101000, RUSSIA INN/KPP 7704550148//770801001 495-984-8611"]
["''SSONTEX''  Sp.ZO.O.IMPORT-EXPORTUL:PRZECLAWSKA 5 03-879 WARSZAWA,POLAND NIP 113-01-17-669", "''SSONTEX''SP.ZO.O.IMPORT-EXPORT UL:PRZECLAWSKA 5 03-879 WARSZAWA,POLAND NIP 113-01-17-669 TEL./FAX.:0048(022)217 6532--", '"SSONTEX" SP.ZO.O IMPORT-EXPORT 03-879 WARSZAWA UL PRZECLAWSKA 5 NIP:113-01-17-669']
["''TOPEX SP. Z O.O.'' SPOLKA KOMANDYTOWA UL. POGRANICZNA 2/4  02-285 WARSZAWA POLAND"]
["'MASTER PLUS CO.,LTD.' 143000,RUSSIA,MO,ODINSOVO, MOJAISKOE, SHOSSE,153G TEL:+7495 7273939"]
['"2TIGERS GROUP LIMITED"  ROOM 504 JINSHAZHOU SHANGSHUI ROAD,  GUANGZHOU 510160']
['"ALDETRANS" LLC, 105066, MOSCOW, RUSSIA, TOKMAKOV LANE, 11. TEL:+7(495)641-03-89']
['"A-LIFT",JSC 1 PROSPEKT MARSHALA ZHUKOVA,MOSCOW 123308,RUSSIA  T: +7(495)784-7961']
['

## Zadanie 6 - pomysły na poprawę jakości klasteryzacji

Pomysły na lepszą klasteryzację:
1. Lepsze dopasowanie parametrów - w szczegolności epsilon, min_sample. Mniejsze wartości epsilon skutkują gęstszymi klastrami, podczas gdy większe wartości prowadzą do większych i bardziej uogólnionych klastrów. Wyższe wartości min_sample skutkują bardziej konserwatywnym grupowaniem, podczas gdy niższe wartości mogą obejmować zaszumione punkty jako część klastrów. Warto zatem znaleźć optymalne wartości tych parametrów.
2. Inne metryki - w szczególności LCS, która jest najwolniejsza, a nie daje najlepszych wyników. Można spróbować innych metryk, np. Levenshteina.
3. Eksperymentowanie z innymi metodami klasteryzacji - można wypróbować inne techniki klasteryzacji, takie jak K-means, hierarchical clustering lub density-based seeding, aby uzyskać lepsze wyniki grupowania.