# Zadanie nr 5 - metryki w przestrzeni napisów

In [91]:
from collections import Counter
from itertools import product
import sklearn.cluster
from math import inf

## 1. Metryki

<i>1. Zaimplementuj przynajmniej 3 metryki spośród wymienionych: cosinusowa, LCS, DICE, euklidesowa.</i>

* lcs

In [2]:
def lcs(x, y):
    common = [[0 for _ in range(len(y)+1)] for _ in range(len(x)+1)]
    max_lcs = 0

    for i in range(1, len(x) + 1):
        for j in range(1, len(y)+1):
            common[i][j] = common[i-1][j-1] + 1 if x[i-1] == y[j-1] else 0
            max_lcs = max(max_lcs, common[i][j])

    return 1 - max_lcs/max(len(x), len(y))

#### ngramy

In [3]:
def ngram(x, n):
    ngrams = [x[i:i+n] for i in range(len(x)-n+1)]
    return Counter(ngrams)

* dice

In [4]:
def dice(x, y, n=2):
    ngrams_x, ngrams_y = set(ngram(x, n).keys()), set(ngram(y, n).keys())
    return 1 - 2*len(ngrams_x & ngrams_y)/(len(ngrams_x)+len(ngrams_y))

* euklidesowa

In [5]:
def euclides(x, y):
    ngrams_x, ngrams_y = ngram(x, 2), ngram(y, 2)

    keys = set(ngrams_x.keys()) | set(ngrams_y.keys())
    dist = 0

    for key in keys:
        value_x = ngrams_x.get(key, 0)
        value_y = ngrams_y.get(key, 0)

        dist += (value_x - value_y)**2

    return dist**0.5

* levensheit

In [6]:
def levensheit(text_a, text_b):
    edit = [[None for _ in range(len(text_b) + 1)]
            for _ in range(len(text_a) + 1)]

    def delta(char_a, char_b): return 0 if char_a == char_b else 1

    for i in range(len(text_a) + 1):
        edit[i][0] = i

    for j in range(1, len(text_b) + 1):
        edit[0][j] = j

    for i in range(1, len(text_a)+1):
        for j in range(1, len(text_b)+1):
            edit[i][j] = min(edit[i-1][j] + 1, edit[i][j-1] + 1,
                       edit[i-1][j-1] + delta(text_a[i-1], text_b[j-1]))

    return edit[len(text_a)][len(text_b)]

In [7]:
x = 'BCDF'
y = 'ABCDEF'
print(lcs(x, y))
print(dice(x, y))
print(euclides(x, y))

0.5
0.5
2.0


## 2. Sposoby oceny jakości klasteryzacji

<i>2. Zaimplementuj przynajmniej 2 sposoby oceny jakości klasteryzacji (np. indeks Daviesa-Bouldina).</i>

In [110]:
def sigma(cluster, metric):
    sum_ = 0
    n = len(cluster)
    if n == 1:
        return 0
    
    for i, text_i in enumerate(cluster):
        for text_j in cluster[i+1:]:
                sum_ += metric(text_i, text_j)
                
    return sum_ / (n*(n-1)/2)

def d(cluster_a, cluster_b, metric):
    sum_ = 0
    
    for text_a in cluster_a:
        for text_b in cluster_b:
            sum_ += metric(text_a, text_b)
    
    return sum_/(len(cluster_a)*len(cluster_b))

def d_prim(cluster, metric):
    max_ = 0
    n = len(cluster)
    if n == 1:
        return 0
    
    for i, text_i in enumerate(cluster):
        for text_j in cluster[i+1:]:
                max_ = max(max_, metric(text_i, text_j))
                
    return max_

In [111]:
def davies_bouldin(clusters, metric):
    db = 0
    sigmas = [sigma(cluster, metric) for cluster in clusters]
    for i, cluster_i in enumerate(clusters):
        max_ = 0
        for j, cluster_j in enumerate(clusters[i+1:]):
            max_ = max(max_, (sigmas[i] + sigmas[j])/d(cluster_i, cluster_j, metric))
        db += max_
        
    return db/len(clusters)        

In [112]:
def dunn(clusters, metric):
    min_dist = min((d(cluster_a, cluster_b, metric) for cluster_a in clusters for cluster_b in clusters if cluster_a != cluster_b))
    max_size = max((d_prim(cluster, metric) for cluster in clusters))
    return min_dist/max_size

## 3. Stoplista

<i>3. Stwórz stoplistę najczęściej występujących słów. </i>

In [85]:
def get_most_common_words(text, num):
    words_stats = list(Counter(text.split(' ')).items())
    words_stats.sort(reverse=True)
    return [word for word in words_stats[:num]]

In [86]:
def remove_common(text, num):
    common = get_most_common_words(text, num)
    return [x for x in text if x not in common]

## 4. Klasteryzacja

<i>4. Wykonaj klasteryzację zawartości załączonego pliku (lines.txt) przy użyciu przynajmniej 2 algorytmów oraz metryk zaimplementowanych w pkt. 1. i metryki Levenshteina. Każda linia to adres pocztowy firmy, różne sposoby zapisu tego samego adresu powinny się znaleźć w jednym klastrze. </i>

In [191]:
def cluster(texts, metric, eps):
    distances = [[metric(texts[i], texts[j]) for i in range(len(texts))] for j in range(len(texts))]
#     print(distances)
    clustering = sklearn.cluster.DBSCAN(eps=eps, min_samples=1).fit(distances)
    return clustering.labels_

## 5. Testy

<i> 5. Porównaj jakość wyników sposobami zaimplementowanymi w pkt. 2. </i>

In [168]:
def get_example_lines(num):
    with open('lines.txt', 'r') as file:
        lines = list(file)
    lines = lines[:num]

    with open('clusters.txt', 'r') as file:
        clusters = list(file)
    
    correct_clustering = []
    cluster_num = 0
    for line in lines:
        cluster = 0
        for cluster_line in clusters:
            if cluster_line[0] == '#':
                cluster += 1
            elif cluster_line == line:
                correct_clustering.append(cluster)
    
    return lines, correct_clustering

In [169]:
lines, correct_labels = get_example_lines(200)

In [170]:
labels = cluster(lines, euclides)

In [171]:
print(labels)

[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199]


In [172]:
def renumber_labels(labels):
    seen = {}
    current = 0
    new_labels = []
    for label in labels:
        if label in seen:
            new_labels.append(seen[label])
        else:
            new_labels.append(current)
            seen[label] = current
            current += 1
    
    return new_labels      

In [173]:
def get_clusters_from_labels(lines, labels):
    clusters = {}
    for i, label in enumerate(labels):
        clusters[label] = clusters.get(label, []) + [lines[i]]
        
    return list(clusters.values())

In [174]:
print(renumber_labels(correct_labels))

[0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11, 11, 11, 11, 12, 12, 13, 14, 14, 15, 16, 17, 18, 18, 19, 20, 21, 22, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32, 33, 34, 35, 36, 36, 37, 38, 38, 38, 39, 39, 39, 39, 40, 40, 40, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 2, 60, 61, 62, 63, 64, 63, 64, 65, 65, 65, 65, 65, 66, 67, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 68, 69, 70, 70, 70, 71, 72, 73, 74, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 89, 90, 91, 27, 27, 26, 26, 26, 26, 26, 92, 93, 6, 6, 94, 53, 95, 27, 27, 26, 26, 96, 91, 91, 91, 91, 91, 91, 97, 98, 99, 99, 100, 99, 101, 102, 102, 102, 102, 102, 103, 102, 53, 104, 105, 105, 105, 106, 106, 106, 106, 106, 106, 107, 107, 93, 108, 109, 26, 110, 111]


In [175]:
clusters = get_clusters_from_labels(lines, labels)

In [176]:
print(clusters)

[['/11692589 RD TUNA CANNERS, LTD. PORTION 1004, SIAR NORTH COAST ROAD, P.O.BOX 2113, MADANG, PAPUA NEW GUINEA\n'], ["''PA INTERIOR'' LTD BOLSHAYA LUBYANKA STREET, 16/4 MOSCOW, 101000, RUSSIA INN/KPP 7704550148//770801001 495-984-8611\n"], ["''SSONTEX''  Sp.ZO.O.IMPORT-EXPORTUL:PRZECLAWSKA 5 03-879 WARSZAWA,POLAND NIP 113-01-17-669\n"], ["''SSONTEX''SP.ZO.O.IMPORT-EXPORT UL:PRZECLAWSKA 5 03-879 WARSZAWA,POLAND NIP 113-01-17-669 TEL./FAX.:0048(022)217 6532--\n"], ["''TOPEX SP. Z O.O.'' SPOLKA KOMANDYTOWA UL. POGRANICZNA 2/4  02-285 WARSZAWA POLAND\n"], ["'MASTER PLUS CO.,LTD.' 143000,RUSSIA,MO,ODINSOVO, MOJAISKOE, SHOSSE,153G TEL:+7495 7273939\n"], ['"2TIGERS GROUP LIMITED"  ROOM 504 JINSHAZHOU SHANGSHUI ROAD,  GUANGZHOU 510160\n'], ['"ALDETRANS" LLC, 105066, MOSCOW, RUSSIA, TOKMAKOV LANE, 11. TEL:+7(495)641-03-89\n'], ['"A-LIFT",JSC 1 PROSPEKT MARSHALA ZHUKOVA,MOSCOW 123308,RUSSIA  T: +7(495)784-7961\n'], ['"ALISA" LTD, 1/5 Derbenevskaya str., Moscow, Russia Tel./Fax: (495) 987-13-07 p

In [209]:
print(davies_bouldin(clusters, euclides))

0.0


In [210]:
print(dunn(clusters, euclides))

ZeroDivisionError: float division by zero

In [202]:
def test(lines, metric, eps):
    print(metric.__name__)
    labels = cluster(lines, metric, eps)
#     print(labels)
    clusters = get_clusters_from_labels(lines, labels)
#     print_clustering(clusters)
    print('\tdavies-bouldin index: ', davies_bouldin(clusters, metric))
    print('\t          dunn index: ', dunn(clusters, metric))

In [203]:
metrics = [dice, levensheit, lcs, euclides]

In [204]:
def print_clustering(clusters):
    for cluster in clusters:
        print('##########')
        for line in cluster:
            print(line)

In [205]:
test(lines, euclides, 20)

euclides
	davies-bouldin index:  0.8636511180480038
	          dunn index:  0.40096450118311855


In [206]:
test(lines, dice, 0.5)

dice
	davies-bouldin index:  0.30896614847536097
	          dunn index:  0.5064456721915285


In [207]:
test(lines, lcs, 0.5)

lcs
	davies-bouldin index:  0.32026010636128127
	          dunn index:  0.7792207792207794


In [208]:
test(lines, levensheit, 10)

levensheit
	davies-bouldin index:  0.009609929166675748
	          dunn index:  1.0


<i>6. Czy masz jakiś pomysł na poprawę jakości klasteryzacji w tym zadaniu? </i>

Sprawozdanie powinno zawierać porównanie wyników wszystkich metryk z użyciem stoplisty i bez.
Można jako wzorcową klasteryzację użyć pliku clusters.txt.

## Wnioski

- skuteczność klasyfikacji zależą od doboru metryki i parametrów dobranych do tejże metryki
- 

M. Hawryluk 15.05.2021