# Zadanie 1


Dice Metric

In [131]:
def get_grams(t):
    grams = dict()

    for i in range(len(t) -1):
        gram = t[i:i+2]
        if gram in grams.keys():
            grams[gram] += 1
        else:
            grams[gram] = 1

    return grams


def dice_metric(t1,t2):

    t1_grams = get_grams(t1)
    t2_grams = get_grams(t2)

    out = set(t1_grams) & set(t2_grams)

    return 1 - 2*len(out) / (len(t1_grams)+len(t2_grams))


 Levenshtein Metric

In [132]:
def levenshtein_metric(t1,t2):

    x = len(t1) + 1
    y = len(t2) + 1

    distance = [[0 for i in range(y)] for j in range(x)]

    for i in range(1,x):
        distance[i][0] = i
    for i in range(1,y):
        distance[0][i] = i
    
    for i in range(1,x):
        for j in range(1,y):
            if t1[i-1] == t2[j-1]:
                op = 1
            else:
                op = 0
            distance[i][j] = min(distance[i - 1][j - 1] + op, distance[i - 1][j] + 1, distance[i][j - 1] + 1)
    
    return distance[x-1][y-1] / max(x-1, y-1)


LCS Metric

In [133]:
import numpy as np
def lcs_metric(t1,t2):

    n1 = len(t1) + 1
    n2 = len(t2) + 1

    grid = np.array([[None for i in range(n2)] for j in range(n1)])

    lcs = 0

    for i in range(n1):
        for j in range(n2):
            if i == 0 or j == 0:
                grid[i,j] = 0
            else:
                if t1[i-1] == t2[j-1]:
                    grid[i,j] = grid[i-1 , j-1] + 1
                else:
                    grid[i,j] = max(grid[i-1 , j] , grid[i][j-1])
            
            lcs = max(lcs , grid[i,j])
    
    return 1 - (lcs / max(n1-1, n2-1))


# Zadanie 2

In [134]:
import itertools


def davies_bouldin(clusters, metric):

    length = len(clusters)
    avg_distance =[]
    max_val = [0] * length

    for cluster in clusters:
        cluster.sort(key=lambda t1: sum([metric(t1, t2) for t2 in cluster if t1 != t2]))

    centroids = [cluster[len(cluster) // 2] for cluster in clusters]

    for cluster in clusters:

        n, s = len(cluster) , 0

        for t1, t2 in itertools.combinations(cluster, 2):

            s += metric(t1,t2)
        
        if n != 1:
            avg_distance.append(s/(n*(n - 1)/2))
        else:
            avg_distance.append(0)

    for i, cluster1 in enumerate(clusters):
        for j, cluster2 in enumerate(clusters):
            if i != j:
                try:
                    v = (avg_distance[i] + avg_distance[j]) / metric(centroids[i], centroids[j])
                except:
                    v = (avg_distance[i] + avg_distance[j])
    
                max_val[i] = max(v, max_val[i])

    return sum(max_val) / length

# Zadanie 3

In [135]:
from collections import defaultdict
def create_stoplist(txt):
    stoplist = defaultdict(int)

    for line in txt:
        words = line.split()
        for word in words:
            stoplist[word] += 1
    return sorted(stoplist.items(), key=lambda x:x[1] , reverse=True)




# Zadanie 4,5


In [136]:
def load_file(file , size):
        with open(file, "r", encoding="UTF-8") as f:
                text = f.read().splitlines()
        
        return text[:size]

In [141]:
def make_clusters(text , metric_function , threshold , stoplist = False):

    if stoplist:
        stplist = create_stoplist(text)
    
    clusters = []
    
    for line in text:
        flag = False
        for cluster in clusters:
            if any(metric_function(line, c_line) <= threshold for c_line in cluster):
                flag = True
                cluster.append(line)
                break
        if not flag:
            clusters.append([line])
    return clusters
    

In [138]:
import time

def compare():

    lines = load_file("lines.txt" , 100)

    print("PERFORMING TEST WITH 100 LINES AND THRESHOLD EQUAL TO 0.5")

    for i in range(2):
        print("-----------------------")
        if i == 1:
            print("Clustering with preprocessing")
        else:
            print("Clustering without preprocessing")

        print("-----------------------")

        start = time.perf_counter()

        dic_clust = make_clusters(lines , dice_metric , 0.5 , i)

        stop = time.perf_counter()

        print("Clusters with Dice Metric have been created | Time : " , stop-start ,"s")

        start = time.perf_counter()

        lev_clust = make_clusters(lines , levenshtein_metric , 0.5 , i)

        stop = time.perf_counter()

        print("Clusters with Levenshtein Metric have been created | Time : " , stop-start ,"s")

        start = time.perf_counter()

        lcs_clust = make_clusters(lines , lcs_metric , 0.5 , i)

        stop = time.perf_counter()

        print("Clusters with LCS Metric have been created | Time : " , stop-start ,"s")

        if i == 0:

            print("-----------------------")

            start = time.perf_counter()

            out = davies_bouldin(dic_clust , dice_metric)

            stop = time.perf_counter()

            print("Davies-Bouldin for Dice clusters has finished | Output : " , out , " | Time : " , stop-start ,"s")

            start = time.perf_counter()

            out = davies_bouldin(lev_clust , levenshtein_metric)

            stop = time.perf_counter()

            print("Davies-Bouldin for Levenshtein clusters has finished | Output : " , out , " | Time : " , stop-start ,"s")


            start = time.perf_counter()

            out = davies_bouldin(lcs_clust , lcs_metric)

            stop = time.perf_counter()

            print("Davies-Bouldin for LCS clusters has finished | Output : " , out , " | Time : " , stop-start ,"s")


        



In [140]:
compare()

PERFORMING TEST WITH 100 LINES AND THRESHOLD EQUAL TO 0.5
-----------------------
Clustering without preprocessing
-----------------------
Clusters with Dice Metric have been created | Time :  0.29460959997959435 s
Clusters with Levenshtein Metric have been created | Time :  0.9120264000084717 s
Clusters with LCS Metric have been created | Time :  42.844055800000206 s
-----------------------
Davies-Bouldin for Dice clusters has finished | Output :  0.8083526579597019  | Time :  0.2151241000101436 s
Davies-Bouldin for Levenshtein clusters has finished | Output :  0.2785258074974589  | Time :  31.53614270000253 s
Davies-Bouldin for LCS clusters has finished | Output :  0.7844105058434508  | Time :  37.419421200000215 s
-----------------------
Clustering with preprocessing
-----------------------
Clusters with Dice Metric have been created | Time :  0.2560842999955639 s
Clusters with Levenshtein Metric have been created | Time :  0.789765499997884 s
Clusters with LCS Metric have been crea

# Zadanie 6

W celu poprawy jakości klasteryzacji można wykorzystać dokładniejsze funkcje wbudowane metryk, znajdujące się w bibliotece sklearn.