In [None]:
import logging
import pickle
import time

import numpy as np

import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN

from gensim.models import doc2vec

from crashsimilarity import utils
from crashsimilarity.models.gensim_model_wrapper import Doc2vecModelWrapper
from crashsimilarity.evaluator import BugzillaClusters, Metrics as metrics

from crashsimilarity.models.similarity.doc2vec_similarity import Doc2VecSimilarity
from crashsimilarity.models.wmd_calculator import WMDCalculator
from crashsimilarity.models.similarity.base import GenericSimilarity
from crashsimilarity.models.distances import edit_distance_structural

In [None]:
# Load the token vocabulary from its pickle file. It is used below through
# `vocab.get(token, token)` to map stack-trace tokens to their compressed form.
# NOTE(review): pickle.load executes arbitrary code embedded in the file; only
# load pickles that come from a trusted source.
with open('crashsimilarity_data/objects/vocab.pickle', 'rb') as vocab_file:
    # The context manager closes the handle; the original left it open.
    vocab = pickle.load(vocab_file)

In [None]:
# Load the doc2vec model through the project wrapper, using the identifier
# 'dm_d200' (presumably distributed-memory, 200 dimensions - TODO confirm).
model = Doc2vecModelWrapper.load_model('dm_d200')
# Last expression of the cell: the string form of the model is displayed.
str(model)

In [None]:
# Load the pickled Bugzilla clusters object; its `stack_traces` attribute is
# iterated further down to build the evaluation corpus.
# NOTE(review): pickle.load executes arbitrary code embedded in the file; only
# load pickles that come from a trusted source.
with open('crashsimilarity_data/objects/bugzilla_clusters_2015-05-31_2016-05-31.pickle', 'rb') as clusters_file:
    # The context manager closes the handle; the original left it open.
    clusters = pickle.load(clusters_file)

In [None]:
def compress_and_group(prepared):
    """Flatten grouped traces into one corpus and record group membership.

    Args:
        prepared: iterable of groups, each group being an iterable of traces
            (a trace is an iterable of tokens).

    Returns:
        A 4-tuple ``(compressed_corpus, corpus, groups, true_labels)``:

        * ``compressed_corpus`` - one ``doc2vec.TaggedDocument`` per trace,
          tagged with the trace's position in ``corpus``; every token is
          replaced by ``str(vocab.get(token, token))`` using the global
          ``vocab``.
        * ``corpus`` - the traces in the order they were encountered.
        * ``groups`` - per-group lists of corpus indexes, keeping only the
          groups that contain more than one trace.
        * ``true_labels`` - for every trace, the position of its group in
          ``prepared`` (groups dropped from ``groups`` keep their label).
    """
    corpus = []
    every_group = []
    true_labels = []
    for label, traces in enumerate(prepared):
        first_index = len(corpus)
        corpus.extend(traces)
        members = list(range(first_index, len(corpus)))
        true_labels.extend([label] * len(members))
        every_group.append(members)

    compressed_corpus = []
    for position, trace in enumerate(corpus):
        words = [str(vocab.get(token, token)) for token in trace]
        compressed_corpus.append(doc2vec.TaggedDocument(words, [position]))

    # Singleton and empty groups are dropped from the returned grouping.
    groups = [members for members in every_group if len(members) > 1]
    return compressed_corpus, corpus, groups, true_labels

In [None]:
# Step 1: for every cluster keep the first element of each non-empty entry
# and run it through the project's stack-trace preprocessor.
clear_groups = []
for cluster in clusters.stack_traces:
    entries = [list(entry) for entry in cluster]
    first_traces = [entry[0] for entry in entries if entry]
    clear_groups.append([utils.StackTraceProcessor.preprocess(trace) for trace in first_traces])

# Step 2: flatten the groups into parallel corpora and remember, for every
# trace, which group it came from.
compressed_corpus = []   # traces with tokens mapped through `vocab`
corpus = []              # the preprocessed traces themselves
group_indexes = []       # per group: indexes of its traces inside `corpus`
true_labels = []         # per trace: index of the group it belongs to
for label, group in enumerate(clear_groups):
    first_index = len(corpus)
    for trace in group:
        compressed_corpus.append([str(vocab.get(token, token)) for token in trace])
        corpus.append(trace)
    group_indexes.append(list(range(first_index, len(corpus))))
    true_labels.extend([label] * len(group))
len(compressed_corpus), len(corpus)

In [None]:
def distance_matrix(corpus, calculator, prog=10):
    """Compute the symmetric pairwise distance matrix of ``corpus``.

    ``calculator`` is called once for every unordered pair ``(i, j)`` with
    ``i < j``; the result is mirrored into both halves of the matrix and the
    diagonal is left at zero.

    Args:
        corpus: indexable sequence of items to compare.
        calculator: callable ``calculator(a, b)`` returning a number.
        prog: approximate number of progress lines to print. A falsy value
            (``0`` or ``None``) disables progress output.

    Returns:
        ``numpy.ndarray`` of shape ``(len(corpus), len(corpus))`` and dtype
        ``np.double``.
    """
    n = len(corpus)
    dist = np.zeros((n, n), dtype=np.double)
    total = n * (n - 1) // 2  # number of unordered pairs
    # Clamp the reporting interval to at least 1: with fewer pairs than
    # `prog` the plain integer division gives 0 and `s % 0` would raise
    # ZeroDivisionError.
    say = max(1, total // prog) if prog else 0
    t = time.time()
    s = 0  # pairs processed so far
    for i in range(n):
        for j in range(i + 1, n):
            if say and s and s % say == 0:
                print('{}%, {} s.'.format(s / (total * 0.01), time.time() - t))
            dist[i, j] = dist[j, i] = calculator(corpus[i], corpus[j])
            s += 1
    return dist

In [None]:
# Build the WMD calculator from the loaded model and the compressed corpus.
# NOTE(review): the factory name suggests that all pairwise word distances are
# precomputed here, which may make this cell slow - confirm.
wmd_calculator = WMDCalculator.build_with_all_distances(model, compressed_corpus)

In [None]:
# Pairwise WMD matrix over the compressed corpus.
wm_distances = distance_matrix(compressed_corpus, wmd_calculator.wmdistance)
# Infinite distances are replaced by the largest remaining value: mark them
# with the sentinel -1, take the maximum, then overwrite the sentinel.
wm_distances[np.isposinf(wm_distances)] = -1
m = wm_distances.max()
wm_distances[wm_distances == -1] = m
wm_distances.shape

In [None]:
# Pairwise distance matrix over the uncompressed corpus, using the project's
# structural edit distance as the pair calculator.
struct_distances = distance_matrix(corpus, edit_distance_structural)
# Last expression of the cell: display the matrix shape.
struct_distances.shape

In [None]:
def dbscan(dist, eps=0.5, min_samples=2):
    """Cluster a precomputed distance matrix with DBSCAN.

    Args:
        dist: square matrix of pairwise distances.
        eps: neighbourhood radius passed to ``DBSCAN``.
        min_samples: ``min_samples`` value passed to ``DBSCAN``.

    Returns:
        ``(labels, n_clusters)`` where ``labels`` is the fitted estimator's
        ``labels_`` array and ``n_clusters`` counts the distinct labels
        other than ``-1``.
    """
    clusterer = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
    labels = clusterer.fit(dist).labels_
    distinct_labels = set(labels)
    distinct_labels.discard(-1)  # -1 is not counted as a cluster
    return labels, len(distinct_labels)

In [None]:
def report(labels_true, labels_pred):
    """Summarise a predicted clustering against the reference labels.

    Args:
        labels_true: reference label for every item.
        labels_pred: predicted label for every item; ``-1`` is treated as
            noise.

    Returns:
        dict with the keys ``'precision'``, ``'recall'`` and ``'FMI'``
        (computed by the project's ``metrics``), ``'n_clusters'`` (distinct
        predicted labels, not counting ``-1``) and ``'noise'`` (number of
        ``-1`` predictions divided by ``len(labels_true)``).
    """
    precision = metrics.precision(labels_true, labels_pred)
    recall = metrics.recall(labels_true, labels_pred)
    fmi = metrics.FMI(labels_true, labels_pred)

    has_noise = -1 in labels_pred
    cluster_count = len(set(labels_pred)) - int(has_noise)
    noise_points = sum(1 for label in labels_pred if label == -1)

    return {
        'precision': precision,
        'recall': recall,
        'FMI': fmi,
        'n_clusters': cluster_count,
        'noise': float(noise_points) / len(labels_true),
    }


def iterate_eps(dist, labels_true, min_samples=2, eps=None, steps=None):
    """Run DBSCAN for several ``eps`` values and report each clustering.

    Args:
        dist: square matrix of pairwise distances.
        labels_true: reference label for every item.
        min_samples: forwarded to ``dbscan``.
        eps: a single eps value or a sequence of them (list, tuple or numpy
            array). When it is ``None``, another falsy scalar, or an empty
            sequence, eps is swept over ``steps`` evenly spaced values
            between the smallest and largest entry of ``dist``.
        steps: number of values in the automatic sweep (default 100); only
            used when the sweep is active.

    Returns:
        list of ``(eps, report_dict)`` tuples, one per eps value for which
        clustering and reporting succeeded.
    """
    # Tell scalars from sequences without relying on truthiness: `if eps:`
    # raises for multi-element numpy arrays, and an `isinstance(eps, list)`
    # check wraps tuples/arrays as one bogus candidate.
    is_scalar = np.ndim(eps) == 0
    if is_scalar:
        use_sweep = not eps
    else:
        use_sweep = len(eps) == 0

    if use_sweep:
        candidates = np.linspace(np.min(dist), np.max(dist), steps or 100)
    elif is_scalar:
        candidates = [eps]
    else:
        candidates = list(eps)

    results = []
    for e in candidates:
        try:
            labels_pred, *_ = dbscan(dist, e, min_samples)
            results.append((e, report(labels_true, labels_pred)))
        except ValueError as err:
            # Best effort: values that cannot be evaluated are skipped (for
            # example DBSCAN rejects a non-positive eps, which the sweep
            # produces whenever the smallest distance is 0). The skip is
            # logged instead of being dropped silently.
            logging.debug('iterate_eps: skipping eps=%r: %s', e, err)
    return results

In [None]:
# Sweep eps over the WMD distance matrix and display the (eps, report) pair
# with the highest FMI.
wmd_results = iterate_eps(wm_distances, true_labels)
wmd_ranked = sorted(wmd_results, key=lambda result: result[1]['FMI'], reverse=True)
wmd_ranked[0]

In [None]:
# Sweep eps over the structural distance matrix and display the
# (eps, report) pair with the highest FMI.
struct_distances_results = iterate_eps(struct_distances, true_labels)
struct_ranked = sorted(struct_distances_results, key=lambda result: result[1]['FMI'], reverse=True)
struct_ranked[0]