In [None]:
from datetime import timedelta
import logging
import pickle

import numpy as np

from gensim.models import doc2vec

from crashsimilarity.downloader import SocorroDownloader
from crashsimilarity import utils
from crashsimilarity.models.gensim_model_wrapper import Doc2vecModelWrapper
from crashsimilarity.evaluator import BugzillaClusters

from crashsimilarity.models.similarity.doc2vec_similarity import Doc2VecSimilarity
from crashsimilarity.models.wmd_calculator import WMDCalculator
from crashsimilarity.models.similarity.base import GenericSimilarity
from crashsimilarity.models.distances import edit_distance_structural

In [None]:
# Configure root logging once for the whole notebook: timestamp, level, message.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

In [None]:
# Download the training data for the last 30 days.
# The window is named so that it visibly matches the 30-day window that the
# next cell passes to SocorroDownloader.get_dump_paths; the previous literal
# `1` contradicted both this cell's stated intent and that reader.
# NOTE(review): assumes the first positional argument is a number of days —
# confirm against SocorroDownloader.download_and_save_crashes.
TRAINING_DAYS = 30
RAW_DATA_DIR = 'crashsimilarity_data/raw'
SocorroDownloader.download_and_save_crashes(TRAINING_DAYS, save_to_dir=RAW_DATA_DIR)

In [None]:
# Locate the downloaded dumps and stream them through the stack-trace processor.
file_names = SocorroDownloader.get_dump_paths(30, data_dir=RAW_DATA_DIR)
raw_lines = utils.read_files(file_names)
data_gen = utils.StackTraceProcessor.process(raw_lines, compress=True)

# Every yielded record is indexed as record[0] -> vocabulary and
# record[1] -> (trace, signature, ...). The vocabulary from the last record
# is the one that is kept.
vocab, traces, signatures = None, [], []
for record in data_gen:
    vocab = record[0]
    payload = record[1]
    traces.append(payload[0])
    signatures.append(payload[1])

len(vocab), len(traces), len(signatures)

In [None]:
# Persist the vocabulary so it can be reloaded without reprocessing the dumps.
# A context manager guarantees the file is flushed and closed even if
# pickling fails part-way (the bare open(...) previously leaked the handle).
with open('crashsimilarity_data/objects/vocab.pickle', 'wb') as vocab_file:
    pickle.dump(vocab, vocab_file)

In [None]:
# Replace every element of every trace with its string form, updating the
# `traces` list in place.
# NOTE(review): presumably needed because the model expects string tokens — confirm.
traces[:] = [[str(token) for token in trace] for trace in traces]

In [None]:
# Train the model: each trace becomes one document tagged with its position
# in `traces`.
corpus = [
    doc2vec.TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(traces)
]
model_wrapper = Doc2vecModelWrapper(corpus, None)
model_wrapper.train_model()
str(model_wrapper.model)

In [None]:
# Save the trained model through the wrapper under the name 'dm_d200'.
# NOTE(review): the name presumably encodes the doc2vec configuration — confirm.
model_wrapper.save_model('dm_d200')

In [None]:
# Download the Bugzilla bugs filed between FROM_DATE and TO_DATE, then fetch
# the stack traces belonging to them.
# To reuse a previously saved download instead, load the pickle:
# clusters = pickle.load(open('crashsimilarity_data/objects/bugzilla_clusters_2015-05-31_2016-05-31.pickle', 'rb'))
FROM_DATE, TO_DATE = '2015-05-31', '2016-05-31'
bug_fields = ['id', 'cf_crash_signature']
clusters = BugzillaClusters.download_bugs(FROM_DATE, TO_DATE, bug_fields)

stack_trace_window = timedelta(days=360)
clusters.download_stack_traces(stack_trace_window, True)

In [None]:
# Save the downloaded clusters under the objects directory.
clusters.save('crashsimilarity_data/objects/')

In [None]:
# Sanity check: materialize element [1][0] of clusters.stack_traces as a list
# so its contents are displayed.
list(clusters.stack_traces[1][0])

In [None]:
# For every group in clusters.stack_traces, keep the first element of each
# non-empty item and run it through the stack-trace preprocessor.
clear_groups = []
for raw_group in clusters.stack_traces:
    materialized = [list(item) for item in raw_group]
    first_entries = []
    for entries in materialized:
        if entries:  # skip items that produced nothing
            first_entries.append(entries[0])
    clear_groups.append(
        [utils.StackTraceProcessor.preprocess(entry) for entry in first_entries]
    )

In [None]:
# Number of preprocessed traces in each group.
list(map(len, clear_groups))

In [None]:
# Flatten the groups into two parallel corpora and remember, per group, which
# corpus positions belong to it.
#   corpus            - the preprocessed traces as they are
#   compressed_corpus - the same traces with each element looked up in `vocab`
#                       (elements missing from `vocab` are kept) and stringified
#   group_indexes     - one list of corpus positions per group
compressed_corpus = []
corpus = []
group_indexes = []
for group in clear_groups:
    first_index = len(corpus)
    for trace in group:
        compressed_corpus.append([str(vocab.get(token, token)) for token in trace])
        corpus.append(trace)
    # Positions are consecutive because traces are appended group by group.
    group_indexes.append(list(range(first_index, len(corpus))))
len(compressed_corpus), len(corpus)

In [None]:
# Show the first trace of the uncompressed corpus.
corpus[0]

In [None]:
# Show the same first trace in its vocabulary-compressed, stringified form.
compressed_corpus[0]

In [None]:
# Build the WMD calculator from the trained model and the compressed corpus,
# then wrap it in the doc2vec similarity backend.
wmd_calculator = WMDCalculator.build_with_all_distances(model_wrapper.model, compressed_corpus)
doc2vec_algo = Doc2VecSimilarity(wmd_calculator)

In [None]:
# Baseline similarity backend built on the structural edit distance function.
structural_dist_algo = GenericSimilarity(edit_distance_structural)

In [None]:
def top_similar_traces_no_logger(trace, corpus, top_n=10):
    """Run ``doc2vec_algo.top_similar_traces`` with root logging silenced.

    The root logger is raised to CRITICAL for the duration of the call and is
    then put back to the level it had on entry, even when the call raises.
    (Previously the level was unconditionally reset to INFO and was left at
    CRITICAL if the call failed.)

    Args:
        trace: query trace, forwarded unchanged.
        corpus: corpus to search, forwarded unchanged.
        top_n: number of results to request (default 10), forwarded unchanged.

    Returns:
        Whatever ``doc2vec_algo.top_similar_traces`` returns.
    """
    previous_level = logging.root.level
    logging.root.setLevel(logging.CRITICAL)
    try:
        return doc2vec_algo.top_similar_traces(trace, corpus, top_n)
    finally:
        # Always undo the temporary silencing so later cells keep their logs.
        logging.root.setLevel(previous_level)

In [None]:
def mean_cluster_distance(trace_group, top_similar):
    """Average rank at which members of ``trace_group`` appear in ``top_similar``.

    The first entry of ``top_similar`` is skipped (presumably it is the query
    trace itself — confirm against the similarity backends). Ranks are 0-based
    positions within the remaining entries.

    Args:
        trace_group: container of indexes belonging to the cluster; only
            membership (``in``) is used.
        top_similar: sequence of ``(index, distance)`` pairs.

    Returns:
        A ``(mean, positions)`` tuple where ``positions`` is the list of ranks
        whose index is in ``trace_group`` and ``mean`` is their average. When
        no rank qualifies, ``mean`` is NaN; it is returned explicitly so that
        NumPy does not emit a "Mean of empty slice" RuntimeWarning.
    """
    positions = [
        pos
        for pos, (idx, _dist) in enumerate(top_similar[1:])
        if idx in trace_group
    ]
    if not positions:
        return np.nan, positions
    return np.mean(positions), positions

In [None]:
# Score the doc2vec/WMD similarity: for every cluster, query with each of its
# traces against the compressed corpus, take the mean rank of fellow cluster
# members, and then average those means over the cluster.
results_wmd = []
for cluster in group_indexes:
    per_trace_means = [
        mean_cluster_distance(
            cluster,
            top_similar_traces_no_logger(compressed_corpus[trace_idx], compressed_corpus),
        )[0]
        for trace_idx in cluster
    ]
    results_wmd.append(np.mean(per_trace_means))
len(results_wmd)

In [None]:
# Per-cluster scores obtained with the doc2vec/WMD similarity.
results_wmd

In [None]:
# Score the structural edit-distance similarity the same way: for every
# cluster, query with each of its traces against the uncompressed corpus
# (top 10), take the mean rank of fellow cluster members, and average those
# means over the cluster.
results_structural = []
for cluster in group_indexes:
    per_trace_means = [
        mean_cluster_distance(
            cluster,
            structural_dist_algo.top_similar_traces(corpus[trace_idx], corpus, 10),
        )[0]
        for trace_idx in cluster
    ]
    results_structural.append(np.mean(per_trace_means))
len(results_structural)

In [None]:
# Per-cluster scores obtained with the structural edit-distance similarity.
results_structural