In [2]:
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score, precision_score, confusion_matrix
from tabulate import tabulate

# Let pandas print wide frames (e.g. confusion matrices) without dropping
# columns or wrapping rows.
pd.options.display.max_columns = 100
pd.options.display.width = 500
# evaluation_print is defined in this notebook; earlier (now disabled)
# attempts imported it from the `model` module under
# 'michael/deeplearn.det/deeplearn' via sys.path.

In [3]:
def string_preproc(a):
    r"""
       Shorten a raw process name and join the kept parts with '\_'
       (an underscore escaped for the LaTeX tables printed by
       evaluation_print).

       The name is split on '_'; fields 0, 2 and 3 are kept, field 3 only
       up to its first '.'.

       input string with at least four '_'-separated fields
       output string without not relevant stuff
       example: input "mailservice_nimol_server_qmgr.nimol.hist"
                output "mailservice\_server\_qmgr"
       raises IndexError if the name has fewer than four fields
    """
    fields = a.split('_')
    # Raw string: '\_' is not a valid escape sequence in a normal literal.
    return r'\_'.join([fields[0], fields[2], fields[3].split('.')[0]])

def top_n_predicted(probs, true_labels, top_n_value):
    """
    input: one row of class probabilities per sample, the true label of
           each sample, and the top_n value
    output: list with one predicted label per sample. A sample counts as
            correctly predicted (its true label is returned) when the true
            label is among its top_n_value most probable classes; otherwise
            the most probable class (argmax) is returned.
    """
    def pick_label(row, truth):
        # Class indices ordered from most to least probable, cut to top n.
        best_classes = np.argsort(row)[::-1][:top_n_value]
        return truth if truth in best_classes else best_classes[0]

    return [pick_label(row, truth) for row, truth in zip(probs, true_labels)]

def top_n(vals, names, top_n_val):
    """
    input: values, the names belonging to them (same order), and how many
           entries to keep
    output: names of the top_n_val largest values, largest first
    """
    descending = np.argsort(vals)[::-1]
    selected = []
    for position in descending[:top_n_val]:
        selected.append(names[position])
    return selected

def evaluation_print(true_labels, prediction, process_names, top_n_value=1):
    """
    Print an evaluation report: the confusion matrix, macro-averaged
    recall/precision, and LaTeX tables with per-class recall/precision and
    the confusion matrix.

    input: true_labels   - true class label per sample
           prediction    - predicted class label per sample (used as-is)
           process_names - raw process names; each is shortened with
                           string_preproc and paired positionally with the
                           per-class recall/precision scores
           top_n_value   - only echoed in the report; top-n relabelling
                           (see top_n_predicted) is currently not applied
    output: None, everything is written to stdout

    Side effect: changes numpy's global print options.

    Every print call takes a single pre-formatted argument so the output is
    the same under Python 2 and Python 3.
    """
    # Top-n relabelling is disabled: predictions are scored exactly as given.
    predicted_labels = prediction
    np.set_printoptions(precision=6, suppress=True)
    np.set_printoptions(threshold=10000, linewidth=1000)
    conf_matrix = confusion_matrix(true_labels, predicted_labels)

    process_names = [string_preproc(nam) for nam in process_names]
    print(process_names)
    print('top_n =  %s' % (top_n_value,))

    n = conf_matrix.shape[0]
    class_ids = list(range(n))
    print(pd.DataFrame(conf_matrix, index=class_ids, columns=class_ids))

    macro_recall = recall_score(true_labels, predicted_labels, average='macro')
    macro_precision = precision_score(true_labels, predicted_labels, average='macro')
    print('macro average recall %.2f' % macro_recall)
    print('macro average precision %.2f' % macro_precision)

    # Per-class scores as a LaTeX table body: "<index> <name> & recall & precision \\".
    per_class_recall = recall_score(true_labels, predicted_labels, average=None)
    per_class_precision = precision_score(true_labels, predicted_labels, average=None)
    print('\\begin{tabular} {rrr}')
    print('\\hline')
    for i, (name, recall, precision) in enumerate(zip(
            process_names, per_class_recall, per_class_precision)):
        print('%s %s & %.4f & %.4f \\\\' % (i, name, recall, precision))

    print('\\hline')
    print('\\end{tabular}')

    print('\n' * 2)
    # Frame the confusion matrix with a header row and column of class indices.
    # list(range(n)) is required: on Python 3 a range cannot be added to a list.
    conf_matrix = np.hstack([np.array(class_ids, ndmin=2).T, conf_matrix])
    conf_matrix = np.vstack([[0] + class_ids, conf_matrix])
    print(tabulate(conf_matrix, tablefmt="latex"))



In [8]:
# List the pickled datasets available under data/.
%ls data/

data_docker.pkl     data_prod.pkl       data_prod_test.pkl


In [9]:
# Load the docker dataset; the file unpacks into (data, proc_names) and
# %time reports how long the load takes.
# NOTE: joblib.load unpickles the file - only load files from a trusted source.
%time data, proc_names = joblib.load('data/data_docker.pkl') 

CPU times: user 3.22 s, sys: 405 ms, total: 3.62 s
Wall time: 3.62 s


In [10]:
# Every entry of `data` is a (label vector, block) pair. Blocks are transposed
# so that each row of a block becomes one sample once the blocks are stacked.
data_blocks = [block.T for _, block in data]
# Class index of each block (position of the largest entry of its label
# vector - presumably a one-hot encoding; confirm against the data producer).
block_labels = [np.argmax(label_vector) for label_vector, _ in data]
# One label per row: repeat each block's label once for every row of that
# block instead of a hard-coded 10, so labels always line up with the rows
# of np.vstack(data_blocks).
labels = [label
          for label, block in zip(block_labels, data_blocks)
          for _ in range(block.shape[0])]

In [11]:
# Three views of the same blocks:
#   data_sum    - per block, the sum over axis 1 (one value per row)
#   data_concat - per block, all values flattened into one vector
#   data_lines  - all block rows stacked into a single 2-D array
data_sum, data_concat = [], []
for block in data_blocks:
    data_sum.append(block.sum(axis=1))
    data_concat.append(block.ravel())
data_lines = np.vstack(data_blocks)

In [12]:
# Sanity check: (rows stacked from all blocks, columns per row).
data_lines.shape

(364500, 310)

In [13]:
# Hold out 20% of the individual rows for testing; the fixed seed makes the
# split reproducible.
# NOTE(review): rows of the same block can end up on both sides of the split -
# confirm that this is intended.
# Alternative kept for reference (per-block sums instead of single rows):
# train_data, test_data, train_label, test_label = train_test_split(data_sum, labels, test_size=.2, random_state=100)
train_data, test_data, train_label, test_label = train_test_split(
    data_lines,
    labels,
    test_size=0.2,
    random_state=100
)

In [14]:
# Number of held-out samples (test_size=.2 of all rows).
len(test_label)

72900

In [31]:
# Fit and score the classifier ten times, collecting macro recall (r) and
# macro precision (p) per repetition.
# NOTE(review): every repetition uses the same fixed split, so a classifier
# that yields the same result on every fit gives identical scores each time;
# the ~1e-16 standard deviation in the recorded output shows this is the case
# for LinearSVC here. Vary the split seed per repetition if spread is wanted.
r, p = [], []
for i in range(10):
#     clf = LogisticRegression(n_jobs=8)
    clf = LinearSVC(C=1)
#     clf = RandomForestClassifier()

    clf.fit(train_data, train_label)
    prediction = clf.predict(test_data)
    # print confusion_matrix(prediction, test_label)
    r.append(recall_score(test_label, prediction, average='macro'))
    p.append(precision_score(test_label, prediction, average='macro'))
    # Progress indicator; single-argument form prints the same on Python 2 and 3.
    print(i)

0
1
2
3
4
5
6
7
8
9


In [32]:
np.set_printoptions(suppress=True, precision=3)
# Summary over the repetitions, in the order:
#   precision mean, precision std, recall mean, recall std.
# numpy print options do not apply to scalars, so all four values are
# formatted explicitly; the single-argument print works on Python 2 and 3.
print('%.3f %.3f %.3f %.3f' % (np.mean(p), np.std(p), np.mean(r), np.std(r)))


0.850 1.11022302463e-16 0.827 1.11022302463e-16
