In [439]:
from newsgac import database
from newsgac.ace.models import ACE
from newsgac.pipelines.models import Pipeline
from newsgac.genres import genre_labels
from scipy.sparse import csr_matrix
import ipywidgets as widgets
import numpy as np
import re
import pprint

# Number of strongest positive and strongest negative feature weights to display.
show_number_of_features = 20

# Only pipelines whose display title mentions an SVC/SVM are offered for selection.
svc_pipelines = [
    candidate
    for candidate in Pipeline.objects.all()
    if re.search(r'svc|SVC|svm|SVM', candidate.display_title)
]

# Dropdown for choosing the pipeline; the first entry is a placeholder whose value is None.
pipeline_widget = widgets.Dropdown(
    description='pipeline:',
    disabled=False,
    options=[('Select...', None)] + [
        (candidate.display_title, candidate._id) for candidate in svc_pipelines
    ],
)

# Radio buttons for the two classes to compare; both start without any options.
class_1_widget = widgets.RadioButtons(description='class 1:', disabled=False, options=[])
class_2_widget = widgets.RadioButtons(description='class 2:', disabled=False, options=[])

def sorted_genre_indices(pipeline):
    """Return the genre indices that belong to ``pipeline``.

    When the pipeline has a result carrying at least one sorted label, those
    labels are returned as-is. Otherwise the fallback is every index of
    ``genre_labels`` except the last one (the final entry is treated as the
    "unlabeled" genre and is left out).
    """
    result = pipeline.result
    if result and len(result.sorted_labels) != 0:
        return result.sorted_labels
    # No usable result: fall back to all genres, skipping the trailing one.
    return list(range(len(genre_labels) - 1))

def get_coef_map(indices):
    """Map a running coefficient index to its ``(class 1, class 2)`` pair.

    Pairs are enumerated in order: the first element combined with every
    later element, then the second element combined with every later
    element, and so on. For example ``get_coef_map([1, 4, 7, 8])`` gives::

        {0: (1, 4), 1: (1, 7), 2: (1, 8), 3: (4, 7), 4: (4, 8), 5: (7, 8)}

    An input with fewer than two elements yields an empty dict.
    """
    count = len(indices)
    class_pairs = [
        (indices[first], indices[second])
        for first in range(count)
        for second in range(first + 1, count)
    ]
    return dict(enumerate(class_pairs))

def set_class_widget_options(pipeline):
    """Fill both class radio-button widgets with the genres of ``pipeline``.

    Each option is a ``(genre label, genre index)`` pair, built from the
    indices returned by ``sorted_genre_indices(pipeline)``.
    """
    # e.g. [1, 4, 5]
    genre_indices = sorted_genre_indices(pipeline)
    # e.g. ['nieuwsbericht', 'service', 'mededeling']
    labels_for_indices = np.array(genre_labels)[genre_indices]

    options = list(zip(labels_for_indices, genre_indices))
    for class_widget in (class_1_widget, class_2_widget):
        class_widget.options = options

    
def out(pipeline_id, class_1, class_2):
    """Print the strongest feature weights separating two classes of a pipeline.

    Callback for the interactive widgets: it refreshes the class options for
    the selected pipeline, then prints the ``show_number_of_features`` most
    positive and most negative classifier weights for the chosen class pair.

    Args:
        pipeline_id: ``_id`` of the pipeline to inspect, or None when nothing
            is selected (the class widgets are then emptied).
        class_1: genre index of the first class, or None.
        class_2: genre index of the second class, or None.

    Returns:
        None. All results and error messages are printed.
    """
    if pipeline_id is None:
        class_1_widget.options = []
        class_2_widget.options = []
        return

    p = Pipeline.objects.get({'_id': pipeline_id})

    set_class_widget_options(p)

    # The class values may be None (e.g. while the class widgets have no
    # selection yet); treat that the same as an incomplete choice instead of
    # failing later on genre_labels[None].
    if class_1 is None or class_2 is None or class_1 == class_2:
        print('Pick two different classes')
        return
    if class_2 < class_1:
        class_1, class_2 = class_2, class_1

    pipeline_sorted_genre_indices = sorted_genre_indices(p)

    # Inverse of get_coef_map, so the coef index can be looked up by class
    # pair, e.g. coef_index_by_pair[(1, 7)] == 1. Uses .items() so it works on
    # both Python 2 and Python 3.
    coef_index_by_pair = {
        pair: coef_index
        for coef_index, pair in get_coef_map(pipeline_sorted_genre_indices).items()
    }
    coef_index = coef_index_by_pair.get((class_1, class_2))
    if coef_index is None:
        # Happens when the widgets still hold classes of a previously selected
        # pipeline, or when the pipeline's indices are not in ascending order.
        print("Error: class pair (%s, %s) is not available for this pipeline" % (class_1, class_2))
        return

    print("Showing %s vs %s for %s " % (genre_labels[class_1], genre_labels[class_2], p.display_title))
    skp = p.sk_pipeline.get()
    classifier = skp.named_steps['Classifier']
    num_classes = len(classifier.classes_)

    if num_classes != len(pipeline_sorted_genre_indices):
        print("Error: num_classes vs result genre indices count mismatch: %s vs %s" % (num_classes, len(pipeline_sorted_genre_indices)))
        return

    # NOTE(review): assumes coef_ holds one row per class pair in the same
    # order as get_coef_map produces — confirm for the classifier in use.
    weights = classifier.coef_[coef_index]

    if isinstance(weights, csr_matrix):
        # A sparse coef_ row comes back as a 1 x n matrix; flatten it to 1-D.
        weights = np.asarray(weights.todense())[0]

    # Sort once; reversing before slicing also yields an empty selection when
    # show_number_of_features is 0 (a [-0:] slice would return everything).
    ascending_order = np.argsort(weights)
    strongest_pos_indices = ascending_order[::-1][:show_number_of_features]
    strongest_neg_indices = ascending_order[:show_number_of_features]

    feature_names = np.array(skp.get_feature_names())
    # list(...) is needed on Python 3, where zip returns a lazy iterator that
    # pprint would otherwise show as "<zip object ...>".
    print("Positive:")
    pprint.pprint(list(zip(
        ['%.2f' % w for w in weights[strongest_pos_indices]],
        feature_names[strongest_pos_indices]
    )))
    print("Negative:")
    pprint.pprint(list(zip(
        ['%.2f' % w for w in weights[strongest_neg_indices]],
        feature_names[strongest_neg_indices]
    )))
    
# Layout: the three selection widgets in a row, with the output of `out`
# rendered underneath. This stays the cell's final expression so it is displayed.
widgets.VBox([
    widgets.HBox([pipeline_widget, class_1_widget, class_2_widget]),
    widgets.interactive_output(
        out,
        {
            'pipeline_id': pipeline_widget,
            'class_1': class_1_widget,
            'class_2': class_2_widget,
        }
    )
])

VkJveChjaGlsZHJlbj0oSEJveChjaGlsZHJlbj0oRHJvcGRvd24oZGVzY3JpcHRpb249dSdwaXBlbGluZTonLCBvcHRpb25zPSgoJ1NlbGVjdC4uLicsIE5vbmUpLCAodSdTVkMgRlJPRycsIE/igKY=


In [438]:
# Scratch check: display the sorted_labels stored on the result of the pipeline
# that is currently selected in pipeline_widget.
# NOTE(review): pipeline_widget.value is None while 'Select...' is chosen —
# presumably this lookup fails in that case; confirm before relying on the cell.
Pipeline.objects.get({'_id': pipeline_widget.value}).result.sorted_labels

[0, 7, 8]