In [None]:
import jsonpickle
import time, datetime

In [None]:
import jsonpickle.ext.numpy as jsonpickle_numpy
# Register jsonpickle's numpy extension handlers before anything is dumped;
# presumably needed because the metric results written to JSON at the end of
# the notebook contain numpy values -- TODO confirm.
jsonpickle_numpy.register_handlers()

In [None]:
from skmultilearn.dataset import load_from_arff, load_dataset_dump
import cPickle as pickle
import copy
from itertools import chain
import numpy as np
import sklearn.metrics as metrics

In [None]:
# Datasets processed by this notebook, keyed by name.
# Only the keys are iterated below; the integer values are never read here
# (presumably the number of labels of each dataset -- TODO confirm).
sets = {
    # general benchmark sets
    "bibtex": 159,
    "Corel5k": 374,
    "delicious": 983,
    "genbase": 27,
    "emotions": 6,
    "enron": 53,
    "mediamill": 101,
    "medical": 45,
    "scene": 6,
    "tmc2007-500": 22,
    "yeast": 14,
    # rcv1 subsets
    "rcv1subset1": 101,
    "rcv1subset2": 101,
    "rcv1subset3": 101,
    "rcv1subset4": 101,
    "rcv1subset5": 101,
}

In [None]:
def _precision_macro(y_true, y_pred):
    """Precision score with average='macro'."""
    return metrics.precision_score(y_true, y_pred, average='macro')


def _roc_auc_micro(y_true, y_score):
    """ROC AUC score with average='micro'."""
    return metrics.roc_auc_score(y_true, y_score, average='micro')


# Metrics applied to the stored label predictions; every callable is invoked
# as f(ground_truth, predictions).
classifiermetrics = {
    'precision-macro': _precision_macro,
    'hamming_loss': metrics.hamming_loss,
    'accuracy_score': metrics.accuracy_score,
}

# Metrics applied to the stored probabilities; every callable is invoked
# as f(ground_truth, probabilities).
probsmetrics = {
    'coverage_error': metrics.coverage_error,
    'label_ranking_loss': metrics.label_ranking_loss,
    'roc_auc-micro': _roc_auc_micro,
}

In [None]:
def load_set(s):
    """Load one dataset dump and its pickled fold definitions.

    Parameters
    ----------
    s : str
        Dataset name; selects ``./dumps/<s>.scikitml.bz2`` and
        ``./folds/<s>.pickle``.

    Returns
    -------
    tuple
        ``(data, fold_data)``: whatever ``load_dataset_dump`` returns for the
        dump file, and the object unpickled from the folds file.
    """
    data = load_dataset_dump('./dumps/{}.scikitml.bz2'.format(s))

    # Pickles are byte streams: open in binary mode so binary protocols load
    # correctly on every platform (text mode only happens to work on POSIX).
    # NOTE: only unpickle files you trust -- pickle can execute arbitrary code.
    with open("./folds/{}.pickle".format(s), "rb") as fp:
        fold_data = pickle.load(fp)

    return data, fold_data

In [None]:
def measure(s, measures, probsmeasures, source):
    """Score the stored predictions of one classifier family on one dataset.

    Loads the dataset and its fold definitions through ``load_set`` and the
    pickled predictions from ``./predictions/<source>/<s>.pickle``, then
    evaluates every metric on each of the 10 splits of every fold definition.

    Parameters
    ----------
    s : str
        Dataset name; used to build the dump, fold and prediction paths.
    measures : dict
        Metric name -> callable ``f(y_true, y_pred)``; applied to the first
        element of the prediction pickle (label predictions).
    probsmeasures : dict
        Metric name -> callable ``f(y_true, y_score)``; applied to the second
        element of the prediction pickle (probabilities).
    source : str
        Sub-directory of ``./predictions`` to read from (e.g. ``'br'``).

    Returns
    -------
    dict
        ``results[metric_name][fold_name]`` is a list holding one score per
        split, in split order.
    """
    n_splits = 10
    # Progress line: dataset, number of splits, start timestamp.
    print("{} {} {}".format(s, n_splits, time.time()))
    dataset, fold_data = load_set(s)
    y = dataset['y']

    # Binary mode: pickles are byte streams.
    # NOTE: only unpickle files you trust -- pickle can execute arbitrary code.
    with open("./predictions/{}/{}.pickle".format(source, s), "rb") as fp:
        d = pickle.load(fp)

    predictions = d[0]
    probs = d[1]

    metric_names = list(measures) + list(probsmeasures)
    results = {m: {n: [] for n in fold_data} for m in metric_names}

    for name, f in fold_data.items():
        for split in range(n_splits):
            fold = f[split]
            # A split is stored either as a (train_idx, test_idx) pair or as
            # the bare collection of test indices.
            # NOTE(review): a bare test fold with exactly two samples would be
            # mistaken for a pair -- confirm this cannot occur in the folds.
            test_idx = fold[1] if len(fold) == 2 else fold

            # Densify the ground truth once per split, not once per metric.
            y_true = y[test_idx, :].todense()

            for m, fun in measures.items():
                results[m][name].append(fun(y_true, predictions[name][split].todense()))

            for m, fun in probsmeasures.items():
                results[m][name].append(fun(y_true, probs[name][split].todense()))
    return results

In [None]:
# Metric results for the two baseline prediction sources, indexed as
# data[source][dataset][metric][fold_name] -> list of per-split scores.
data = {
    source: {
        dataset: measure(dataset, classifiermetrics, probsmetrics, source)
        for dataset in sets
    }
    for source in ['br', 'lp']
}

In [None]:
from skmultilearn.cluster import IGraphLabelCooccurenceClusterer
from skmultilearn.ensemble import LabelSpacePartitioningClassifier

In [None]:
def measure_graph(s, measures, probsmeasures):
    """Score the graph-partitioned classifiers and collect partition statistics.

    Reads ``./predictions/graphs/<s>.pickle`` and, for the weighted ("FGW")
    and unweighted ("FG") fastgreedy variants, evaluates every metric on each
    of the 10 splits of every fold definition. For each split it additionally
    fits an ``IGraphLabelCooccurenceClusterer`` on the test rows of ``y`` and
    records the resulting partition and its modularity.

    Parameters
    ----------
    s : str
        Dataset name; used to build the dump, fold and prediction paths.
    measures : dict
        Metric name -> callable ``f(y_true, y_pred)``; applied to the stored
        label predictions (``d[0]``).
    probsmeasures : dict
        Metric name -> callable ``f(y_true, y_score)``; applied to the stored
        probabilities (``d[1]``).

    Returns
    -------
    tuple
        ``(results, communities, modularities, test_mods, test_parts)``, each
        a dict keyed by method name ("FGW" / "FG"):

        * ``results[method][metric][fold_name]`` -- list of per-split scores;
        * ``communities[method]`` -- ``d[3]`` with every community as a list;
        * ``modularities[method]`` -- shallow copy of ``d[4]``;
        * ``test_mods[method][fold_name]`` -- per-split test modularity;
        * ``test_parts[method][fold_name]`` -- per-split test partition.
    """
    n_splits = 10
    # Progress line: dataset, number of splits, start timestamp.
    print("{} {} {}".format(s, n_splits, time.time()))
    dataset, fold_data = load_set(s)
    y = dataset['y']

    # Binary mode: pickles are byte streams.
    # NOTE: only unpickle files you trust -- pickle can execute arbitrary code.
    with open("./predictions/graphs/{}.pickle".format(s), "rb") as fp:
        d = pickle.load(fp)

    metric_names = list(measures) + list(probsmeasures)

    results = {}
    modularities = {}
    communities = {}
    test_mods = {}
    test_parts = {}

    for graph_method in [('fastgreedy', True), ('fastgreedy', False)]:
        method, weighted = graph_method
        # The pickle is keyed by the full (method, weighted) tuple.
        predictions = d[0][graph_method]
        probs = d[1][graph_method]
        m_name = "FGW" if weighted else "FG"

        results[m_name] = {m: {n: [] for n in fold_data} for m in metric_names}

        # Stored communities/modularities (exported later in the notebook as
        # the "train_*" entries); communities are normalised to plain lists.
        communities[m_name] = {
            k: [list(community) for community in v]
            for k, v in d[3][graph_method].items()
        }
        modularities[m_name] = copy.copy(d[4][graph_method])
        test_parts[m_name] = {n: [] for n in fold_data}
        test_mods[m_name] = {n: [] for n in fold_data}

        for name, f in fold_data.items():
            for split in range(n_splits):
                fold = f[split]
                # A split is stored either as a (train_idx, test_idx) pair or
                # as the bare collection of test indices.
                # NOTE(review): a bare test fold with exactly two samples
                # would be mistaken for a pair -- confirm this cannot occur.
                test_idx = fold[1] if len(fold) == 2 else fold

                # Slice and densify the ground truth once per split.
                y_test = y[test_idx, :]
                y_true = y_test.todense()

                for m, fun in measures.items():
                    results[m_name][m][name].append(fun(y_true, predictions[name][split].todense()))

                for m, fun in probsmeasures.items():
                    results[m_name][m][name].append(fun(y_true, probs[name][split].todense()))

                # Partition the labels using the test rows only.
                clusterer = IGraphLabelCooccurenceClusterer(method, weighted=weighted, include_self_edges=False)
                clusterer.fit_predict(None, y_test)
                test_mods[m_name][name].append(clusterer.partition.modularity)
                test_parts[m_name][name].append(list(clusterer.partition))

    return results, communities, modularities, test_mods, test_parts

In [None]:
# Graph-based results for every dataset except 'delicious'; each value is
# the 5-tuple returned by measure_graph.
r_graph = dict(
    (dataset, measure_graph(dataset, classifiermetrics, probsmetrics))
    for dataset in sets
    if dataset != 'delicious'
)

In [None]:
# Show which datasets have graph-based results.
set(r_graph)

In [None]:
# Method names ("FG"/"FGW") present in the results dict of measure_graph,
# read off the 'scene' dataset.
available_network_methods = list(r_graph['scene'][0])

In [None]:
# Sanity check: graph-based and baseline results expose the same metric names.
# Compared as sets because the order of dict.keys() is arbitrary in Python 2,
# so comparing the key lists directly could fail spuriously.
set(r_graph['scene'][0]['FG']) == set(data['br']['scene'])

In [None]:
# Add each graph-based method to `data` next to the 'br'/'lp' baselines,
# using the same data[method][dataset] layout.
for m in available_network_methods:
    print(m)
    data[m] = dict((dataset, r_graph[dataset][0][m]) for dataset in r_graph)

In [None]:
import os

# Create the output directory in pure Python so the cell can be re-run:
# an already existing ./results directory is not an error, and no shell
# `mkdir` is required.
try:
    os.makedirs("./results")
except OSError:
    if not os.path.isdir("./results"):
        raise

In [None]:
# Persist all metric results (baselines and graph methods) as one JSON file.
with open("./results/classification.json", "wb") as out_file:
    classification_json = jsonpickle.dumps(data)
    out_file.write(classification_json)

In [None]:
# Export names for the 2nd..5th elements of measure_graph's return tuple;
# the order here must match the order of that tuple.
graph_data = [
    "train_communities",
    "train_modularities",
    "test_modularities",
    "test_communities",
]

In [None]:
# Per dataset, label elements 1..4 of the measure_graph tuple with the names
# in graph_data (element 0, the metric results, is exported separately).
graph_data_dict = {
    dataset: {
        label: r_graph[dataset][position]
        for position, label in enumerate(graph_data, 1)
    }
    for dataset in r_graph
}

In [None]:
# Persist the community/modularity data of the graph methods as JSON.
with open("./results/networks.json", "wb") as out_file:
    networks_json = jsonpickle.dumps(graph_data_dict)
    out_file.write(networks_json)