In [None]:
import jsonpickle
import time, datetime

In [None]:
import uuid
from pylocker import Locker

In [None]:
from skmultilearn.dataset import load_from_arff, load_dataset_dump
import cPickle as pickle
import copy
from itertools import chain
import numpy as np

In [None]:
from builtins import range
from skmultilearn.cluster.base import LabelCooccurenceClustererBase
import numpy as np
import igraph as ig


class IGraphLabelCooccurenceClusterer(LabelCooccurenceClustererBase):

    """Label space clusterer backed by igraph community detection

    A label co-occurrence graph is assembled from the label matrix and handed
    to one of the igraph routines registered in ``METHODS``.

    Parameters
    ----------

    method : key of `IGraphLabelCooccurenceClusterer.METHODS`
        name of the igraph community detection routine to run; any other
        value makes the constructor raise ``ValueError``

    weighted: boolean
            Forwarded to the base class; decides whether the graph is
            weighted or unweighted.

    include_self_edges : boolean
            Forwarded to the base class; decides whether self-edges
            (i.e. label 1 - label 1) are part of the co-occurrence graph.

    """

    # Each entry is called as ``entry(graph, weights)``. The fastgreedy and
    # walktrap results are converted with ``as_clustering()``; the other two
    # results are used as returned by igraph.
    METHODS = {
        'fastgreedy': lambda graph, w=None: (
            graph.community_fastgreedy(weights=w).as_clustering()),
        'infomap': lambda graph, w=None: (
            graph.community_infomap(edge_weights=w, trials=1000)),
        'label_propagation': lambda graph, w=None: (
            graph.community_label_propagation(weights=w)),
        'walktrap': lambda graph, w=None: (
            graph.community_walktrap(weights=w).as_clustering()),
    }

    def __init__(self, method=None, weighted=None, include_self_edges=None):
        super(IGraphLabelCooccurenceClusterer, self).__init__(
            weighted=weighted, include_self_edges=include_self_edges)
        self.method = method

        supported = IGraphLabelCooccurenceClusterer.METHODS
        if method not in supported:
            message = "{} not a supported igraph community detection method"
            raise ValueError(message.format(method))

    def fit_predict(self, X, y):
        """Cluster the labels of `y` into communities

        Calls :func:`LabelCooccurenceClustererBase.generate_coocurence_adjacency_matrix`
        on `y`, builds an igraph graph from the resulting `edge_map` and runs
        the configured `method` on it. The edge weights, the graph and the
        detected partition stay available on the instance as `weights`,
        `coocurence_graph` and `partition`.

        Parameters
        ----------
        X : sparse matrix (n_samples, n_features), feature space, not used in this clusterer
        y : sparse matrix (n_samples, n_labels), label space

        Returns
        -------
        partition: the detected partition wrapped in `np.array`; each entry
            holds the label indexes that belong to one community

        """
        self.generate_coocurence_adjacency_matrix(y)

        # an unweighted run still carries a 'weight' entry, set to None
        edge_weights = (
            list(self.edge_map.values()) if self.is_weighted else None)
        self.weights = dict(weight=edge_weights)

        vertex_names = list(range(1, self.label_count + 1))
        self.coocurence_graph = ig.Graph(
            edges=list(self.edge_map),
            vertex_attrs=dict(name=vertex_names),
            edge_attrs=self.weights
        )

        detect = IGraphLabelCooccurenceClusterer.METHODS[self.method]
        self.partition = detect(self.coocurence_graph, self.weights['weight'])
        return np.array(self.partition)

In [None]:
from builtins import zip
from builtins import range
from skmultilearn.problem_transform.br import BinaryRelevance
import copy
import numpy as np

from scipy import sparse
from skmultilearn.utils import get_matrix_in_format


class LabelSpacePartitioningClassifier(BinaryRelevance):
    """Classifier that predicts per label-space cluster and merges the results

    The label space is split by `clusterer`; at prediction time one entry of
    `self.classifiers` is queried per cluster and its output columns are
    mapped back to the original label indexes.

    Parameters
    ----------

    classifier : scikit classifier type
        The base classifier, handed to the ``BinaryRelevance`` constructor.

    clusterer: an skmultilearn.cluster.base object that partitions the output space
        Kept as ``self.clusterer``; its ``fit_predict(X, y)`` provides the
        partition.

    require_dense : [boolean, boolean]
        Whether the base classifier requires input as dense arrays; handed
        to the ``BinaryRelevance`` constructor.

    """

    def __init__(self, classifier=None, clusterer=None, require_dense=None):
        super(LabelSpacePartitioningClassifier, self).__init__(
            classifier, require_dense)
        self.copyable_attrs = ['clusterer', 'classifier', 'require_dense']
        self.clusterer = clusterer

    def generate_partition(self, X, y):
        """Ask the clusterer for a partition of `y` and record its size.

        Sets `partition`, `model_count` (number of clusters) and
        `label_count` (number of columns of `y`), then returns `self`.
        """
        partition = self.clusterer.fit_predict(X, y)
        self.partition = partition
        self.model_count = len(partition)
        self.label_count = y.shape[1]

        return self

    def predict(self, X):
        """Predict label assignments for X as a sparse integer matrix.

        Every non-zero prediction of a cluster's classifier sets a 1 in the
        column of the original label it corresponds to.
        """
        X = self.ensure_input_format(
            X, sparse_format='csr', enforce_sparse=True)
        n_samples = X.shape[0]
        result = sparse.lil_matrix((n_samples, self.label_count), dtype=int)

        for model_idx in range(self.model_count):
            raw_output = self.classifiers[model_idx].predict(X)
            hits = self.ensure_output_format(
                raw_output, sparse_format=None, enforce_sparse=True).nonzero()
            # hits holds the (row, column) coordinates of the non-zero cells;
            # the column is local to the cluster and is mapped back here
            for row, column in zip(hits[0], hits[1]):
                result[row, self.partition[model_idx][column]] = 1

        return result

    def predict_proba(self, X):
        """Predict label probabilities for X as a sparse float matrix.

        Each output column of a cluster's classifier is copied into the
        column of the original label it corresponds to.
        """
        X = self.ensure_input_format(
            X, sparse_format='csr', enforce_sparse=True)
        n_samples = X.shape[0]
        result = sparse.lil_matrix((n_samples, self.label_count), dtype=float)

        for model_idx in range(self.model_count):
            raw_output = self.classifiers[model_idx].predict_proba(X)
            probabilities = self.ensure_output_format(
                raw_output, sparse_format=None, enforce_sparse=True)
            for column, label in enumerate(self.partition[model_idx]):
                result[:, label] = probabilities[:, column]

        return result

In [None]:
# Datasets to run, keyed by the name that is later substituted into the dump,
# fold and output file paths. The integer is stored next to the "claimed" flag
# in ./prediction_graphs.json and is the ascending sort key get_me_set() uses
# to choose the next dataset, so entries with smaller values are handed out
# first.
# NOTE(review): the values look like per-dataset label counts -- confirm.
sets = {
    'bibtex': 159,
    'Corel5k': 374,
    'delicious': 983,
    'genbase': 27,
    'emotions': 6,
    'enron': 53,
    'mediamill': 101,
    'medical': 45,
    'scene': 6,
    'tmc2007-500': 22,
    'yeast': 14,
    'rcv1subset1': 101,
    'rcv1subset2': 101,
    'rcv1subset3': 101,
    'rcv1subset4': 101,
    'rcv1subset5': 101,
}

In [None]:
# initialize the experiment: every dataset starts out unclaimed.
# Claim file layout: {dataset name: [sort value, claimed flag]}.
# dict.items() replaces the Python 2-only iteritems(); the resulting mapping
# is the same and the cell also runs on Python 3.
is_done = {name: [value, False] for name, value in sets.items()}
with open("./prediction_graphs.json", "w") as fp:
    fp.write(jsonpickle.dumps(is_done))

In [None]:
def get_me_set():
    """Claim the next unprocessed dataset from the shared claim file.

    ``./prediction_graphs.json`` is opened through a ``Locker`` with a fresh
    lock pass. While inside the locker's ``with`` block, the unclaimed entry
    with the smallest first element is flagged as claimed and the file is
    rewritten in place.

    Returns
    -------
    The name of the claimed dataset, or ``None`` when every entry is already
    claimed. ``None`` is also returned when the locker did not provide a file
    descriptor; a diagnostic line is printed in that case so the situation is
    not mistaken for "no work left".
    """
    # create a unique lock pass. This can be any string.
    lock_pass = str(uuid.uuid1())

    # create locker instance
    locker = Locker(filePath="./prediction_graphs.json",
                    lockPass=lock_pass, mode='r+')

    # acquire the lock
    with locker as lock_result:
        acquired, code, fd = lock_result

        # check if acquired.
        if fd is None:
            # The return value stays None for backward compatibility, but the
            # failure is reported instead of passing silently.
            print("get_me_set: no file descriptor for ./prediction_graphs.json "
                  "(acquired={}, code={})".format(acquired, code))
            return None

        status = jsonpickle.loads(fd.read())

        # A generator with next() is used instead of filter() + len():
        # filter() returns a lazy iterator on Python 3, which supports
        # neither len() nor indexing.
        by_sort_value = sorted(status, key=lambda name: status[name][0])
        chosen = next(
            (name for name in by_sort_value if status[name][1] is not True),
            None)
        if chosen is None:
            return None

        # mark the entry as claimed and overwrite the file contents in place
        status[chosen][1] = True
        fd.seek(0)
        fd.write(jsonpickle.dumps(status))
        fd.truncate()
        return chosen

In [None]:
from sklearn.ensemble import RandomForestClassifier
from skmultilearn.problem_transform import LabelPowerset

In [None]:
def load_set(s):
    """Load the dataset dump and the pickled fold data for dataset `s`.

    Parameters
    ----------
    s : dataset name, substituted into ``./dumps/{}.scikitml.bz2`` and
        ``./folds/{}.pickle``

    Returns
    -------
    (data, fold_data) : the result of ``load_dataset_dump`` and the object
        unpickled from the fold file
    """
    data = load_dataset_dump('./dumps/{}.scikitml.bz2'.format(s))

    # Pickles are binary data and must be read in binary mode: text mode
    # corrupts them on platforms that translate line endings and is rejected
    # outright by pickle on Python 3.
    # NOTE: unpickling executes code from the file; only load fold files you
    # produced yourself.
    with open("./folds/{}.pickle".format(s), "rb") as fp:
        fold_data = pickle.load(fp)

    return data, fold_data

In [None]:
# every (community detection method, weighted graph?) pair to evaluate
param_list_for_cluster = [
    (method, is_weighted)
    for method in ['fastgreedy']
    for is_weighted in [True, False]
]
param_list_for_cluster

In [None]:
def _split_indices(fold, split, n_splits):
    """Return ``(train_idx, test_idx)`` for one split of a fold definition."""
    entry = fold[split]
    if len(entry) == 2:
        # the split is stored as an explicit (train, test) pair
        # NOTE(review): a split that merely holds two test indices would also
        # take this branch -- confirm the fold files never contain that case.
        return entry[0], entry[1]

    # otherwise every entry is the test part of its split; train on the rest
    train_idx = list(chain.from_iterable(
        fold[i] for i in range(n_splits) if i != split))
    return train_idx, entry


def classify(s, param_list_for_cluster):
    """Run the cross-validated clustering experiment for dataset `s`.

    For every ``(method, is_weighted)`` pair, every fold definition returned
    by ``load_set(s)`` and each of the 10 splits, a
    ``LabelSpacePartitioningClassifier`` (LabelPowerset over a random forest,
    partitioned by ``IGraphLabelCooccurenceClusterer``) is fitted on the
    training rows and evaluated on the test rows. Progress lines with a rough
    finish estimate are printed along the way.

    Parameters
    ----------
    s : dataset name understood by ``load_set``
    param_list_for_cluster : non-empty sequence of hashable
        ``(method, is_weighted)`` pairs; the output file name is built from
        the last pair

    Returns
    -------
    None. The list ``[predictions, probs, times, partitions, modularities]``
    is pickled to ``./predictions/graphs/<s>-<method>-<is_weighted>.pickle``;
    each element is a dict keyed by parameter pair, then by fold name.
    """
    n_splits = 10
    print("{} {} {}".format(s, n_splits, time.time()))
    data, fold_data = load_set(s)
    X = data['X']
    y = data['y']

    def per_param_and_fold(make_value):
        # fresh {param pair: {fold name: value}} structure
        return {p: {name: make_value() for name in fold_data}
                for p in param_list_for_cluster}

    predictions = per_param_and_fold(lambda: [None] * n_splits)
    probs = per_param_and_fold(lambda: [None] * n_splits)
    times = per_param_and_fold(list)
    modularities = per_param_and_fold(list)
    partitions = per_param_and_fold(list)

    # number of fit/predict rounds still to run, used for the estimate only
    left = len(fold_data) * n_splits * len(param_list_for_cluster)
    for param_set in param_list_for_cluster:
        method, is_weighted = param_set
        for name, fold in fold_data.items():
            for split in range(n_splits):
                train_idx, test_idx = _split_indices(fold, split, n_splits)

                # average duration of the rounds finished so far; buckets
                # without measurements count as 0.0
                mean = np.mean([
                    np.mean([np.mean(durations) if len(durations) > 0 else 0.0
                             for durations in per_fold.values()])
                    for per_fold in times.values()])

                t = time.time()
                # columns: dataset, fold, split, method, weighted,
                # estimated end of this round, estimated end of all rounds
                print(" ".join(str(value) for value in (
                    s, name, split, method, is_weighted,
                    datetime.datetime.fromtimestamp(t + mean),
                    datetime.datetime.fromtimestamp(t + left * mean))))
                left -= 1

                clusterer = IGraphLabelCooccurenceClusterer(
                    method, weighted=is_weighted, include_self_edges=False)

                # construct base forest classifier
                base_classifier = RandomForestClassifier(n_jobs=15)

                # setup problem transformation approach with sparse matrices
                # for random forest
                pt_classifier = LabelPowerset(classifier=base_classifier,
                                              require_dense=[False, True])

                # wrap it in the label space partitioning classifier that is
                # driven by the clusterer
                classifier = LabelSpacePartitioningClassifier(
                    pt_classifier, clusterer)
                classifier.fit(X[train_idx, :], y[train_idx, :])

                fitted_partition = classifier.clusterer.partition
                modularities[param_set][name].append(
                    fitted_partition.modularity)
                partitions[param_set][name].append(
                    copy.copy(fitted_partition))

                predictions[param_set][name][split] = classifier.predict(
                    X[test_idx, :])
                probs[param_set][name][split] = classifier.predict_proba(
                    X[test_idx, :])
                times[param_set][name].append(time.time() - t)

    # NOTE(review): the file name carries only the last (method, is_weighted)
    # pair although the payload covers every pair; kept as is because other
    # code may rely on the name.
    # Binary mode is required for pickle data ("w" corrupts it on platforms
    # that translate line endings and fails on Python 3).
    out_path = "./predictions/graphs/{}-{}-{}.pickle".format(
        s, method, is_weighted)
    with open(out_path, "wb") as fp:
        pickle.dump([predictions, probs, times, partitions, modularities], fp)

In [None]:
# keep claiming and processing datasets until get_me_set() returns None
while True:
    s = get_me_set()
    if s is None:
        break
    classify(s, param_list_for_cluster)