In [None]:
import jsonpickle
import time, datetime

In [None]:
import uuid
from pylocker import Locker

In [None]:
from skmultilearn.dataset import load_from_arff, load_dataset_dump
import cPickle as pickle
import copy
from itertools import chain
import numpy as np

In [None]:
# Datasets taking part in the experiment, each mapped to an integer that is
# stored alongside the dataset name in the experiment state.
# NOTE(review): the integers look like per-dataset label counts — confirm.
sets = {
    'bibtex': 159,
    'Corel5k': 374,
    'delicious': 983,
    'genbase': 27,
    'emotions': 6,
    'enron': 53,
    'mediamill': 101,
    'medical': 45,
    'scene': 6,
    'tmc2007-500': 22,
    'yeast': 14,
}

# The five rcv1 subsets all share the same value, so generate their entries.
sets.update(dict(
    ('rcv1subset{}'.format(subset_no), 101) for subset_no in range(1, 6)
))

In [None]:
# Initialize the experiment state: every dataset from `sets` is recorded as
# [value from `sets`, False], the flag meaning "not handed out yet".
is_done = dict((name, [size, False]) for name, size in sets.iteritems())

# Persist the fresh state as JSON, overwriting whatever was stored before.
with open("./prediction_br_probs.json", "w") as state_file:
    state_file.write(jsonpickle.dumps(is_done))

In [None]:
def get_me_set(state_path="./prediction_br_probs.json"):
    """Claim the next unprocessed dataset from the experiment state file.

    The state file holds a jsonpickle-encoded dict that maps each dataset
    name to a two-item list ``[size, claimed]``. While holding the file lock,
    the unclaimed dataset with the smallest ``size`` is flagged as claimed,
    the file is rewritten in place and the dataset name is returned.

    :param state_path: path of the state file. Defaults to the file written
        by the "initialize the experiment" cell, so that initialization and
        consumption operate on the same file.
    :type state_path: str
    :returns: name of the claimed dataset, or ``None`` when every dataset is
        already claimed or when the lock could not be acquired.
    """
    # Create a unique lock pass for this call. This can be any string.
    lock_pass = str(uuid.uuid1())

    # The locker opens the state file in read/write mode once the lock is held.
    locker = Locker(filePath=state_path, lockPass=lock_pass, mode='r+')

    with locker as lock_result:
        acquired, code, fd = lock_result

        # No file descriptor means the lock was not acquired. Report it so
        # this case can be told apart from "nothing left to do".
        if fd is None:
            print("could not lock {} (code: {})".format(state_path, code))
            return None

        state = jsonpickle.loads(fd.read())

        # Unclaimed datasets, ordered by the value stored next to the flag.
        # A list comprehension (instead of filter) keeps len()/indexing valid
        # on both Python 2 and Python 3.
        pending = [
            name
            for name in sorted(state.keys(), key=lambda key: state[key][0])
            if state[name][1] is not True
        ]
        if not pending:
            return None

        chosen = pending[0]
        state[chosen][1] = True

        # Rewrite the file from the start and cut off leftovers of the old,
        # possibly longer, content.
        fd.seek(0)
        fd.write(jsonpickle.dumps(state))
        fd.truncate()
        return chosen

In [None]:
from sklearn.ensemble import RandomForestClassifier
from skmultilearn.cluster import IGraphLabelCooccurenceClusterer
from skmultilearn.ensemble import LabelSpacePartitioningClassifier


In [None]:
from builtins import range
from skmultilearn.base.problem_transformation import ProblemTransformationBase
from scipy.sparse import hstack, coo_matrix, csc_matrix
from sklearn.utils import check_array
import copy


class BinaryRelevance(ProblemTransformationBase):

    """Binary Relevance Multi-Label Classifier.

    Transforms a multi-label classification problem with L labels
    into L single-label separate binary classification problems
    using the same base classifier provided in the constructor. The
    prediction output is the union of all per label classifiers.

    :param classifier: clonable scikit-compatible base classifier
    :type classifier: :py:class:`sklearn.base.BaseEstimator` or compatible

    :param require_dense: whether the base classifier requires dense
        representations for input features and classes/labels matrices in fit/predict.
    :type require_dense: [bool, bool]

    """

    BRIEFNAME = "BR"

    def __init__(self, classifier=None, require_dense=None):
        super(BinaryRelevance, self).__init__(classifier, require_dense)

    def generate_partition(self, X, y):
        """ Partitions the label space into singletons

            :param X: not used
            :param y: binary indicator matrix with label assignments -
                only used for learning # of labels
            :type y: matrix or sparse matrix

            Sets self.partition (list with one label index per model, i.e.
            ``[0, 1, ..., n_labels - 1]``) and self.model_count (equal to
            number of labels)

        """
        self.partition = list(range(y.shape[1]))
        self.model_count = y.shape[1]

    def fit(self, X, y):
        """Fit classifier with training data

        X is first requested in sparse CSR form and y in sparse CSC form via
        ``ensure_input_format`` / ``ensure_output_format``; one deep copy of
        the base classifier is then fitted per label column.

        :param X: input features
        :type X: dense or sparse matrix (n_samples, n_features)
        :param y: binary indicator matrix with label assignments
        :type y: dense or sparse matrix of {0, 1} (n_samples, n_labels)
        :returns: Fitted instance of self

        """
        X = self.ensure_input_format(
            X, sparse_format='csr', enforce_sparse=True)
        y = self.ensure_output_format(
            y, sparse_format='csc', enforce_sparse=True)

        self.generate_partition(X, y)
        self.classifiers = []

        # The feature matrix is the same for every label, so convert it to
        # the base classifier's expected format once, not once per label.
        X_base = self.ensure_input_format(X)

        for i in range(self.model_count):
            classifier = copy.deepcopy(self.classifier)
            y_subset = self.generate_data_subset(y, self.partition[i], axis=1)
            classifier.fit(X_base, self.ensure_output_format(y_subset))
            self.classifiers.append(classifier)

        return self

    def predict(self, X):
        """Predict labels for X

        X is passed through ``ensure_input_format`` once and the result is
        handed to every per-label classifier.

        :param X: input features
        :type X: dense or sparse matrix (n_samples, n_features)
        :returns: binary indicator matrix with label assignments
        :rtype: sparse matrix of int (n_samples, n_labels)

        """
        # Loop-invariant: convert the input a single time for all labels.
        X_base = self.ensure_input_format(X)

        predictions = [self.ensure_multi_label_from_single_class(
            self.classifiers[label].predict(X_base))
            for label in range(self.model_count)]

        return hstack(predictions)

    def predict_proba(self, X):
        """Predict probabilities of label assignments for X

        X is passed through ``ensure_input_format`` once and the result is
        handed to every per-label classifier.

        :param X: input features
        :type X: dense or sparse matrix (n_samples, n_features)
        :returns: matrix with label assignment probabilities
        :rtype: sparse matrix of float (n_samples, n_labels)
        :raises Exception: if a base classifier returns a probability matrix
            with a column count other than 1 or 2

        """
        # Loop-invariant: convert the input a single time for all labels.
        X_base = self.ensure_input_format(X)
        n_samples = X.shape[0]

        predictions = []
        for label in range(self.model_count):
            binary_prediction = self.classifiers[label].predict_proba(X_base)
            if binary_prediction.shape[1] == 2:
                # Two columns: column 1 is taken as the probability that the
                # label is assigned.
                binary_prediction = self.ensure_multi_label_from_single_class(
                    binary_prediction[:, 1])
            elif binary_prediction.shape[1] == 1:
                # The base classifier only ever saw one class for this label,
                # so the label is either always or never assigned.
                if self.classifiers[label].classes_[0] == 1:
                    binary_prediction = csc_matrix(np.ones((n_samples, 1)))
                else:
                    binary_prediction = csc_matrix((n_samples, 1))
            else:
                raise Exception("Invalid shape of binary prediction")

            predictions.append(binary_prediction)

        return hstack(predictions)

In [None]:
def load_set(s):
    """Load a dataset dump together with its stored fold data.

    :param s: dataset name, inserted into both file paths
    :type s: str
    :returns: tuple ``(data, fold_data)`` where ``data`` is whatever
        ``load_dataset_dump`` returns for ``./dumps/<s>.scikitml.bz2`` and
        ``fold_data`` is the object unpickled from ``./folds/<s>.pickle``
    """
    data = load_dataset_dump('./dumps/{}.scikitml.bz2'.format(s))

    # Pickle streams are binary data: binary mode reads every pickle protocol
    # correctly and avoids newline translation on platforms that apply it.
    # NOTE: pickle.load can execute arbitrary code embedded in the file, so
    # only fold files from a trusted source must be placed in ./folds/.
    with open("./folds/{}.pickle".format(s), "rb") as fp:
        fold_data = pickle.load(fp)

    return data, fold_data

In [None]:
def classify(s, n_splits=10):
    """Run Binary Relevance with random forests over all folds of a dataset.

    For every fold strategy stored in the dataset's fold data and for every
    split, a fresh ``BinaryRelevance(RandomForestClassifier())`` is fitted on
    the training rows; label predictions, label probabilities and the
    wall-clock duration of the split are collected. The three collections are
    pickled together to ``./predictions/br/<s>.pickle``.

    :param s: dataset name, as understood by ``load_set``
    :type s: str
    :param n_splits: number of splits evaluated per fold strategy
    :type n_splits: int
    :returns: None; results are written to disk
    """
    print("{} {} {}".format(s, n_splits, time.time()))
    data, fold_data = load_set(s)
    X = data['X']
    y = data['y']

    predictions = {n: [None for i in range(n_splits)] for n in fold_data}
    probs = {n: [None for i in range(n_splits)] for n in fold_data}
    times = {name: [] for name in fold_data}
    left = len(fold_data) * n_splits

    for name, f in fold_data.items():
        for split in range(n_splits):
            # Average duration over the splits that actually ran. Strategies
            # without any recorded split are left out instead of counting as
            # 0.0, which would drag the estimate down.
            recorded = [d for durations in times.values() for d in durations]
            mean = np.mean(recorded) if recorded else 0.0

            t = time.time()
            # Progress line: estimated end of this split, then estimated end
            # of the whole dataset.
            print("{} {} {} {} {}".format(
                s, name, split,
                datetime.datetime.fromtimestamp(t + mean),
                datetime.datetime.fromtimestamp(t + left * mean)))
            left -= 1

            fold = f[split]
            # NOTE(review): a two-item entry is treated as a (train, test)
            # pair, anything else as the test indices of this split. A fold
            # holding exactly two test indices would be misread — confirm the
            # fold file format rules this out.
            if len(fold) == 2:
                train_idx = fold[0]
                test_idx = fold[1]
            else:
                # Train on the union of all other splits.
                train_idx = list(chain.from_iterable(
                    f[i] for i in range(n_splits) if i != split))
                test_idx = fold

            # construct base forest classifier
            base_classifier = RandomForestClassifier()

            # setup problem transformation approach with sparse matrices for random forest
            classifier = BinaryRelevance(classifier=base_classifier,
                                         require_dense=[False, True])
            classifier.fit(X[train_idx, :], y[train_idx, :])

            predictions[name][split] = classifier.predict(X[test_idx, :])
            probs[name][split] = classifier.predict_proba(X[test_idx, :])

            elapsed = time.time() - t
            times[name].append(elapsed)

    # Pickle streams are binary data, so write them in binary mode.
    with open("./predictions/br/{}.pickle".format(s), "wb") as fp:
        pickle.dump([predictions, probs, times], fp)

In [None]:
# Keep claiming datasets and processing them until none is handed out.
while True:
    s = get_me_set()
    if s is None:
        break
    classify(s)