In [1]:
import jsonpickle
import time, datetime

In [2]:
import uuid
from pylocker import Locker

In [3]:
from skmultilearn.dataset import load_from_arff, load_dataset_dump
import cPickle as pickle
import copy
from itertools import chain
import numpy as np

In [5]:
# Datasets taking part in the experiment, mapped to an integer that is stored
# alongside each dataset in the status file and used by get_me_set() to hand
# out datasets in ascending order.
# NOTE(review): the values look like per-dataset label counts -- confirm.
sets = {
    'bibtex': 159,
    'Corel5k': 374,
    'delicious': 983,
    'genbase': 27,
    'emotions': 6,
    'enron': 53,
    'mediamill': 101,
    'medical': 45,
    'scene': 6,
    'tmc2007-500': 22,
    'yeast': 14,
}
# The five rcv1 subsets all share the same value.
sets.update({'rcv1subset{}'.format(i): 101 for i in range(1, 6)})

In [15]:
# Initialize the experiment: write a fresh status file with one entry per
# dataset, shaped [value from `sets`, claimed flag], every flag False.
# WARNING: the file is opened with "w", so re-running this cell discards any
# record of datasets that were already claimed.
is_done = dict((name, [value, False]) for name, value in sets.items())
with open("./prediction_lp.json", "w") as fp:
    fp.write(jsonpickle.dumps(is_done))

In [16]:
def get_me_set():
    """Claim the next unclaimed dataset from the shared status file.

    The status file ``./prediction_lp.json`` maps each dataset name to a
    two-element list ``[value, claimed]``.  Under a file lock, the unclaimed
    dataset with the smallest ``value`` is marked as claimed, the file is
    rewritten in place, and the dataset's name is returned.

    Returns:
        The name of the claimed dataset, or ``None`` when every dataset in
        the status file is already claimed.

    Raises:
        RuntimeError: if the locker yields no file descriptor (e.g. the lock
            could not be acquired).  Previously this case also returned
            ``None``, which callers could not tell apart from "nothing left
            to do".
    """
    # Create a unique lock pass. This can be any string.
    lpass = str(uuid.uuid1())

    # CHANGE (original author's marker) -- presumably the status-file path is
    # meant to be adapted per experiment; confirm before reusing this cell.
    FL = Locker(filePath="./prediction_lp.json", lockPass=lpass, mode='r+')

    # Acquire the lock; the context manager yields (acquired, code, fd).
    with FL as r:
        acquired, code, fd = r

        if fd is not None:
            a = jsonpickle.loads(fd.read())

            # Unclaimed datasets, ordered by ascending stored value.
            ordered = sorted(a.keys(), key=lambda x: a[x][0])
            pending = [name for name in ordered if a[name][1] is not True]
            if not pending:
                return None

            s = pending[0]
            a[s][1] = True

            # Overwrite the file from the start and drop any leftover tail.
            fd.seek(0)
            fd.write(jsonpickle.dumps(a))
            fd.truncate()
            return s

    # Reached only when no file descriptor was handed out.  Fail loudly so a
    # lock problem is not mistaken for an exhausted work queue.
    raise RuntimeError(
        "could not lock ./prediction_lp.json (acquired={}, code={})".format(acquired, code)
    )

In [17]:
from sklearn.ensemble import RandomForestClassifier
from skmultilearn.problem_transform import LabelPowerset, BinaryRelevance
from skmultilearn.cluster import IGraphLabelCooccurenceClusterer
from skmultilearn.ensemble import LabelSpacePartitioningClassifier



# partition the label space using fastgreedy community detection
# on a weighted label co-occurrence graph with self-loops allowed
#clusterer = IGraphLabelCooccurenceClusterer('fastgreedy', weighted=True,
#    include_self_edges=True)

In [18]:
def load_set(s):
    """Load the dataset dump and the fold definitions for dataset ``s``.

    Args:
        s: dataset name, substituted into both file paths.

    Returns:
        A ``(data, fold_data)`` tuple: ``data`` is whatever
        ``load_dataset_dump`` returns for ``./dumps/<s>.scikitml.bz2`` and
        ``fold_data`` is the object unpickled from ``./folds/<s>.pickle``.
    """
    dump_path = './dumps/{}.scikitml.bz2'.format(s)
    folds_path = "./folds/{}.pickle".format(s)

    data = load_dataset_dump(dump_path)

    # NOTE(review): the file is opened in text mode; this has to match the
    # mode/protocol the fold pickles were written with -- confirm.
    with open(folds_path, "r") as fp:
        fold_data = pickle.load(fp)

    return data, fold_data

In [12]:
def classify(s):
    """Run Label Powerset + random forest predictions over all folds of ``s``.

    Loads the dataset and its fold definitions through ``load_set``.  For
    every fold scheme in the fold data and each of the 10 splits, a fresh
    ``LabelPowerset(RandomForestClassifier)`` is fitted on the training rows
    and its predictions and predicted probabilities for the test rows are
    recorded.  Before each split a progress line is printed with the
    estimated finish time of that split and of the whole dataset.

    Args:
        s: dataset name; selects the input files (see ``load_set``) and the
            name of the output file.

    Side effects:
        Writes ``[predictions, probs, times]`` as a pickle to
        ``./predictions/lp/<s>.pickle``.  All three are dicts keyed by fold
        scheme name: ``predictions`` and ``probs`` hold one entry per split,
        ``times`` holds the wall-clock seconds of each completed split.
    """
    n_splits = 10
    print(" ".join(str(v) for v in (s, n_splits, time.time())))
    data, fold_data = load_set(s)
    X = data['X']
    y = data['y']

    predictions = {n: [None] * n_splits for n in fold_data}
    probs = {n: [None] * n_splits for n in fold_data}
    times = {n: [] for n in fold_data}
    left = len(fold_data) * n_splits
    for name, f in fold_data.items():
        for split in range(n_splits):
            # Average duration over every split completed so far.  Schemes
            # that have not started yet contribute nothing, so they no longer
            # drag the estimate towards zero.
            durations = list(chain.from_iterable(times.values()))
            mean = np.mean(durations) if durations else 0.0

            t = time.time()
            # Progress: expected end of this split, expected end of the set.
            print(" ".join(str(v) for v in (
                s, name, split,
                datetime.datetime.fromtimestamp(t + mean),
                datetime.datetime.fromtimestamp(t + left * mean),
            )))
            left -= 1

            # A fold entry of length 2 is read as a (train, test) pair of
            # index collections; anything else is read as the test indices of
            # this split, with the other splits chained together for training.
            # NOTE(review): a test fold holding exactly two samples would be
            # misread as a pair -- confirm the fold file format.
            fold = f[split]
            if len(fold) == 2:
                train_idx, test_idx = fold[0], fold[1]
            else:
                train_idx = list(chain.from_iterable(
                    f[i] for i in range(n_splits) if i != split))
                test_idx = fold

            # Construct the base forest classifier.
            base_classifier = RandomForestClassifier(n_jobs=10)

            # Set up the problem transformation approach with sparse matrices
            # for the random forest.
            classifier = LabelPowerset(classifier=base_classifier,
                                       require_dense=[False, True])
            classifier.fit(X[train_idx, :], y[train_idx, :])

            # Slice the test rows once and reuse them for both calls.
            X_test = X[test_idx, :]
            predictions[name][split] = classifier.predict(X_test)
            probs[name][split] = classifier.predict_proba(X_test)
            times[name].append(time.time() - t)

    # Results are written once, after all schemes and splits have finished.
    with open("./predictions/lp/{}.pickle".format(s), "w") as fp:
        pickle.dump([predictions, probs, times], fp)

In [None]:
# Worker loop: keep claiming datasets from the shared status file and
# processing them until get_me_set() reports that nothing is left.
while True:
    s = get_me_set()
    if s is None:
        break
    classify(s)