In [None]:
# Create the output directory for the pickled folds; -p keeps this cell
# re-runnable when the directory already exists.
!mkdir -p folds

In [None]:
import cPickle as pickle

In [None]:
from skmultilearn.dataset import load_from_arff, load_dataset_dump
import copy

In [None]:
import datetime

In [None]:
import numpy as np
from scipy.sparse import lil_matrix
from sklearn.model_selection import KFold, StratifiedKFold
import pandas as pd
import copy
from itertools import chain

In [None]:
from builtins import str
from builtins import range
from builtins import object
import arff
import bz2
import numpy as np
import os
import csv
import sys
import shutil
from os import environ
from os.path import dirname
from os.path import join
from os.path import exists
from os.path import expanduser
from os.path import isdir
from os.path import splitext
from os import listdir
from os import makedirs
from scipy import sparse


In [None]:
import jsonpickle

In [None]:
def tsoumakas_fold(n_splits, y):
    """Greedily distribute samples over ``n_splits`` folds, label by label.

    Labels are handled one at a time, always starting with the label that has
    the fewest (non-zero) candidate rows left.  Every not-yet-assigned row of
    that label goes to the fold that still "wants" the most rows of the label;
    ties are broken by the fold that wants the most rows overall, then at
    random.  Rows without any label are spread over the folds that are still
    below their target size.

    Parameters
    ----------
    n_splits : int
        Number of folds to produce.
    y : array-like or sparse matrix
        Label indicator matrix (one row per sample, one column per label);
        anything accepted by ``scipy.sparse.lil_matrix``.

    Returns
    -------
    list of list of int
        ``n_splits`` lists of row indices; together they contain every row
        of ``y`` exactly once.

    Notes
    -----
    Uses the global ``np.random`` state; seed it for reproducible folds.
    """
    y_train = lil_matrix(y)
    n_samples, n_labels = y_train.shape
    percentage_per_fold = [1 / float(n_splits) for _ in range(n_splits)]
    desired_samples_per_fold = np.array([p * n_samples for p in percentage_per_fold])

    folds = [[] for _ in range(n_splits)]

    # Row indices carrying each label.
    samples_with_label = [[] for _ in range(n_labels)]
    for sample, labels in enumerate(y_train.rows):
        for label in labels:
            samples_with_label[label].append(sample)

    desired_samples_per_label_per_fold = {
        i: [len(samples_with_label[i]) * p for p in percentage_per_fold]
        for i in range(n_labels)
    }

    rows_used = {i: False for i in range(n_samples)}

    # Materialised as an array (not a lazy ``map``) so it can be both summed
    # and indexed on Python 2 and Python 3.
    labeled_samples_available = np.array([len(rows) for rows in samples_with_label])
    while labeled_samples_available.sum() > 0:
        # Label with the fewest remaining candidates, ignoring exhausted
        # labels; the first such label wins on ties.
        candidates = np.flatnonzero(labeled_samples_available)
        label = int(candidates[np.argmin(labeled_samples_available[candidates])])

        while len(samples_with_label[label]) > 0:
            row = samples_with_label[label].pop()
            if rows_used[row]:
                # Already placed while processing another of its labels.
                continue

            wanted = np.array(desired_samples_per_label_per_fold[label])
            M = np.where(wanted == wanted.max())[0]
            if len(M) == 1:
                m = M[0]
            else:
                # Tie on the label level: prefer the fold that needs the most
                # samples overall, then pick at random among the rest.
                max_val = max(desired_samples_per_fold[M])
                M_prim = M[desired_samples_per_fold[M] == max_val]
                m = np.random.choice(M_prim, 1)[0]

            folds[m].append(row)
            rows_used[row] = True
            for i in y_train.rows[row]:
                desired_samples_per_label_per_fold[i][m] -= 1
            desired_samples_per_fold[m] -= 1

        labeled_samples_available = np.array([len(rows) for rows in samples_with_label])

    available_samples = [i for i, used in rows_used.items() if not used]
    samples_left = len(available_samples)

    assert (samples_left + sum(map(len, folds))) == n_samples

    # Distribute the rows that carry no label at all.
    while samples_left > 0:
        row = available_samples.pop()
        rows_used[row] = True
        open_folds = np.where(desired_samples_per_fold > 0)[0]
        if len(open_folds) == 0:
            # Defensive fallback: no fold is below target, so take the one
            # that is least over-filled instead of sampling from nothing.
            open_folds = np.array([np.argmax(desired_samples_per_fold)])
        fold_selected = np.random.choice(open_folds, 1)[0]
        folds[fold_selected].append(row)
        # Keep the per-fold target in sync so that a fold which has reached
        # its size stops receiving rows.
        desired_samples_per_fold[fold_selected] -= 1
        samples_left -= 1

    assert sum(map(len, folds)) == n_samples
    assert len([i for i, used in rows_used.items() if not used]) == 0
    return folds

In [None]:
def szymanski_ts_eq_fold(n_splits, y):
    """Greedily distribute samples over ``n_splits`` folds, label pair by label pair.

    For every row all pairs ``(a, b)`` with ``a <= b`` of its labels are
    formed (so each label also appears as the pair ``(a, a)``).  Pairs are
    handled one at a time, starting with the pair that has the fewest
    (non-zero) candidate rows left.  Each row goes to the fold that still
    "wants" the most rows of that pair; ties are broken by the fold that wants
    the most rows overall, then at random.  Rows without any label are spread
    over the folds that are still below their target size.

    Parameters
    ----------
    n_splits : int
        Number of folds to produce.
    y : array-like or sparse matrix
        Label indicator matrix (one row per sample, one column per label);
        anything accepted by ``scipy.sparse.lil_matrix``.

    Returns
    -------
    list of list of int
        ``n_splits`` lists of row indices; together they contain every row
        of ``y`` exactly once.

    Notes
    -----
    Uses the global ``np.random`` state; seed it for reproducible folds.
    """
    y_train = lil_matrix(y)
    n_samples = y_train.shape[0]

    percentage_per_fold = [1 / float(n_splits) for _ in range(n_splits)]
    desired_samples_per_fold = np.array([p * n_samples for p in percentage_per_fold])

    folds = [[] for _ in range(n_splits)]

    # pairs_of_row[r]               -> label pairs present in row r
    # samples_with_labelpairs[pair] -> rows still waiting to be placed for pair
    pairs_of_row = []
    samples_with_labelpairs = {}
    for row, labels in enumerate(y_train.rows):
        pairs = [(a, b) for b in labels for a in labels if a <= b]
        pairs_of_row.append(pairs)
        for p in pairs:
            samples_with_labelpairs.setdefault(p, []).append(row)

    desired_samples_per_labelpair_per_fold = {
        pair: [len(rows) * p for p in percentage_per_fold]
        for pair, rows in samples_with_labelpairs.items()
    }

    # A real list, so pairs can be looked up by position on Python 3 as well.
    labels_of_edges = list(samples_with_labelpairs.keys())
    rows_used = {i: False for i in range(n_samples)}

    def _remaining():
        return np.array([len(samples_with_labelpairs[p]) for p in labels_of_edges])

    labeled_samples_available = _remaining()
    while labeled_samples_available.sum() > 0:
        # Pair with the fewest remaining candidates, ignoring exhausted
        # pairs; the first such pair wins on ties.
        candidates = np.flatnonzero(labeled_samples_available)
        pair = labels_of_edges[candidates[np.argmin(labeled_samples_available[candidates])]]

        while len(samples_with_labelpairs[pair]) > 0:
            row = samples_with_labelpairs[pair].pop()
            if rows_used[row]:
                continue

            wanted = np.array(desired_samples_per_labelpair_per_fold[pair])
            M = np.where(wanted == wanted.max())[0]
            if len(M) == 1:
                m = M[0]
            else:
                # Tie on the pair level: prefer the fold that needs the most
                # samples overall, then pick at random among the rest.
                max_val = max(desired_samples_per_fold[M])
                M_bis = M[desired_samples_per_fold[M] == max_val]
                m = np.random.choice(M_bis, 1)[0]

            folds[m].append(row)
            rows_used[row] = True

            # Only the pairs that occur in this row can be affected, so there
            # is no need to scan every pair of the data set.
            for other in pairs_of_row[row]:
                waiting = samples_with_labelpairs[other]
                if row in waiting:
                    waiting.remove(row)
                desired = desired_samples_per_labelpair_per_fold[other]
                # Demand never drops below zero.
                desired[m] = max(desired[m] - 1, 0)
            desired_samples_per_fold[m] -= 1

        labeled_samples_available = _remaining()

    # Every label ``a`` contributes the pair ``(a, a)``, so all labeled rows
    # have been placed by the loop above; only label-free rows remain.  This
    # is computed after the loop so it is also defined when ``y`` has no
    # labels at all.
    available_samples = [i for i, used in rows_used.items() if not used]
    samples_left = len(available_samples)

    assert (samples_left + sum(map(len, folds))) == n_samples

    while samples_left > 0:
        row = available_samples.pop()
        rows_used[row] = True
        open_folds = np.where(desired_samples_per_fold > 0)[0]
        if len(open_folds) == 0:
            # Defensive fallback: no fold is below target, so take the one
            # that is least over-filled instead of sampling from nothing.
            open_folds = np.array([np.argmax(desired_samples_per_fold)])
        fold_selected = np.random.choice(open_folds, 1)[0]
        folds[fold_selected].append(row)
        # Keep the per-fold target in sync so that a fold which has reached
        # its size stops receiving rows.
        desired_samples_per_fold[fold_selected] -= 1
        samples_left -= 1

    assert sum(map(len, folds)) == n_samples
    assert len([i for i, used in rows_used.items() if not used]) == 0
    return folds




def standard_kfolds(n_splits, y):
    """Return the test-index sets of a plain, unshuffled ``KFold`` split.

    Parameters
    ----------
    n_splits : int
        Number of folds.
    y : object with a ``shape`` attribute
        Only ``y.shape[0]`` (the number of samples) is needed to build the
        placeholder feature array; ``y`` itself is passed to ``KFold.split``.

    Returns
    -------
    list
        One entry per fold holding the test indices yielded by ``KFold.split``.
    """
    splitter = KFold(n_splits, shuffle=False)
    placeholder_features = np.zeros(y.shape[0])
    test_folds = []
    for _train_idx, test_idx in splitter.split(placeholder_features, y):
        test_folds.append(test_idx)
    return test_folds

class Transfomer:
    """Encodes each distinct label combination of a multi-label matrix as one class id."""

    def transform_to_multiclass(self, y):
        """Map every row of ``y`` to the integer id of its label combination.

        Ids are handed out in order of first appearance, starting at 0.

        Parameters
        ----------
        y : object exposing ``shape`` and ``rows``
            ``y.rows`` is iterated and each item is treated as the collection
            of labels of one sample (as ``scipy.sparse.lil_matrix`` provides).

        Returns
        -------
        list of int
            Class id per row, in row order.

        Side effects
        ------------
        Resets and fills ``label_count``, ``unique_combinations`` (combination
        string -> id), ``reverse_combinations`` (id -> labels) and ``last_id``
        (number of ids handed out).
        """
        self.label_count = y.shape[1]
        self.unique_combinations = {}
        self.reverse_combinations = []
        self.last_id = 0

        class_ids = []
        for row_labels in y.rows:
            key = ",".join(str(label) for label in row_labels)
            class_id = self.unique_combinations.get(key)
            if class_id is None:
                # First time this combination is seen: register a new id.
                class_id = self.last_id
                self.unique_combinations[key] = class_id
                self.reverse_combinations.append(row_labels)
                self.last_id = class_id + 1
            class_ids.append(class_id)
        return class_ids
    
def stratified_folds(n_splits, y):
    """Return the test-index sets of an unshuffled ``StratifiedKFold`` split.

    Each row's label combination is first turned into a single class id via
    ``Transfomer.transform_to_multiclass``; stratification is done on those
    ids.

    Parameters
    ----------
    n_splits : int
        Number of folds.
    y : object exposing ``shape`` and ``rows``
        Label matrix; must provide what ``Transfomer.transform_to_multiclass``
        reads.

    Returns
    -------
    list
        One entry per fold holding the test indices yielded by
        ``StratifiedKFold.split``.
    """
    splitter = StratifiedKFold(n_splits=n_splits, random_state=None, shuffle=False)
    placeholder_features = np.zeros(y.shape[0])
    combination_ids = Transfomer().transform_to_multiclass(y)
    return [test_idx for _train_idx, test_idx in splitter.split(placeholder_features, combination_ids)]

In [None]:
# Data sets to process. Keys are the names substituted into the
# './dumps/<name>.scikitml.bz2' and './folds/<name>.pickle' paths; the integer
# values are used as the sort key that fixes the processing order (ascending).
# NOTE(review): the values look like the number of labels per data set --
# confirm against the dumps.
sets = {
    'bibtex': 159,
    'Corel5k': 374,
    'delicious': 983,
    'genbase': 27,
    'emotions': 6,
    'enron': 53,
    'mediamill': 101,
    'medical': 45,
    'scene': 6,
    'tmc2007-500': 22,
    'yeast': 14,
    'rcv1subset1': 101,
    'rcv1subset2': 101,
    'rcv1subset3': 101,
    'rcv1subset4': 101,
    'rcv1subset5': 101,
}

In [None]:
def save_dataset_dump(filename, input_space, labels, feature_names, label_names):
    """Pickle a data set into a bz2-compressed file.

    The dump is a dict with the keys ``'X'``, ``'y'``, ``'features'`` and
    ``'labels'`` holding the four data arguments in that order.

    Parameters
    ----------
    filename : string
        Path of the dump file; ``.bz2`` is appended when the path does not
        already end with it.
    input_space : array-like of array-likes
        Input feature vectors, stored under ``'X'``.
    labels : array-like of binary label vectors
        Label indicator vectors for each input vector, stored under ``'y'``.
    feature_names
        Stored unchanged under ``'features'``.
    label_names
        Stored unchanged under ``'labels'``.
    """
    target_path = filename if filename.endswith('.bz2') else filename + ".bz2"

    payload = {
        'X': input_space,
        'y': labels,
        'features': feature_names,
        'labels': label_names,
    }

    with bz2.BZ2File(target_path, "wb") as file_handle:
        pickle.dump(payload, file_handle)

In [None]:
n_splits = 10

# Make sure the output directory exists so this cell also works on a fresh
# checkout or after the directory was removed.
if not exists("./folds"):
    makedirs("./folds")

# Process the data sets in ascending order of the value stored in `sets`.
# NOTE: the fold generators draw from the global np.random state, which is
# not seeded here, so the 'IS' and 'SOIS' folds differ between runs.
for s in sorted(sets.keys(), key=lambda x: sets[x]):
    # Progress log: data set name and start time (valid on Python 2 and 3).
    print("{} {}".format(s, datetime.datetime.now()))
    data = load_dataset_dump('./dumps/{}.scikitml.bz2'.format(s))
    y = data['y']

    # One list of folds per splitting strategy.
    folds = {
        'IS': tsoumakas_fold(n_splits, y),
        'kfold': standard_kfolds(n_splits, y),
        'SOIS': szymanski_ts_eq_fold(n_splits, y),
        'stratified': stratified_folds(n_splits, y)
    }

    # Pickle output is binary data, so the file must be opened in "wb" mode.
    with open("./folds/{}.pickle".format(s), "wb") as fp:
        pickle.dump(folds, fp)