In [1]:
!pip install iteround
!pip install pairing 
!pip install scikit-multilearn
!pip install arff
!pip install category_encoders
!pip install matplotlib
!pip install tensorflow
!pip install keras





In [2]:
## Basics
import os
import random
import pickle
import iteround
import itertools
import numpy as np
from scipy import special
from sklearn.cluster import KMeans, MiniBatchKMeans

class Hasher():
    """
    Build, persist and reload a clustering-based "hasher" over histograms.

    Every histogram with `context_size` bins whose entries are multiples of
    1/10**dec_digits and sum to 1 is enumerated, then the histograms are
    clustered into 2**bin_size groups. The fitted clustering model together
    with a label re-indexing array forms the hasher.

    comb_algo : "rec", "prod"
        How the histograms are enumerated: "rec" uses a pruned recursion,
        "prod" filters the full Cartesian product (much slower, same result).

    cluster_algo: "kmean", "mbkmean"
        "kmean" fits sklearn's KMeans, "mbkmean" fits MiniBatchKMeans.
    """
    # These run once, when the class body is executed (i.e. at definition time).
    random.seed(0)
    np.random.seed(0)
    os.environ['PYTHONHASHSEED']=str(0)

    # Directory where fitted models and re-indexers are stored.
    _SAVE_DIR = "encoders_repo"
    _COMB_ALGOS = ("rec", "prod")
    _CLUSTER_ALGOS = ("kmean", "mbkmean")

    def __init__(self, comb_algo = "rec", cluster_algo= "kmean"):
        self.comb_algo = comb_algo
        self.cluster_algo = cluster_algo

    @staticmethod
    def _artifact_paths(context_size, bin_size, dec_digits):
        """Return (model_path, re_indexer_path) for the given hasher configuration."""
        f_name = "hasher_"+str(context_size)+"_"+str(dec_digits)+"_"+str(bin_size)
        model_path = os.path.join(Hasher._SAVE_DIR, "kmeans_"+f_name+".pkl")
        re_indexer_path = os.path.join(Hasher._SAVE_DIR, "re_indexer_"+f_name+".npy")
        return model_path, re_indexer_path

    @staticmethod
    def _normalize_rows(counts):
        """Divide every row by its own sum so that each row sums to 1."""
        return counts/counts.sum(axis=1)[:, np.newaxis]

    def combinations_prod(self, n, tot):
        """
        Enumerate all length-`n` tuples of integers in [0, tot] that sum to
        `tot` by filtering the Cartesian product, then normalize each row.

        Returns a 2-D float array with one normalized histogram per row, in
        lexicographic order of the underlying integer tuples.
        """
        #### src: https://codereview.stackexchange.com/questions/190122/permutations-with-a-sum-constraint
        # Filter lazily: only the matching tuples are kept in memory, and no
        # pandas round-trip is needed to obtain an array.
        rows = [combo for combo in itertools.product(range(tot + 1), repeat=n)
                if sum(combo) == tot]
        # reshape keeps the array 2-D even when no tuple matches.
        res = np.array(rows, dtype=float).reshape(len(rows), n)
        return self._normalize_rows(res)


    def combinations_recursive(self, n, tot):
        """
        Enumerate all length-`n` lists of integers in [0, tot] that sum to
        `tot` with a pruned recursion, then normalize each row.

        Returns a 2-D array with one normalized histogram per row, in
        lexicographic order of the underlying integer lists.
        """
        #### src: https://codereview.stackexchange.com/questions/190122/permutations-with-a-sum-constraint
        def combinations_recursive_inner(buf, gaps, tsum, accum):
            # buf: values chosen so far; gaps: slots still to fill;
            # tsum: sum of buf; accum: output list collecting complete rows.
            if gaps == 0:
                accum.append(list(buf))
            else:
                for x in range(0, tot+1):
                    # Even if every remaining slot took `tot`, the target
                    # could not be reached: try a larger x.
                    if tsum + x + (gaps - 1) * tot < tot:
                        continue
                    # Target exceeded; larger x only makes it worse.
                    if tsum + x > tot:
                        break
                    combinations_recursive_inner(buf + [x], gaps - 1, tsum + x, accum)

        res = []
        combinations_recursive_inner([], n, 0, res)
        res = np.array(res)
        return self._normalize_rows(res)


    def _fit_clusterer(self, all_hists, bin_size):
        """Fit the clustering model selected by `self.cluster_algo` on `all_hists`."""
        n_clusters = 2**bin_size
        if self.cluster_algo == "kmean":
            try:
                # Older scikit-learn releases accept n_jobs for parallel runs.
                model = KMeans(n_clusters=n_clusters,
                               n_jobs = -1,
                               random_state=0)
            except TypeError:
                # Newer scikit-learn releases removed the n_jobs argument.
                model = KMeans(n_clusters=n_clusters,
                               random_state=0)
        else:
            model = MiniBatchKMeans(n_clusters=n_clusters,
                                    batch_size = bin_size*bin_size,
                                    init_size=n_clusters,
                                    n_init=bin_size,
                                    random_state=0)
        return model.fit(all_hists)


    def build_hasher(self, context_size, bin_size, dec_digits=1, saving=True):
        """
        Enumerate all histograms, cluster them and optionally persist the result.

        context_size : number of bins in each histogram.
        bin_size     : the number of clusters is 2**bin_size.
        dec_digits   : histogram entries are multiples of 1/10**dec_digits.
        saving       : when True, write the fitted model (pickle) and the
                       re-indexer (.npy) under the "encoders_repo" directory.

        Returns (kmeans, re_indexer), where re_indexer[label] is the rank of
        cluster `label` when clusters are sorted by ascending centre sum.

        Raises ValueError if `comb_algo` or `cluster_algo` is not a supported
        option.
        """
        # Fail early with a clear message instead of an UnboundLocalError later.
        if self.comb_algo not in self._COMB_ALGOS:
            raise ValueError("comb_algo must be one of %s, got %r"
                             % (self._COMB_ALGOS, self.comb_algo))
        if self.cluster_algo not in self._CLUSTER_ALGOS:
            raise ValueError("cluster_algo must be one of %s, got %r"
                             % (self._CLUSTER_ALGOS, self.cluster_algo))

        ### src:https://en.wikipedia.org/wiki/Stars_and_bars_(combinatorics)
        tot = 10**dec_digits
        num_tot_hsits = special.comb((tot+context_size-1), context_size-1)
        print("Total number of possible contexts(histograms):", num_tot_hsits)
        if self.comb_algo == "rec":
            all_hists = self.combinations_recursive(context_size, tot)
        else:
            all_hists = self.combinations_prod(context_size, tot)

        #### To clustering histograms
        print("Clustering...")
        kmeans = self._fit_clusterer(all_hists, bin_size)

        #### Rank cluster labels by ascending sum of their centre coordinates
        # NOTE(review): every histogram row is normalized to sum to 1, so the
        # centre sums are presumably all ~1 and this order may be decided by
        # floating-point noise only -- confirm the intended sort key.
        idx = np.argsort(kmeans.cluster_centers_.sum(axis=1))
        re_indexer = np.zeros_like(idx)
        re_indexer[idx] = np.arange(2**bin_size)

        if saving:
            print("Saving...")
            model_path, re_indexer_path = self._artifact_paths(context_size, bin_size, dec_digits)
            os.makedirs(self._SAVE_DIR, exist_ok=True)
            with open(model_path, 'wb') as fid:
                pickle.dump(kmeans, fid)
            np.save(re_indexer_path, re_indexer)

        print("Completed!")
        return kmeans, re_indexer

    def get_hasher(self, context_size, bin_size, dec_digits=1):
        """
        Load a hasher previously written by `build_hasher(..., saving=True)`.

        Returns (kmeans, re_indexer) read from the "encoders_repo" directory
        for the given `context_size`, `bin_size` and `dec_digits`.
        """
        model_path, re_indexer_path = self._artifact_paths(context_size, bin_size, dec_digits)
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load model files that were produced locally or come from a trusted source.
        with open(model_path, 'rb') as fid:
            kmeans = pickle.load(fid)
        re_indexer = np.load(re_indexer_path)

        return kmeans, re_indexer

In [3]:
## Pass cluster_algo="kmean" instead if computation resources are not a concern
## and a slightly more accurate encoding is wanted.
hasher = Hasher(cluster_algo="mbkmean", comb_algo="rec")
# The call is the cell's last expression, so its return value is displayed.
hasher.build_hasher(10, 10, dec_digits=1, saving=True)
# To reload a stored hasher instead of rebuilding it:
#hasher.get_hasher(context_size, bin_size,dec_digits)

Total number of possible contexts(histograms): 92378.0
Clustering...
Saving...
Completed!


(MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
                 init_size=1024, max_iter=100, max_no_improvement=10,
                 n_clusters=1024, n_init=10, random_state=0,
                 reassignment_ratio=0.01, tol=0.0, verbose=0),
 array([175, 664, 665, ..., 373, 829, 868]))

### What is the Next Step?

Build as many **encoders** as you need, one for each desired combination of `context_size`, `bin_size`, and `dec_digits`. Then you can run one of the following notebooks:
* 2_synthetic_exp.ipynb
* 2_mlc_exp.ipynb
* 2_criteo_exp.ipynb