In [1]:
import csv
import numpy as np
from scipy.stats import rankdata
from pandas import read_csv, DataFrame
from itertools import product, chain, starmap, combinations, combinations_with_replacement
from time import time

# Wall-clock start; read at the end of the script to report the elapsed time.
time0 = time()

# Size of each tuple of feature rows that is evaluated together.
k = 2
# Number of thresholds used by discretize(); values fall into divisions + 1 bins.
divisions = 1
# Scale of the random perturbation applied to the bin widths in discretize()
# (0 means all bins get the same width in rank space).
range_ = 0.00
# Seed for the random numbers that perturb the bin widths.
seed = 123

# 1. Function definitions

def discretize(seq, divisions=divisions, range_=range_, seed=seed):
    '''
    Map a numeric sequence to bin indices according to the ranks of its values.

    The values are ranked (ties are broken by order of appearance), then
    `divisions` thresholds are placed in rank space. The bin index of an
    element is the number of thresholds its rank strictly exceeds, so the
    result only contains the values 0, 1, ..., divisions.

    Parameters:
    seq       -> 1-D sequence of numbers
    divisions -> number of thresholds (divisions + 1 bins)
    range_    -> relative random perturbation of the bin widths; each of the
                 divisions + 1 widths is drawn uniformly from
                 [1 - range_, 1 + range_) before normalisation, so 0 gives
                 equally wide bins
    seed      -> seed of the random generator used for the perturbation

    Returns: float64 array with the same length as seq.

    >>> discretize([3, 4, 1, 8, 13, 8], divisions=4, range_=0, seed=123)
    array([1., 2., 0., 3., 4., 4.])

    where
    ranks = [2, 3, 1, 4, 6, 5]
    thresholds = [1.2, 2.4, 3.6, 4.8]
    '''
    # A private generator yields the same numbers as np.random.seed(seed)
    # followed by np.random.random(...), but leaves the global NumPy random
    # state untouched for the rest of the program.
    rng = np.random.RandomState(seed)
    ranks = rankdata(seq, method='ordinal')

    # Unnormalised bin widths, then their running total.
    widths = range_ * (2 * rng.random_sample(divisions + 1) - 1) + np.ones(divisions + 1)
    random_blocks = np.cumsum(widths)
    # Rescale the inner bin edges so that the last edge would equal len(seq).
    thresholds = random_blocks[:-1] / random_blocks[-1] * len(seq)

    discrete_seq = np.zeros(len(seq), dtype='float64')
    for threshold in thresholds:
        discrete_seq[ranks > threshold] += 1
    return discrete_seq

# Batched form of discretize: the '(n)->(n)' signature makes it run on each
# 1-D slice along the last axis; the three keyword options are passed through
# unchanged instead of being vectorized over.
discretize_vec = np.vectorize(
    discretize,
    excluded={'divisions', 'range_', 'seed'},
    signature='(n)->(n)',
)

# 2. Read the data

# Input table. With QUOTE_NONNUMERIC every unquoted field is converted to
# float by the reader, so the file must not contain unquoted text.
file = "my_df_2.csv"
# newline='' is how the csv module expects its input files to be opened; it
# lets the reader handle line endings (also inside quoted fields) itself.
with open(file, newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',',
                        quoting=csv.QUOTE_NONNUMERIC)
    data = list(reader)

# Transpose so that every row of `data` is one CSV column, and drop the last
# CSV column.
data = np.array(data, dtype='float64').T[:-1]
# The last remaining row holds the class labels; all rows before it are
# features and get replaced by their discretized version.
data[:-1] = discretize_vec(data[:-1])
data = data.astype('int64')

# Distinct class labels and how often each occurs.
labels, counts = np.unique(data[-1], return_counts=True)
n_classes = len(labels)

# Base pseudo-count used by work() when initialising contingency tables.
xi = 1e-5
label_counts = {int(label): label_count for label, label_count in zip(labels, counts)}
min_count = np.min(counts)

# dim0: number of feature rows, dim1: number of samples.
dim0, dim1 = data[:-1].shape

# 3. More function definitions

def jobs_generator(k=k, dim0=dim0):
    '''
    Enumerate the jobs: every k-element combination of the indices 0..dim0-1.

    Returns the iterator built by itertools.combinations, so each job is a
    tuple of strictly increasing indices (no index repeats inside a tuple)
    and the tuples come in lexicographic order.
    E.g. for k=2:
    (0, 1), (0, 2), ..., (0, dim0-1), (1, 2), ..., (1, dim0-1), ..., (dim0-2, dim0-1)
    '''
    candidate_indices = range(dim0)
    return combinations(candidate_indices, k)

def neg_H(p):
    '''
    Elementwise p * log2(p), the negated entropy contribution of each entry.

    p -> scalar or array-like of non-negative numbers
    Returns a NumPy scalar or array shaped like p.

    Entries equal to 0 give 0 (the limit of p * log2(p) for p -> 0) rather
    than nan with a divide-by-zero warning. Negative entries still give nan.
    '''
    p = np.asarray(p)
    # Feed 1 to the logarithm wherever p is 0: log2(1) == 0, so the product is
    # exactly 0 there, while all other entries are computed as before.
    log_arg = np.where(p == 0, 1, p)
    return p * np.log2(log_arg)

def neg_H_cond(matrix):
    '''
    Sum of neg_H over all cells of `matrix` minus the sum of neg_H over its
    marginals along the last axis.

    Algebraically this is sum(n * log2(n / m)), where n is a cell and m is
    the total of the cell's slice along the last axis.
    '''
    cell_term = np.sum(neg_H(matrix))
    last_axis_totals = np.sum(matrix, axis=-1)
    marginal_term = np.sum(neg_H(last_axis_totals))
    return cell_term - marginal_term

def work(indeces):
    '''
    Work-function: score every feature of one tuple of feature rows.

    indeces -> tuple of row indices into `data` (the feature rows)
    Output: {index: (score, indeces), ...} with one entry per index in
    `indeces`.

    The score of an index is neg_H_cond of the full contingency table minus
    neg_H_cond of the table with that index's axis summed out.
    '''
    # One axis per selected feature plus a final axis for the class label.
    # The rank follows the tuple that was actually passed in, so tuples of
    # any length work, not only those of the global size k.
    n_vars = len(indeces)
    # contingency-matrix: begin with pseudo-counts, scaled per class by how
    # frequent the class is relative to the rarest one
    contingency_m = np.ones([divisions + 1] * n_vars + [len(labels)], dtype='float64')
    for label, count in label_counts.items():
        # NOTE(review): the label value is used as a position on the last
        # axis, which assumes labels are exactly 0..len(labels)-1 -- confirm.
        contingency_m[..., label] *= xi * (count / min_count)

    # contingency-matrix: normal counts, one increment per sample
    for c_index in data[list(indeces) + [-1]].T:
        contingency_m[tuple(c_index)] += 1

    # The term for the full table is the same for every index: compute it once.
    full_term = neg_H_cond(contingency_m)

    results = {}
    for i, index in enumerate(indeces):
        result = full_term - neg_H_cond(np.sum(contingency_m, axis=i))
        results[index] = (result, indeces)
    return results


def record(results, records):
    '''
    Merge the output of one work() call into the global best-score table.

    results -> dict {index: (score, tuple)} from the work-function
    records -> dict of the same layout, updated in place

    An index is stored when it is new or when its score is strictly higher
    than the stored one; on equal scores the stored entry is kept.
    '''
    for key, candidate in results.items():
        # Skip candidates that do not beat the entry we already hold.
        if key in records and not candidate[0] > records[key][0]:
            continue
        records[key] = candidate


# Best entry found so far for every feature index; filled in by record().
final_results = {}

# Score every tuple of feature rows and keep, per feature index, the
# highest-scoring tuple it took part in. Each intermediate result is printed.
for job in jobs_generator():
    results = work(job)
    print(results)
    record(results, final_results)

# Result: elapsed wall-clock time since time0, then a table with one row per
# feature index holding its best score ('IG_max') and the tuple that achieved
# it ('tuple'). The table is a bare expression, so it is only displayed when
# run as the last statement of an interactive / notebook cell.
print("Finished in", time()-time0, "sec.")
DataFrame(final_results).T.rename(columns={0: 'IG_max', 1: 'tuple'})

{0: (1.9994054532904126, (0, 1)), 1: (1.9994054532904126, (0, 1))}
Finished in 0.004632711410522461 sec.


Unnamed: 0,IG_max,tuple
0,1.99941,"(0, 1)"
1,1.99941,"(0, 1)"
