In [2]:
import numpy as np
import os
import pandas as pd
from collections import Counter

In [96]:
def load_data(fPath):
    df = pd.read_csv(fPath, header = None, skipinitialspace = True)
    # get classes to numpy array
    data = df.iloc[:,:].to_numpy().copy()
    
    return data

def count_values_from_frame(data):
    # get columns of frame
    columns = list(data)
    featureCounts = dict()
    # Count appearences of elements in all columns
    for c in columns:
        featureCounts[c] = df[c].value_counts()
    #print(featureCounts)
    return

def count_features_per_class(data, normalize = True):
    classes = np.unique(data[:,-1])
    # Create a list of dictionaries, one for each feature. Each dictionary contains an entry for each unique feature value(key)
    # and a list that contains the freq count for each class.
    counterDict = []#[{} for x in range(data.shape[1]-1)]
    #
    classStats = np.zeros(len(classes))
    for j, c in enumerate(classes):
        for f in range(data.shape[1]-1):           
            allKeys = np.unique(data[:,f])
            #print(allKeys)
            if len(counterDict) < f+1:
                counterDict.append(dict( (a, [0 for c in range(0, len(classes))]) for a in allKeys))
            
            keys, freqs = (np.unique(data[[data[:,-1] == c]][:,f], return_counts = True))
            for i,k in enumerate(keys):
                counterDict[f][k][j] = freqs[i] 
        # Compute per class frequencies, for comptueting info gain
        for f in range(data.shape[0]):
            if data[f,-1] == c:
                classStats[j] += 1
   
    return counterDict, classStats/data.shape[0]
    
def comp_feat_entropy(dataDicts, epsilon = 0.001, debug = False):
    """
        ARGUMENTS: dataDict (list of dicts): A list holding one dictionary per feature (attribute). Each discionary holds an entry with a key for each unique variable.
                                             Each variable's (key's) value is a list containing the accurencies each variable appears in each class. So the struct
                   epsilon (float):          float guarding against log(0) 
    """
    total = 0
    #print(dataDict[0])
    # Iterate over features
    featEntropy = []
    for i, feature in enumerate(dataDicts):
        if debug: print(feature)
        a = np.array([feature[k] for k in feature.keys()])
        total += np.sum(a)
        if debug: print(a, a.sum())
        # Compute the percantage of the population each value represents. i.e if a appears in half the samples then a_frac=0.5
        featFracs  = [np.sum(f)/total for f in a]
        # Compute the percentage that each value appears in each class. i.e if a is 25% in class 1 and 75% in class 2, then a_classFracs = [0.25, 0.75]
        classFracs = np.array([f/np.sum(f) for f in a])
        # Compute class entropy if wfor spliting on each appearring value. Epsilon guards against the log2(0) case
        classEntropy = np.array([-np.sum(((f[:]+epsilon) * np.log2(f[:]+epsilon))) for f in classFracs])
        # Compute resulting conditional class entropy for this feature H(S|a) by taking the expectation over all Entropy for all apeparing values.
        featEntropy.append(np.sum(featFracs * classEntropy))
        total = 0
        if debug: 
            print(featFracs)
            print(classFracs)
            print(classEntropy)
            print("\n", featEntropy)
    return np.array(featEntropy)
        
def compute_feat_info_gain(dataDict, classDist, epsilon = 0.0001, debug = False):
    # Compute all conditional Entropies for all features
    featEntropy = comp_feat_entropy(dataDict)    
    # Compute aggregate class entropy before any featue split
    classEntropy = -np.sum((classDist[:]) * np.log2(classDist[:]))
    # Compute Information gain I(S;A) for all attributes A
    infoGain = classEntropy - featEntropy
    if debug: print(infoGain)
    return infoGain
    
def compute_attribure_entropy():
    a =0 


In [64]:
fPath = "dummy2.txt"
data = load_data(fPath)
print(data.shape)
print(data)
print(data[data[:,-1] == 'Yes'])

(6, 5)
[['a' 'b' 'c' 'd' 'No']
 ['c' 'f' 'g' 'd' 'No']
 ['a' 'b' 'c' 'd' 'Yes']
 ['z' 'w' 'e' 'e' 'No']
 ['a' 'c' 'e' 'f' 'No']
 ['v' 'b' 'n' 'f' 'Yes']]
[['a' 'b' 'c' 'd' 'Yes']
 ['v' 'b' 'n' 'f' 'Yes']]


In [65]:
dataDict, classDist = count_features_per_class(data)
print(dataDict, classDist)

[{'a': [2, 1], 'c': [1, 0], 'v': [0, 1], 'z': [1, 0]}, {'b': [1, 2], 'c': [1, 0], 'f': [1, 0], 'w': [1, 0]}, {'c': [1, 1], 'e': [2, 0], 'g': [1, 0], 'n': [0, 1]}, {'d': [2, 1], 'e': [1, 0], 'f': [1, 1]}] [0.66666667 0.33333333]


  keys, freqs = (np.unique(data[[data[:,-1] == c]][:,f], return_counts = True))


In [94]:
# Proof of concept debug
featEntropy = comp_feat_entropy(dataDict)
print(featEntropy)

[0.46304975 0.46304975 0.33871882 0.7932462 ]


In [89]:
featureInfoGain = compute_feat_info_gain(dataDict, classDist)
print(featureInfoGain)
order = np.argsort(featureInfoGain)
print(order, featureInfoGain)
print("Most Informative feature: ", order[-1], " with Info gain: ", featureInfoGain[order[-1]])
print("Complete order:",*["Feature {}, Info Gain: {}".format(i, featureInfoGain[i]) for i in order[::-1]],sep = '\n')

[0.45524609 0.45524609 0.57957701 0.12504963]
[3 0 1 2] [0.45524609 0.45524609 0.57957701 0.12504963]
Most Informative feature:  2  with Info gain:  0.5795770131468309
Complete order:
Feature 2, Info Gain: 0.5795770131468309
Feature 1, Info Gain: 0.45524608718033877
Feature 0, Info Gain: 0.45524608718033877
Feature 3, Info Gain: 0.12504963440926198


In [82]:
def result_to_str(featureInfoGain, order):
    ''' DESCRIPTION: Format return string
    '''
    retStr = ''.join((['Feature: {}, Info Gain: {}\n'.format(i, featureInfoGain[i]) for i in order[::-1]]))
    return retStr

def write_string_to_file(s, wFile = 'results3.txt'):
    ''' DESCRIPTION: Write string to target file.
    '''
    try:
        with open(wFile, 'w') as wf:
            wf.write(s)
    except Exception as e:
        print(e)

In [93]:
retStr = result_to_str(featureInfoGain, order)
print("Complete order:", retStr, sep='\n')

Complete order:
Feature: 2, Info Gain: 0.5795770131468309
Feature: 1, Info Gain: 0.45524608718033877
Feature: 0, Info Gain: 0.45524608718033877
Feature: 3, Info Gain: 0.12504963440926198

