In [106]:
import numpy as np
import os
import pandas as pd
from collections import Counter

In [132]:
def load_data(fPath):
    df = pd.read_csv(fPath, header = None, skipinitialspace = True)
    # get classes to numpy array
    data = df.iloc[:,:].to_numpy().copy()
    
    return data

def count_values_from_frame(data):
    # get columns of frame
    columns = list(data)
    featureCounts = dict()
    # Count appearences of elements in all columns
    for c in columns:
        featureCounts[c] = df[c].value_counts()
    #print(featureCounts)
    return

def count_features_per_class(data, normalize = True):
    classes = np.unique(data[:,-1])
    # Create a list of dictionaries, one for each feature. Each dictionary contains an entry for each unique feature value(key)
    # and a list that contains the freq count for each class.
    counterDict = []#[{} for x in range(data.shape[1]-1)]
    #
    classStats = np.zeros(len(classes))
    for j, c in enumerate(classes):
        for f in range(data.shape[1]-1):           
            allKeys = np.unique(data[:,f])
            #print(allKeys)
            if len(counterDict) < f+1:
                counterDict.append(dict( (a, [0 for c in range(0, len(classes))]) for a in allKeys))
            
            keys, freqs = (np.unique(data[[data[:,-1] == c]][:,f], return_counts = True))
            for i,k in enumerate(keys):
                counterDict[f][k][j] = freqs[i] 
        # Compute per class frequencies, for comptueting info gain
        for f in range(data.shape[0]):
            if data[f,-1] == c:
                classStats[j] += 1
   
    return counterDict, classStats/data.shape[0]
    
def comp_feat_entropy(dataDicts, epsilon = 0.001, debug = False):
    """
        ARGUMENTS: dataDict (list of dicts): A list holding one dictionary per feature (attribute). Each discionary holds an entry with a key for each unique variable.
                                             Each variable's (key's) value is a list containing the accurencies each variable appears in each class. So the struct
                   epsilon (float):          float guarding against log(0) 
    """
    total = 0
    #print(dataDict[0])
    if type(dataDicts) is not list:
        dataDicts = [dataDicts]
    # Iterate over features
    featEntropy = []
    for i, feature in enumerate(dataDicts):
        if debug: print(feature)
        a = np.array([feature[k] for k in feature.keys()])
        total += np.sum(a)
        if debug: print(a, a.sum())
        # Compute the percantage of the population each value represents. i.e if a appears in half the samples then a_frac=0.5
        featFracs  = [np.sum(f)/total for f in a]
        # Compute the percentage that each value appears in each class. i.e if a is 25% in class 1 and 75% in class 2, then a_classFracs = [0.25, 0.75]
        classFracs = np.array([f/np.sum(f) for f in a])
        # Compute class entropy if wfor spliting on each appearring value. Epsilon guards against the log2(0) case
        classEntropy = np.array([-np.sum(((f[:]+epsilon) * np.log2(f[:]+epsilon))) for f in classFracs])
        # Compute resulting conditional class entropy for this feature H(S|a) by taking the expectation over all Entropy for all apeparing values.
        featEntropy.append(np.sum(featFracs * classEntropy))
        total = 0
        if debug: 
            print(featFracs)
            print(classFracs)
            print(classEntropy)
            print(featEntropy)
    return np.array(featEntropy)
        
def compute_feat_info_gain(dataDict, classDist, epsilon = 0.0001, debug = False):
    # Compute all conditional Entropies for all features
    featEntropy = comp_feat_entropy(dataDict)    
    # Compute aggregate class entropy before any featue split
    classEntropy = -np.sum((classDist[:]) * np.log2(classDist[:]))
    # Compute Information gain I(S;A) for all attributes A
    infoGain = classEntropy - featEntropy
    if debug: print(infoGain)
    return infoGain
    
def compute_attribure_entropy():
    a =0 


In [133]:
fPath = "B:/Workspaces/courses/CS580/assignments-1-2-3-testData/assign-2-testData/2-data-1.txt"#"dummy2.txt"
data = load_data(fPath)
print(data.shape)
print(data)
print(data[data[:,-1] == 'Yes'])

(20, 6)
[['young' 'FALSE' 'FALSE' 'fair' 'x' 'No']
 ['young' 'FALSE' 'FALSE' 'good' 'x' 'Mix']
 ['young' 'TRUE' 'SURE' 'good' 'x' 'Yes']
 ['young' 'TRUE' 'TRUE' 'fair' 'y' 'Yes']
 ['young' 'Maybe' 'FALSE' 'fair' 'x' 'No']
 ['middle' 'FALSE' 'FALSE' 'fair' 'x' 'Mix']
 ['middle' 'FALSE' 'FALSE' 'good' 'y' 'No']
 ['middle' 'TRUE' 'TRUE' 'good' 'z' 'Yes']
 ['middle' 'FALSE' 'SURE' 'excellent' 'y' 'Yes']
 ['middle' 'FALSE' 'TRUE' 'excellent' 'y' 'Mix']
 ['old' 'Maybe' 'TRUE' 'excellent' 'y' 'Yes']
 ['old' 'FALSE' 'TRUE' 'good' 'x' 'Mix']
 ['old' 'TRUE' 'FALSE' 'good' 'z' 'Yes']
 ['old' 'TRUE' 'FALSE' 'excellent' 'x' 'Mix']
 ['old' 'Maybe' 'FALSE' 'fair' 'y' 'No']
 ['teenager' 'TRUE' 'TRUE' 'good' 'x' 'Mix']
 ['teenager' 'FALSE' 'SURE' 'good' 'y' 'No']
 ['teenager' 'TRUE' 'TRUE' 'excellent' 'z' 'Yes']
 ['teenager' 'Maybe' 'TRUE' 'excellent' 'x' 'Mix']
 ['teenager' 'TRUE' 'FALSE' 'excellent' 'z' 'Yes']]
[['young' 'TRUE' 'SURE' 'good' 'x' 'Yes']
 ['young' 'TRUE' 'TRUE' 'fair' 'y' 'Yes']
 ['mid

In [135]:
dataDict, classDist = count_features_per_class(data)
#print(dataDict, classDist)

  keys, freqs = (np.unique(data[[data[:,-1] == c]][:,f], return_counts = True))


In [160]:
# Proof of concept debug
featEntropy = comp_feat_entropy(dataDict[2], debug = True)
print("Resulting Entropy is: ", featEntropy)

{'FALSE': [3, 4, 2], 'SURE': [0, 1, 2], 'TRUE': [4, 0, 4]}
[[3 4 2]
 [0 1 2]
 [4 0 4]] 20
[0.45, 0.15, 0.4]
[[0.33333333 0.44444444 0.22222222]
 [0.         0.33333333 0.66666667]
 [0.5        0.         0.5       ]]
[1.53108276 0.92754291 1.00907751]
[1.231749682433986]
Resulting Entropy is:  [1.23174968]


In [161]:
featureInfoGain = compute_feat_info_gain(dataDict, classDist)
print(featureInfoGain)
order = np.argsort(featureInfoGain)
print(order, featureInfoGain)
print("Most Informative feature: ", order[-1], " with Info gain: ", featureInfoGain[order[-1]])
print("Complete order:",*["Feature {}, Info Gain: {}".format(i, featureInfoGain[i]) for i in order[::-1]],sep = '\n')

[0.03631326 0.36774426 0.32712217 0.24314639 0.4960771 ]
[0 3 2 1 4] [0.03631326 0.36774426 0.32712217 0.24314639 0.4960771 ]
Most Informative feature:  4  with Info gain:  0.4960770969973616
Complete order:
Feature 4, Info Gain: 0.4960770969973616
Feature 1, Info Gain: 0.36774426292708107
Feature 2, Info Gain: 0.3271221660113741
Feature 3, Info Gain: 0.2431463938840257
Feature 0, Info Gain: 0.03631325887309744


In [162]:
def result_to_str(featureInfoGain, order, sId = 'nagada2'):
    ''' DESCRIPTION: Format return string
    '''
    retStr = ''.join(('(', sId,'\n', *['Feature: {}, Info Gain: {}\n'.format(i, featureInfoGain[i]) for i in order[::-1]], ')'))
    return retStr

def demo_result_to_str(featureInfoGain, order, sId = 'nagada2'):
    ''' DESCRIPTION: Format return string
    '''
    retStr = ''.join(('(', sId,'\n', '(IG ', str(featureInfoGain [0]),')\n)'))
    return retStr

def write_string_to_file(s, wFile = 'results2.txt'):
    ''' DESCRIPTION: Write string to target file.
    '''
    try:
        with open(wFile, 'w') as wf:
            wf.write(s)
    except Exception as e:
        print(e)

In [164]:
sId = '54'
retStr = demo_result_to_str(featureInfoGain, order, sId)
saveFile = '_'.join((sId, '2.txt'))
write_string_to_file(retStr, wFile = saveFile)
print("Complete order:", retStr, sep='\n')

Complete order:
(54
(IG 0.03631325887309744)
)
