# Project 1

In [69]:
import numpy as np
import pandas as pd
from collections import defaultdict
from IPython.display import display

# Holdout threshold: a row goes to the training split when its random draw
# falls below this value (80/20, after the Pareto principle).
SPLIT_RATIO = 0.8
# Number of update passes run by the unsupervised NB loop.
ITERATIONS = 10
# Stand-in probability written in place of a zero estimate.
EPSILON = 10 ** -6

# Every dataset lives in the same project data directory.
_DATA_DIR = '2018S1-proj1_data'

DATASET1 = _DATA_DIR + '/breast-cancer-dos.csv'
DATASET2 = _DATA_DIR + '/car-dos.csv'
DATASET3 = _DATA_DIR + '/hypothyroid-dos.csv'
DATASET4 = _DATA_DIR + '/mushroom-dos.csv'
# Datasets iterated over by the experiments.
DATASETS = [DATASET1, DATASET2, DATASET3, DATASET4]
# Small example dataset.
SAMPLE = _DATA_DIR + '/sample.csv'

## Preprocess Supervised

In [18]:
def createKey(lst):
    '''
    Build the dictionary key that identifies one conditional probability.
    @param lst = values to combine; each one is converted with str()
    @return the values joined by '|'
            (i.e. [0, 'mild', 'flu'] -> '0|mild|flu' = (0==mild given flu))
    '''
    return '|'.join(map(str, lst))

In [19]:
def randDistGen(length):
    '''
    Draw a random discrete probability distribution.
    @param length = number of entries in the distribution
    @return numpy array of `length` random values scaled so that they sum to 1
    '''
    weights = np.random.random(length)
    total = weights.sum()
    return weights / total

In [34]:
def preprocessSup(data, flag=False):
    '''
    Load a header-less CSV and produce the train / test frames for the
    supervised experiments.
    @param data = dataset (anything pd.read_csv accepts)
    @param flag = True = no split (the same frame is returned as both train
                  and test), False = random holdout split
    @return train, test = the two DataFrames
    '''
    frame = pd.read_csv(data, header=None)

    # Anything that does not compare equal to False means "do not split".
    if flag != False:
        return frame, frame

    # One uniform draw per row; rows below SPLIT_RATIO go to training.
    inTrain = np.random.rand(len(frame)) < SPLIT_RATIO
    return frame[inTrain], frame[~inTrain]

## Train Supervised

In [55]:
def trainSup(trainSet):
    '''
    Create supervised Naive Bayes model by returning prior and posterior probability
    @param trainSet = data used for training; the last column holds the class
    @return priorProb, posteriorProb = class priors (pandas Series indexed by
            class) and a dict that maps createKey([column position, value, class])
            to the fraction of that class's rows having the value in the column.
            A fraction of exactly 0 is stored as EPSILON (epsilon smoothing).
            The value '?' never gets an entry.
    '''
    classColumn = trainSet.iloc[:, -1]
    priorCounts = classColumn.value_counts()
    priorProb = priorCounts / trainSet.shape[0]
    attribCount = trainSet.shape[1]
    posteriorProb = {}

    # The rows belonging to a class do not depend on the attribute or value
    # being counted, so select them once per class rather than once per
    # (attribute, value, class) combination.
    rowsByClass = {c: trainSet[classColumn == c] for c in priorCounts.index}

    # Iterating over all columns except for the class column
    for attrib in range(attribCount - 1):
        # Columns are addressed by position so the keys agree with
        # predictSup, which numbers attributes by position as well.
        attribValues = list(trainSet.iloc[:, attrib].unique())
        # Generate list of unique attribute values and disregard ?
        if '?' in attribValues: attribValues.remove('?')

        # Calculate posterior probabilities
        for c in priorCounts.index:
            classValues = rowsByClass[c].iloc[:, attrib]

            for val in attribValues:
                # Generate key for dictionary (0|severe|flu means (column 0=severe|flu)
                key = createKey([attrib, val, c])
                prob = (classValues == val).sum() / priorCounts[c]
                # epsilon smoothing: never store a zero probability
                posteriorProb[key] = EPSILON if prob == 0 else prob

    return priorProb, posteriorProb
# (train, test) = preprocessSup(SAMPLE, True)
# (prior, posterior) = trainSup(train)
# print(prior)
# print(posterior)

## Predict Supervised

In [73]:
def predictSup(testSet, priorProb, posteriorProb):
    '''
    Generate prediction for the testSet
    @param testSet = test data that will be classified; its last column is
                     dropped before scoring
    @param priorProb, posteriorProb = model (as returned by trainSup)
    @return predictedClasses = list with the highest-scoring class for every
            row of testSet, in row order ('null' if no class scores above
            negative infinity)

    Scores are sums of log probabilities. An attribute value that has no
    entry in posteriorProb (for instance '?', which trainSup leaves out)
    adds nothing to the score.
    '''
    cleanTest = testSet.drop(testSet.columns[-1], axis=1)
    predictedClasses = []

    # The log prior of a class is the same for every instance.
    logPriors = [(c, np.log(priorProb[c])) for c in priorProb.index]

    # itertuples keeps each column's own dtype. iterrows would turn every row
    # into one Series and upcast mixed int/float rows to float, so a value
    # trained as 1 would be looked up as 1.0 and silently ignored.
    for instance in cleanTest.itertuples(index=False, name=None):
        bestClass, bestScore = 'null', -float("inf")  # track most probable class

        for c, score in logPriors:
            # maximum likelihood estimation of each instance
            for attrib, val in enumerate(instance):
                key = createKey([attrib, val, c])
                if key in posteriorProb: score += np.log(posteriorProb[key])

            # strict comparison: on a tie the earlier class is kept
            if score > bestScore: bestClass, bestScore = c, score

        # predicted class = most likely class
        predictedClasses.append(bestClass)

    return predictedClasses
# (train, test) = preprocessSup(SAMPLE, True)
# (prior, posterior) = trainSup(train)
# print(predictSup(test, prior, posterior))
# print(test)
# print(train)
# print(prior)
# print(posterior)

['Flu', 'Cold', 'Flu', 'Cold', 'Flu']
        0       1       2    3     4
0  severe    mild    high  yes   Flu
1      no  severe  normal  yes  Cold
2    mild    mild  normal  yes   Flu
3    mild      no  normal   no  Cold
4  severe  severe  normal  yes   Flu
        0       1       2    3     4
0  severe    mild    high  yes   Flu
1      no  severe  normal  yes  Cold
2    mild    mild  normal  yes   Flu
3    mild      no  normal   no  Cold
4  severe  severe  normal  yes   Flu
Flu     0.6
Cold    0.4
Name: 4, dtype: float64
{'0|severe|Flu': 0.66666666666666663, '0|no|Flu': 1e-06, '0|mild|Flu': 0.33333333333333331, '0|severe|Cold': 1e-06, '0|no|Cold': 0.5, '0|mild|Cold': 0.5, '1|mild|Flu': 0.66666666666666663, '1|severe|Flu': 0.33333333333333331, '1|no|Flu': 1e-06, '1|mild|Cold': 1e-06, '1|severe|Cold': 0.5, '1|no|Cold': 0.5, '2|high|Flu': 0.33333333333333331, '2|normal|Flu': 0.66666666666666663, '2|high|Cold': 1e-06, '2|normal|Cold': 1.0, '3|yes|Flu': 1.0, '3|no|Flu': 1e-06, '3|yes|Col

## Evaluate Supervised

In [24]:
def evaluateSup(testSet, predictedClasses):
    '''
    Simple accuracy measure of the supervised context
    @param testSet = test data; the true class is read from its last column
    @param predictedClasses = predicted class for every test row, in row order
    @return accuracy = fraction of predictions equal to the true class, or
            None (after printing an error) when the two lengths differ
    '''
    actual = testSet.iloc[:, -1].tolist()
    total = len(actual)

    if total != len(predictedClasses):
        print('Error: Class length')
        return

    hits = sum(1 for truth, guess in zip(actual, predictedClasses) if truth == guess)
    return hits / total

In [29]:
def createConfusionMatrixSup(trueClass, predictedClasses, classes):
    '''
    Create confusion matrix for supervised evaluation
    @param trueClass = true class result array
    @param predictedClasses = prediction classes array
    @param classes = possible unique classes; they set the row/column order
    @return confusionMatrix = DataFrame of counts with one '<class> (Actual)'
            row and one '<class> (Predicted)' column per class, or None (after
            printing an error) when the two arrays differ in length

    A label that occurs in trueClass or predictedClasses without being listed
    in classes gets its own row and column after the listed ones.
    '''
    if len(trueClass) != len(predictedClasses):
        print('Error: Class length')
        return

    # Keep the caller's class order, then append every label that shows up in
    # the data but was not listed, so counting can never hit a missing cell.
    labels = []
    position = {}
    for source in (classes, trueClass, predictedClasses):
        for label in source:
            if label not in position:
                position[label] = len(labels)
                labels.append(label)

    # actual is the row, predicted is the column
    counts = np.zeros((len(labels), len(labels)), dtype='int64')
    for actual, predicted in zip(trueClass, predictedClasses):
        counts[position[actual], position[predicted]] += 1

    # format() rather than '+' so that non-string labels are accepted too
    return pd.DataFrame(
        counts,
        index=['{} (Actual)'.format(label) for label in labels],
        columns=['{} (Predicted)'.format(label) for label in labels],
    )

## Train Unsupervised

In [None]:
def trainUnsup(df):
    '''
    Initialise instances with random (non-uniform) class distributions
    @param df = dataset; the last column holds the class label
    @return cleanTrain, classList, priorCounts =
            cleanTrain: df without its class column, plus one probability
                        column per class (named after the class) holding that
                        instance's random initial class distribution
            classList: set of the unique class labels
            priorCounts: numpy array with the column-wise sum of the random
                         distributions, ordered like enumerate(classList)
    '''
    classList = set(df.iloc[:,-1])  # extract unique classes
    classCount = len(classList)
    cleanTrain = df.drop(df.columns[-1], axis=1)  # drop class col
    N = cleanTrain.shape[0]  # instance count

    # One random distribution per instance. The reshape keeps the array 2-D
    # when there are no instances at all.
    randoms = np.array(
        [randDistGen(classCount) for _ in range(N)], dtype=float
    ).reshape(N, classCount)

    # summing the per-instance distributions gives the (soft) class counts
    priorCounts = randoms.sum(axis=0)

    # Whole columns are assigned by position, so this also works when the
    # frame's index is not 0..N-1.
    # NOTE(review): a class label equal to an attribute column label would
    # overwrite that attribute column - confirm labels never collide.
    for idx, c in enumerate(classList):
        cleanTrain[c] = randoms[:, idx]

    return cleanTrain, classList, priorCounts

In [None]:
def _mostLikelyClasses(cleanTrain, classes):
    '''
    Pick, for every instance, the class whose probability column is largest.
    Yields 'null' for an instance where no class probability is above 0.
    '''
    predicted = []

    for _, instance in cleanTrain.iterrows():
        currentMax = ['null', 0]

        for c in classes:
            if instance[c] > currentMax[1]: currentMax = [c, instance[c]]
        predicted.append(currentMax[0])

    return predicted


def predictUnsup(cleanTrain, classes, priorCounts, trueClass):
    '''
    returns predicted classes and final class distributions
    @param cleanTrain = attribute columns followed by one probability column
                        per class (as built by trainUnsup); updated in place
    @param classes = class labels; priorCounts is indexed in
                     enumerate(classes) order
    @param priorCounts = per-class sums of the probability columns; updated
                         in place
    @param trueClass = true class of every instance, used only for the
                       evaluation printed at the start of every pass
    @return predictedClasses, cleanTrain = most likely class per instance
            according to the final distributions, and the updated frame

    Runs ITERATIONS passes. Every pass first evaluates (and prints, through
    evaluateUnsup) the current predictions, then re-estimates the attribute
    value|class probabilities from the current class distributions, and
    finally recomputes and normalises every instance's class distribution.
    '''
    N = cleanTrain.shape[0]  # instance count
    attribCount = cleanTrain.shape[1] - len(classes)
    priorProb = priorCounts / N

    for j in range(ITERATIONS):

        # report how the current distributions score before refining them
        evaluateUnsup(trueClass, _mostLikelyClasses(cleanTrain, classes), classes, True)

        posteriorProb = defaultdict(lambda: 0)

        # generate attribute value|class pair probabilities
        for attrib in range(attribCount) :
            attribValues = list(cleanTrain[attrib].unique())
            if ('?' in attribValues): attribValues.remove('?')

            for idx, c in enumerate(classes):
                for val in attribValues:
                    key = createKey([attrib, val, c])
                    filterClassVal = cleanTrain[cleanTrain[attrib] == val]
                    posteriorProb[key] += filterClassVal[c].sum() / priorCounts[idx]

        # maximum likelihood estimation of each instance
        for i, instance in cleanTrain.iterrows():
            classSum = 0.0
            # Only the attribute columns can have a posterior entry; the
            # trailing class-probability columns are left out of the lookup.
            attribVals = list(instance)[:attribCount]

            for idx, c in enumerate(classes):
                tmpProb = priorProb[idx]

                for attrib, val in enumerate(attribVals):
                    key = createKey([attrib, val, c])
                    if key in posteriorProb: tmpProb *= posteriorProb[key]
                classSum += tmpProb
                cleanTrain.at[i, c] = tmpProb

            # normalise posterior
            for c in classes: cleanTrain.at[i, c] /= classSum

        # recalculate prior
        for idx,c in enumerate(classes): priorCounts[idx] = cleanTrain[c].sum()
        priorProb = priorCounts / N

    # Take the returned predictions from the distributions produced by the
    # last pass, so they describe the same state as the returned frame. This
    # also defines the result when ITERATIONS is 0.
    predictedClasses = _mostLikelyClasses(cleanTrain, classes)

    return predictedClasses, cleanTrain

In [None]:
def evaluateUnsup(trueClass, predictedClasses, classes, flag):
    '''
    builds a confusion matrix for unsupervised evaluation
    @param trueClass = true class result array
    @param predictedClasses = prediction classes array
    @param classes = possible unique classes; they set the row/column order
    @param flag = True prints the confusion matrix, False stays silent
    @return unsupervised accuracy = sum of every predicted column's largest
            cell divided by the number of instances, or None (after printing
            an error) when the two arrays differ in length

    A label that occurs in trueClass or predictedClasses without being listed
    in classes (for example a 'null' prediction) gets its own row and column
    after the listed ones.
    '''
    if len(trueClass) != len(predictedClasses):
        print('Error: Class length')
        return

    # Keep the caller's class order, then append every label that shows up in
    # the data but was not listed, so counting can never hit a missing cell.
    labels = []
    position = {}
    for source in (classes, trueClass, predictedClasses):
        for label in source:
            if label not in position:
                position[label] = len(labels)
                labels.append(label)

    # actual is the row, predicted is the column
    counts = np.zeros((len(labels), len(labels)), dtype='int64')
    for actual, predicted in zip(trueClass, predictedClasses):
        counts[position[actual], position[predicted]] += 1

    if flag:
        # format() rather than '+' so that non-string labels are accepted too
        confusionMatrix = pd.DataFrame(
            counts,
            index=['{} (Actual)'.format(label) for label in labels],
            columns=['{} (Predicted)'.format(label) for label in labels],
        )
        print(confusionMatrix)

    # calculate unsupervised accuracy
    # Predicted labels are arbitrary cluster names, so every predicted column
    # is credited with the actual class it overlaps most: take the maximum
    # down each column (not along each row).
    maxSum = counts.max(axis=0).sum() if counts.size else 0
    totalSum = counts.sum()

    return maxSum / totalSum

## Main Function

In [89]:
def sample_experiment(func, desc, flag, flag_print, runs=10, datasets=None):
    '''
    Used mainly in holdout method to average the measure over repeated runs
    @param func = function that will be run as func(dataset, flag, flag_print);
                  it must return a number
    @param desc = description of experiment (printed first)
    @param flag = if true not split else split
    @param flag_print = true print, false otherwise
    @param runs = how many times func is run per dataset (default 10)
    @param datasets = datasets to run over (default: the module's DATASETS)
    Prints one '<dataset> | Avg. Measure: <average>' line per dataset.
    '''
    if runs < 1:
        raise ValueError('runs must be at least 1')
    if datasets is None:
        datasets = DATASETS

    print(desc)  # description of experiment

    for d in datasets:
        total = sum(func(d, flag, flag_print) for _ in range(runs))
        print('{} | Avg. Measure: {}'.format(d, total / runs))

In [94]:
def mainSup(data, flag=False, flag_print=True):
    '''
    Main function for supervised to be run across a dataset
    @param data = dataset used to run
    @param flag = if true not split else split (default = split)
    @param flag_print = true print, false otherwise
    @return accuracy = accuracy of the data
    '''
    # If true (don't split), false split
    trainSet, testSet = preprocessSup(data, flag)

    priorProb, posteriorProb = trainSup(trainSet)
    predictedClasses = predictSup(testSet, priorProb, posteriorProb)
    accuracy = evaluateSup(testSet, predictedClasses)

    # The confusion matrix is only shown, never returned, so build it only
    # when it is going to be displayed.
    if (flag_print):
        trueClasses = testSet.iloc[:, -1]

        # Predictions are drawn from the classes seen in training, which may
        # include classes that are absent from the test split. List those as
        # well so that every predicted label has a column.
        classes = list(trueClasses.unique())
        for c in priorProb.index:
            if c not in classes: classes.append(c)

        confusion_matrix = createConfusionMatrixSup(trueClasses.tolist(), predictedClasses, classes)
        display(confusion_matrix)
        print("\nThe accuracy for the dataset is {}.".format(accuracy))

    # Return accuracy
    return accuracy
# accuracyTester = mainSup(DATASET4, True)
# sample_experiment(mainSup, "Try the holdout", False, False)
# mainSup(DATASET4, True, False)

0.991999015263417

In [95]:
def mainQuestion3():
    '''
    Used to answer question 3: runs the supervised experiment once with the
    holdout split and once without it, printing a blank separator in between.
    '''
    experiments = (
        ("Using holdout, averaged over 10 runs", False),  # using holdout
        ("Training in test data", True),                  # using no holdout
    )

    for position, (description, noSplit) in enumerate(experiments):
        if position > 0:
            print("\n")
        sample_experiment(mainSup, description, noSplit, False)

# mainQuestion3()

Using holdout, averaged over 10 runs
2018S1-proj1_data/breast-cancer-dos.csv | Avg. Measure: 0.7527521941965094
2018S1-proj1_data/car-dos.csv | Avg. Measure: 0.8519416177177661
2018S1-proj1_data/hypothyroid-dos.csv | Avg. Measure: 0.9521784112150685
2018S1-proj1_data/mushroom-dos.csv | Avg. Measure: 0.9913746397451153


Training in test data
2018S1-proj1_data/breast-cancer-dos.csv | Avg. Measure: 0.7552447552447551
2018S1-proj1_data/car-dos.csv | Avg. Measure: 0.8738425925925926
2018S1-proj1_data/hypothyroid-dos.csv | Avg. Measure: 0.9522605121719886
2018S1-proj1_data/mushroom-dos.csv | Avg. Measure: 0.991999015263417


In [None]:
def deltaQuestion6(df, predict):
    '''
    Calculates how far away probabilistic estimate of true class is from 1.
    Assumes probabilistic estimate of true class = highest probability of all classes due to class 'swapping'
    @param df = frame with one probability column per class label
    @param predict = predicted class label for every row of df, in row order
    @return average over all rows of |1 - probability of the predicted class|
    Raises ZeroDivisionError when df has no rows.
    '''
    deltaSum = 0

    # difference (probability) between each predicted class and 1
    # Rows and predictions are paired by position: the index label handed out
    # by iterrows is only a valid list position when the index is 0..N-1.
    for position, (_, row) in enumerate(df.iterrows()):
        deltaSum += abs(1 - row[predict[position]])

    return deltaSum / df.shape[0]

In [None]:
def mainUnsup(data):
    '''
    Run unsupervised NB over the dataset stored at 'data'.
    @param data = dataset to read (header-less CSV, class in the last column)
    @return unsupervised accuracy reported by evaluateUnsup
    Also prints the final confusion matrix and the 'delta average' measure.
    '''
    frame = pd.read_csv(data, header=None)
    actual = frame.iloc[:, -1].tolist()  # true classes, used for evaluation only

    # trainState = (cleanTrain, classes, priorCounts)
    trainState = trainUnsup(frame)
    classes = trainState[1]

    predicted, distributions = predictUnsup(*trainState, actual)
    score = evaluateUnsup(actual, predicted, classes, True)
    delta = deltaQuestion6(distributions, predicted)

    print('delta average', delta)

    return score