# Project 1

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from IPython.display import display

# Probability threshold for the random holdout mask: rows whose random draw
# falls below it go to the training set, the rest to the test set
SPLIT_RATIO = 0.8  # holdout ratio (according to Pareto principle)
# Number of train/predict rounds passed to the unsupervised driver
ITERATIONS = 10  # iterations for unsupervised NB
# Value substituted for conditional probabilities that come out as exactly 0
EPSILON = 10**(-6)

# Dataset locations (the loaders read these CSV files without a header row)
DATASET1 = '2018S1-proj1_data/breast-cancer-dos.csv'
DATASET2 = '2018S1-proj1_data/car-dos.csv'
DATASET3 = '2018S1-proj1_data/hypothyroid-dos.csv'
DATASET4 = '2018S1-proj1_data/mushroom-dos.csv'
# Datasets iterated over by the averaged experiments
DATASETS = [DATASET1, DATASET2, DATASET3, DATASET4]
SAMPLE = '2018S1-proj1_data/sample.csv'  # example dataset

## Preprocess Supervised

In [2]:
def createKey(lst):
    '''
    Build the string key used to index the probability dictionary.
    @param lst = values to combine; each one is converted with str()
    @return the converted values joined by '|'
            (e.g. [0, 'mild', 'flu'] -> '0|mild|flu', read as
            "column 0 == mild given class flu")
    '''
    parts = map(str, lst)
    return '|'.join(parts)

In [3]:
def randDistGen(length):
    '''
    Draw a random probability distribution.
    @param length = number of entries in the distribution
    @return numpy array of `length` random numbers scaled so they sum to 1
    '''
    draws = np.random.random(length)
    total = draws.sum()
    return draws / total

In [4]:
def preprocessSup(data, flag=False):
    '''
    Load a headerless CSV and produce the training and test sets for the
    supervised classifier.
    @param data = path of the dataset
    @param flag = True = no split (train and test are the same frame),
                  False = random split driven by SPLIT_RATIO
    @return train, test = pandas DataFrames
    '''
    frame = pd.read_csv(data, header=None)

    splitRequested = (flag == False)
    if not splitRequested:
        # Train on everything and evaluate on the very same rows
        return frame, frame

    # One uniform draw per row; draws below the ratio select training rows
    inTrain = np.random.rand(len(frame)) < SPLIT_RATIO
    return frame[inTrain], frame[~inTrain]

## Train Supervised

In [5]:
def trainSup(trainSet):
    '''
    Create the supervised Naive Bayes model from a training frame whose last
    column holds the class label.
    @param trainSet = pandas DataFrame used for training
    @return priorProb = pandas Series mapping each class to its relative
            frequency in trainSet
    @return posteriorProb = dict mapping createKey([attrib, val, class]) to
            count(attrib == val within class) / count(class); estimates equal
            to 0 are replaced by EPSILON. The value '?' gets no entry, but
            rows holding '?' still count toward the class total.
    '''
    classLabels = trainSet.iloc[:, -1]
    priorCounts = classLabels.value_counts()
    priorProb = priorCounts / trainSet.shape[0]
    attribCount = trainSet.shape[1]

    # Select the rows of every class once; this does not depend on the
    # attribute or on the attribute value being counted
    rowsByClass = {c: trainSet[classLabels == c] for c in priorCounts.index}

    posteriorProb = {}

    # Iterating over all columns except for the class column
    for attrib in range(attribCount - 1):
        # Unique attribute values, disregarding the missing-value marker
        attribValues = list(trainSet[attrib].unique())
        if '?' in attribValues:
            attribValues.remove('?')

        for c in priorCounts.index:
            classValues = rowsByClass[c][attrib]
            classTotal = priorCounts[c]

            for val in attribValues:
                prob = (classValues == val).sum() / classTotal
                # Epsilon smoothing: never store a zero probability, so the
                # log taken at prediction time stays finite
                key = createKey([attrib, val, c])
                posteriorProb[key] = prob if prob != 0 else EPSILON

    return priorProb, posteriorProb

## Predict Supervised

In [6]:
def predictSup(testSet, priorProb, posteriorProb):
    '''
    Classify every instance of the test set with the trained model.
    @param testSet = test data; its last column (the class) is ignored
    @param priorProb, posteriorProb = model returned by trainSup
    @return predictedClasses = list with one predicted class per instance
            ('null' when no class scores above minus infinity)
    '''
    attributes = testSet.drop(testSet.columns[-1], axis=1)
    predictedClasses = []

    for _, instance in attributes.iterrows():
        # Best candidate so far for this instance
        bestClass = 'null'
        bestScore = -float("inf")

        for c in priorProb.index:
            # Log-space score: log prior plus the log likelihood of every
            # attribute value the model has an entry for
            score = np.log(priorProb[c])

            for attrib, val in enumerate(instance):
                key = createKey([attrib, val, c])
                if key not in posteriorProb:
                    continue  # value without an entry contributes nothing
                score += np.log(posteriorProb[key])

            if score > bestScore:
                bestClass, bestScore = c, score

        predictedClasses.append(bestClass)

    return predictedClasses

## Evaluate Supervised

In [7]:
def evaluateSup(testSet, predictedClasses):
    '''
    Simple accuracy measure for the supervised context.
    @param testSet = test DataFrame; the true class is its last column
    @param predictedClasses = predicted class per test instance
    @return accuracy = correct predictions / number of instances;
            0.0 for an empty test set; None (after printing an error) when
            the number of predictions differs from the number of instances
    '''
    trueClass = testSet.iloc[:, -1].tolist()

    if len(trueClass) != len(predictedClasses):
        print('Error: Class length')
        return None

    # A random holdout split can leave the test set empty; report 0.0
    # instead of dividing by zero
    if not trueClass:
        return 0.0

    correct = sum(
        1 for actual, predicted in zip(trueClass, predictedClasses)
        if actual == predicted
    )
    return correct / len(trueClass)

In [8]:
def createConfusionMatrix(trueClass, predictedClasses, classes):
    '''
    Create the confusion matrix for supervised and unsupervised results.
    @param trueClass = sequence of true classes
    @param predictedClasses = sequence of predicted classes (same length)
    @param classes = the unique classes; fixes the row and column order
    @return confusionMatrix = pandas DataFrame of counts, rows labelled
            '<class> (Actual)' and columns '<class> (Predicted)'; None (after
            printing an error) when the two sequences differ in length
    Raises KeyError when a true or predicted class is not listed in classes.
    '''
    if len(trueClass) != len(predictedClasses):
        print('Error: Class length')
        return None

    labels = list(classes)
    position = {label: i for i, label in enumerate(labels)}

    # Count in a plain integer array: actual class is the row, predicted
    # class is the column
    counts = np.zeros((len(labels), len(labels)), dtype=np.int64)
    for actual, predicted in zip(trueClass, predictedClasses):
        counts[position[actual], position[predicted]] += 1

    # format() rather than '+' so that non-string class labels work too
    return pd.DataFrame(
        counts,
        index=['{} (Actual)'.format(label) for label in labels],
        columns=['{} (Predicted)'.format(label) for label in labels],
    )

## Preprocess Unsupervised

In [9]:
def preprocessUnsup(data):
    '''
    Preprocessing for the unsupervised classifier (no train/test split).
    @param data = path of the dataset (headerless CSV)
    @return tuple (dataFrame, unsupervisedDataFrame): the frame as read from
            the csv file, and the frame produced from it by initialiseUnsup
    '''
    rawFrame = pd.read_csv(data, header=None)
    return (rawFrame, initialiseUnsup(rawFrame))

In [10]:
def initialiseUnsup(dataset):
    '''
    Initialise the dataset with a random class distribution per instance.
    @param dataset = DataFrame whose last column, labelled
                     dataset.shape[1] - 1, holds the class
    @return unsupervisedDataset = copy of dataset without the class column
            and with one added column per distinct class value; the added
            columns of each row hold a random distribution that sums to 1
    '''
    rowNumber = dataset.shape[0]
    classLabel = dataset.shape[1] - 1  # label of the last (class) column
    classColumn = list(dataset[classLabel].unique())
    classNumber = len(classColumn)
    unsupervisedDataset = dataset.drop(classLabel, axis=1)

    # One random distribution over the classes for every row
    sampleMatrix = np.zeros((rowNumber, classNumber))
    for i in range(rowNumber):
        sampleMatrix[i] = randDistGen(classNumber)

    # Attach each class column in one positional assignment. This avoids
    # addressing the new columns through a negative slice on the label-based
    # indexer and avoids filling integer columns with floats row by row.
    for position, c in enumerate(classColumn):
        unsupervisedDataset[c] = sampleMatrix[:, position]

    return unsupervisedDataset

## Train Unsupervised

In [11]:
def trainUnsup(classColumn, dataset):
    '''
    Build an unsupervised NB model from the soft class memberships stored in
    the dataset.
    @param classColumn = class names; each one is a membership column
    @param dataset = frame in the format produced by initialiseUnsup
                     (attribute columns 0..k-1 followed by class columns)
    @return priorProb = dict class -> share of the total membership mass
    @return posteriorProb = dict of dicts, posteriorProb[col][class][value] =
            membership mass of `value` in column `col` for `class`, divided
            by the mass of all values; exact zeros are replaced by EPSILON
    '''
    attribColumn = list(range(dataset.shape[1] - len(classColumn)))

    # Prior: summed membership of each class, normalised over all classes
    classMass = {c: dataset[c].sum() for c in classColumn}
    massTotal = sum(classMass.values())
    priorProb = {c: classMass[c] / massTotal for c in classColumn}

    # Zeroed accumulator for every (column, class, attribute value)
    valuesByColumn = {col: dataset[col].unique() for col in attribColumn}
    posteriorProb = {
        col: {c: {uc: 0 for uc in valuesByColumn[col]} for c in classColumn}
        for col in attribColumn
    }

    # Every row adds its membership of each class to the value it holds
    for _, row in dataset.iterrows():
        for col in attribColumn:
            observed = row[col]
            for c in classColumn:
                posteriorProb[col][c][observed] += row[c]

    # Turn the accumulated mass into probabilities, smoothing exact zeros
    for col in attribColumn:
        for c in classColumn:
            weights = posteriorProb[col][c]
            sumInstance = sum(weights.values())
            for uc in valuesByColumn[col]:
                weights[uc] /= sumInstance
                if weights[uc] == 0:
                    weights[uc] = EPSILON

    return (priorProb, posteriorProb)

## Predict Unsupervised

In [12]:
def predictUnsup(classColumn, dataset, priorProb, posteriorProb):
    '''
    Predict the class of every instance with a trained unsupervised model and
    refresh the class memberships stored in the dataset.
    @param classColumn = class names (also the membership column labels)
    @param dataset = frame with attribute columns 0..k-1 followed by the
                     class columns; it is MODIFIED IN PLACE: the class
                     columns of each row are overwritten with the normalised
                     class probabilities, ready for the next iteration
    @param priorProb = dict class -> prior probability
    @param posteriorProb = posteriorProb[col][class][value] probabilities
    @return testClass = list with the most probable class of each instance
            (the earliest class in classColumn wins ties)
    '''
    attribColumn = list(range(dataset.shape[1] - len(classColumn)))
    testClass = []  # predicted class per instance

    for index, row in dataset.iterrows():
        # Naive Bayes score of each class, in log space
        logScore = {}
        for c in classColumn:
            score = np.log(priorProb[c])
            for col in attribColumn:
                score += np.log(posteriorProb[col][c][row[col]])
            logScore[c] = score

        maximumClass = max(classColumn, key=logScore.__getitem__)
        testClass.append(maximumClass)

        # Back to probabilities for the next iteration. The largest log
        # score is subtracted first: the ratios are unchanged, but very
        # negative scores no longer underflow to 0 for every class, which
        # would make the normalisation 0 / 0.
        peak = logScore[maximumClass]
        weight = {c: np.exp(logScore[c] - peak) for c in classColumn}
        denominatorNew = sum(weight.values())

        for c in classColumn:
            dataset.loc[index, c] = weight[c] / denominatorNew

    return testClass

## Evaluate Unsupervised

In [13]:
def evaluateUnsup(confusionMatrix):
    '''
    Accuracy of the unsupervised model, tolerant of class swapping.
    @param confusionMatrix = confusion matrix of the unsupervised result
    @return sum of the largest count of every column divided by the sum of
            all counts
    '''
    grandTotal = confusionMatrix.values.sum()

    # The biggest cell of a column is taken as that column's correct count
    bestPerColumn = sum(
        confusionMatrix[col].max() for col in confusionMatrix.columns
    )

    return bestPerColumn / grandTotal

## Main Function

In [14]:
def sampleExperiment(func, desc, flag=False, flagPrint=True, runs=10, datasets=None):
    '''
    Run an experiment several times per dataset and print the average of the
    returned measure (used mainly to average the holdout method).
    @param func = function to run, called as func(dataset, flag, flagPrint);
                  must return a number
    @param desc = description of the experiment, printed first
    @param flag = if true not split else split (default = split)
    @param flagPrint = true print, false otherwise (default = print)
    @param runs = number of runs averaged per dataset (default = 10)
    @param datasets = datasets to iterate over (default = DATASETS)
    Raises ValueError when runs is smaller than 1.
    '''
    if runs < 1:
        raise ValueError('runs must be at least 1')
    if datasets is None:
        datasets = DATASETS

    print(desc)  # description of experiment

    for d in datasets:
        avgMeasure = 0
        for _ in range(runs):
            avgMeasure += func(d, flag, flagPrint)
        print('{} | Avg. Measure: {}'.format(d, avgMeasure / runs))

In [16]:
def mainSup(data, flag=False, flagPrint=True):
    '''
    Main function for supervised NB: preprocess, train, predict and evaluate
    on one dataset.
    @param data = dataset used to run
    @param flag = if true not split else split (default = split)
    @param flagPrint = true print the confusion matrix and the accuracy,
                       false otherwise
    @return accuracy = accuracy of the predictions on the test set
    '''
    # If true (don't split), false split
    trainSet, testSet = preprocessSup(data, flag)

    priorProb, posteriorProb = trainSup(trainSet)
    predictedClasses = predictSup(testSet, priorProb, posteriorProb)
    accuracy = evaluateSup(testSet, predictedClasses)

    if flagPrint:
        trueClasses = testSet.iloc[:, -1].tolist()
        # True classes in order of first appearance, followed by any class
        # that was only predicted: a holdout test split may lack a class the
        # model still predicts, and the matrix needs a column for it
        classes = list(dict.fromkeys(trueClasses + list(predictedClasses)))
        confusionMatrix = createConfusionMatrix(trueClasses, predictedClasses, classes)

        display(confusionMatrix)
        print("\nThe accuracy for the dataset is {}.\n\n".format(accuracy))

    # Return accuracy
    return accuracy

# Run mainSup through sampleExperiment with the holdout split enabled
# (flag=False) and per-run printing of the confusion matrix and accuracy
# switched on (flagPrint=True)
sampleExperiment(mainSup, "Try the holdout", False, True)

Try the holdout


Unnamed: 0,recurrence-events (Predicted),no-recurrence-events (Predicted)
recurrence-events (Actual),12,8
no-recurrence-events (Actual),8,30



The accuracy for the dataset is 0.7241379310344828.


Unnamed: 0,no-recurrence-events (Predicted),recurrence-events (Predicted)
no-recurrence-events (Actual),38,7
recurrence-events (Actual),9,3



The accuracy for the dataset is 0.7192982456140351.


Unnamed: 0,no-recurrence-events (Predicted),recurrence-events (Predicted)
no-recurrence-events (Actual),31,3
recurrence-events (Actual),7,4



The accuracy for the dataset is 0.7777777777777778.


Unnamed: 0,no-recurrence-events (Predicted),recurrence-events (Predicted)
no-recurrence-events (Actual),39,5
recurrence-events (Actual),11,9



The accuracy for the dataset is 0.75.


Unnamed: 0,no-recurrence-events (Predicted),recurrence-events (Predicted)
no-recurrence-events (Actual),34,7
recurrence-events (Actual),8,5



The accuracy for the dataset is 0.7222222222222222.


Unnamed: 0,no-recurrence-events (Predicted),recurrence-events (Predicted)
no-recurrence-events (Actual),31,10
recurrence-events (Actual),9,8



The accuracy for the dataset is 0.6724137931034483.


Unnamed: 0,no-recurrence-events (Predicted),recurrence-events (Predicted)
no-recurrence-events (Actual),44,6
recurrence-events (Actual),15,8



The accuracy for the dataset is 0.7123287671232876.


Unnamed: 0,recurrence-events (Predicted),no-recurrence-events (Predicted)
recurrence-events (Actual),8,14
no-recurrence-events (Actual),4,28



The accuracy for the dataset is 0.6666666666666666.


Unnamed: 0,recurrence-events (Predicted),no-recurrence-events (Predicted)
recurrence-events (Actual),7,19
no-recurrence-events (Actual),6,37



The accuracy for the dataset is 0.6376811594202898.


Unnamed: 0,recurrence-events (Predicted),no-recurrence-events (Predicted)
recurrence-events (Actual),9,7
no-recurrence-events (Actual),8,35



The accuracy for the dataset is 0.7457627118644068.
2018S1-proj1_data/breast-cancer-dos.csv | Avg. Measure: 0.7128289274826617


Unnamed: 0,unacc (Predicted),acc (Predicted),vgood (Predicted),good (Predicted)
unacc (Actual),234,12,0,1
acc (Actual),26,55,0,4
vgood (Actual),0,9,6,0
good (Actual),0,10,1,4



The accuracy for the dataset is 0.8259668508287292.


Unnamed: 0,unacc (Predicted),acc (Predicted),vgood (Predicted),good (Predicted)
unacc (Actual),224,5,0,1
acc (Actual),31,54,0,1
vgood (Actual),0,5,10,0
good (Actual),0,8,0,6



The accuracy for the dataset is 0.8521739130434782.


Unnamed: 0,unacc (Predicted),acc (Predicted),vgood (Predicted),good (Predicted)
unacc (Actual),228,12,0,0
acc (Actual),21,59,0,1
vgood (Actual),0,5,5,0
good (Actual),0,13,0,3



The accuracy for the dataset is 0.8501440922190202.


Unnamed: 0,unacc (Predicted),acc (Predicted),vgood (Predicted),good (Predicted)
unacc (Actual),217,6,0,0
acc (Actual),27,53,0,1
vgood (Actual),0,9,7,0
good (Actual),0,11,0,2



The accuracy for the dataset is 0.8378378378378378.


Unnamed: 0,unacc (Predicted),acc (Predicted),vgood (Predicted),good (Predicted)
unacc (Actual),233,13,0,1
acc (Actual),15,50,0,3
vgood (Actual),0,5,8,1
good (Actual),0,10,0,5



The accuracy for the dataset is 0.8604651162790697.


Unnamed: 0,unacc (Predicted),acc (Predicted),vgood (Predicted),good (Predicted)
unacc (Actual),234,8,0,1
acc (Actual),29,47,0,2
vgood (Actual),0,5,4,0
good (Actual),0,11,2,4



The accuracy for the dataset is 0.8328530259365994.


Unnamed: 0,unacc (Predicted),acc (Predicted),vgood (Predicted),good (Predicted)
unacc (Actual),204,11,0,0
acc (Actual),21,54,0,2
vgood (Actual),0,11,5,0
good (Actual),0,8,0,4



The accuracy for the dataset is 0.834375.


Unnamed: 0,unacc (Predicted),acc (Predicted),vgood (Predicted),good (Predicted)
unacc (Actual),240,8,0,1
acc (Actual),11,60,0,1
vgood (Actual),0,5,4,0
good (Actual),0,14,1,3



The accuracy for the dataset is 0.882183908045977.


Unnamed: 0,unacc (Predicted),acc (Predicted),vgood (Predicted),good (Predicted)
unacc (Actual),223,7,0,0
acc (Actual),20,61,1,0
vgood (Actual),0,5,4,0
good (Actual),0,6,0,3



The accuracy for the dataset is 0.8818181818181818.


Unnamed: 0,unacc (Predicted),acc (Predicted),vgood (Predicted),good (Predicted)
unacc (Actual),224,9,0,0
acc (Actual),29,49,0,2
vgood (Actual),0,8,9,0
good (Actual),0,8,0,3



The accuracy for the dataset is 0.8357771260997068.
2018S1-proj1_data/car-dos.csv | Avg. Measure: 0.8493595052108599


Unnamed: 0,hypothyroid (Predicted),negative (Predicted)
hypothyroid (Actual),0,37
negative (Actual),0,599



The accuracy for the dataset is 0.9418238993710691.


Unnamed: 0,hypothyroid (Predicted),negative (Predicted)
hypothyroid (Actual),0,32
negative (Actual),0,571



The accuracy for the dataset is 0.9469320066334992.


Unnamed: 0,hypothyroid (Predicted),negative (Predicted)
hypothyroid (Actual),0,39
negative (Actual),0,584



The accuracy for the dataset is 0.9373996789727127.


Unnamed: 0,hypothyroid (Predicted),negative (Predicted)
hypothyroid (Actual),0,30
negative (Actual),0,601



The accuracy for the dataset is 0.9524564183835182.


Unnamed: 0,hypothyroid (Predicted),negative (Predicted)
hypothyroid (Actual),0,39
negative (Actual),0,605



The accuracy for the dataset is 0.9394409937888198.


Unnamed: 0,hypothyroid (Predicted),negative (Predicted)
hypothyroid (Actual),0,28
negative (Actual),0,604



The accuracy for the dataset is 0.9556962025316456.


Unnamed: 0,hypothyroid (Predicted),negative (Predicted)
hypothyroid (Actual),0,38
negative (Actual),0,567



The accuracy for the dataset is 0.9371900826446281.


Unnamed: 0,hypothyroid (Predicted),negative (Predicted)
hypothyroid (Actual),0,37
negative (Actual),0,608



The accuracy for the dataset is 0.9426356589147287.


Unnamed: 0,hypothyroid (Predicted),negative (Predicted)
hypothyroid (Actual),0,31
negative (Actual),0,616



The accuracy for the dataset is 0.9520865533230294.


Unnamed: 0,hypothyroid (Predicted),negative (Predicted)
hypothyroid (Actual),0,27
negative (Actual),0,586



The accuracy for the dataset is 0.9559543230016313.
2018S1-proj1_data/hypothyroid-dos.csv | Avg. Measure: 0.9461615817565281


Unnamed: 0,e (Predicted),p (Predicted)
e (Actual),835,4
p (Actual),8,751



The accuracy for the dataset is 0.9924906132665833.


Unnamed: 0,p (Predicted),e (Predicted)
p (Actual),738,10
e (Actual),7,871



The accuracy for the dataset is 0.9895448954489545.


Unnamed: 0,e (Predicted),p (Predicted)
e (Actual),823,4
p (Actual),14,728



The accuracy for the dataset is 0.988527724665392.


Unnamed: 0,e (Predicted),p (Predicted)
e (Actual),855,3
p (Actual),10,763



The accuracy for the dataset is 0.9920294297976702.


Unnamed: 0,e (Predicted),p (Predicted)
e (Actual),843,10
p (Actual),9,779



The accuracy for the dataset is 0.9884216940889702.


Unnamed: 0,e (Predicted),p (Predicted)
e (Actual),805,1
p (Actual),15,761



The accuracy for the dataset is 0.9898862199747156.


Unnamed: 0,e (Predicted),p (Predicted)
e (Actual),858,4
p (Actual),12,767



The accuracy for the dataset is 0.9902498476538696.


Unnamed: 0,e (Predicted),p (Predicted)
e (Actual),838,1
p (Actual),4,790



The accuracy for the dataset is 0.9969381506429884.


Unnamed: 0,e (Predicted),p (Predicted)
e (Actual),903,0
p (Actual),7,744



The accuracy for the dataset is 0.9957678355501813.


Unnamed: 0,e (Predicted),p (Predicted)
e (Actual),867,7
p (Actual),11,738



The accuracy for the dataset is 0.988909426987061.
2018S1-proj1_data/mushroom-dos.csv | Avg. Measure: 0.9912765838076384


In [17]:
def mainQuestion3():
    '''
    Used to answer question 3: average accuracy of mainSup with the holdout
    split first, then with training and testing on the same data.
    '''
    # (description, flag) per experiment; flag=True means "do not split"
    experiments = (
        ("Using holdout, averaged over 10 runs", False),
        ("Training in test data", True),
    )

    for position, (description, noSplit) in enumerate(experiments):
        if position:
            print("\n")  # blank separator between the two reports
        sampleExperiment(mainSup, description, noSplit, False)

# Run both question 3 experiments (holdout, then training on the test data)
mainQuestion3()

Using holdout, averaged over 10 runs
2018S1-proj1_data/breast-cancer-dos.csv | Avg. Measure: 0.7017455261163469
2018S1-proj1_data/car-dos.csv | Avg. Measure: 0.8509284569597065
2018S1-proj1_data/hypothyroid-dos.csv | Avg. Measure: 0.9549697123143657
2018S1-proj1_data/mushroom-dos.csv | Avg. Measure: 0.9905829663916904


Training in test data
2018S1-proj1_data/breast-cancer-dos.csv | Avg. Measure: 0.7552447552447551
2018S1-proj1_data/car-dos.csv | Avg. Measure: 0.8738425925925926
2018S1-proj1_data/hypothyroid-dos.csv | Avg. Measure: 0.9522605121719886
2018S1-proj1_data/mushroom-dos.csv | Avg. Measure: 0.991999015263417


In [23]:
def deltaQuestion6(df, predict):
    '''
    Calculates how far away the probabilistic estimate of the true class is
    from 1, averaged over all instances.
    Assumes probabilistic estimate of true class = highest probability of all
    classes due to class 'swapping'
    @param df = dataframe containing the data with the probability columns
    @param predict = predicted class per row of df, in row order (each entry
                     names the probability column to read for that row)
    @return average = mean of |1 - probability of the predicted class|;
            0.0 for an empty dataframe
    Raises ValueError when predict and df differ in length.
    '''
    rowCount = df.shape[0]
    if len(predict) != rowCount:
        raise ValueError('predict must hold one class per row of df')
    if rowCount == 0:
        return 0.0

    deltaSum = 0

    # Pair rows and predictions by position, so the result does not depend
    # on the index labels of df
    for (_, row), label in zip(df.iterrows(), predict):
        deltaSum += abs(1 - row[label])

    return deltaSum / rowCount

In [29]:
def mainUnsup(data, iteration):
    '''
    Execute unsupervised NB across one dataset, printing the confusion
    matrix, the accuracy and the delta average of every iteration.
    @param data = dataset used to run
    @param iteration = number of train/predict iterations
    @return accuracy = accuracy (per evaluateUnsup) of the last iteration,
            or None when no iteration was run
    '''
    df, unsupervisedDataFrame = preprocessUnsup(data)
    trueClass = df.iloc[:, -1].tolist()  # extract true classes

    # Distinct classes in order of first appearance, computed once: the
    # order is then the same on every run and in every call below
    classColumn = list(dict.fromkeys(trueClass))

    accuracyUnsup = None  # stays None when iteration <= 0

    # Each iteration retrains on the memberships written by the previous one
    for i in range(iteration):
        print("Iteration {}".format(i+1))
        prior, posterior = trainUnsup(classColumn, unsupervisedDataFrame)

        # predictUnsup overwrites the memberships, so keep the ones this
        # iteration was trained on for the delta computation
        oldUnsupervisedDf = unsupervisedDataFrame.copy(deep=True)
        predictedClasses = predictUnsup(classColumn, unsupervisedDataFrame, prior, posterior)

        confusionMatrix = createConfusionMatrix(trueClass, predictedClasses, classColumn)
        accuracyUnsup = evaluateUnsup(confusionMatrix)
        display(confusionMatrix)
        print("The accuracy is {}.".format(accuracyUnsup))

        # Delta
        deltaAverage = deltaQuestion6(oldUnsupervisedDf, predictedClasses)
        print("The delta average is {}".format(deltaAverage))
        print("\n\n")

    return accuracyUnsup

# Run the unsupervised classifier on DATASET4 for ITERATIONS rounds; acc
# receives the value returned by mainUnsup. Wrap the call in a loop over
# DATASETS to process every dataset instead.
acc = mainUnsup(DATASET4, ITERATIONS)

Iteration 1


Unnamed: 0,p (Predicted),e (Predicted)
p (Actual),2925,991
e (Actual),212,3996


The accuracy is 0.8519202363367799.
The delta average is 0.48891636387714826



Iteration 2


Unnamed: 0,p (Predicted),e (Predicted)
p (Actual),3100,816
e (Actual),247,3961


The accuracy is 0.869153126538651.
The delta average is 0.4636855356661047



Iteration 3


Unnamed: 0,p (Predicted),e (Predicted)
p (Actual),3100,816
e (Actual),259,3949


The accuracy is 0.8676760216642049.
The delta average is 0.2829090248292559



Iteration 4


Unnamed: 0,p (Predicted),e (Predicted)
p (Actual),3100,816
e (Actual),258,3950


The accuracy is 0.8677991137370753.
The delta average is 0.018887619137860905



Iteration 5


Unnamed: 0,p (Predicted),e (Predicted)
p (Actual),3100,816
e (Actual),256,3952


The accuracy is 0.8680452978828164.
The delta average is 0.0009313754870716503



Iteration 6


Unnamed: 0,p (Predicted),e (Predicted)
p (Actual),3100,816
e (Actual),256,3952


The accuracy is 0.8680452978828164.
The delta average is 7.802287897999256e-05



Iteration 7


Unnamed: 0,p (Predicted),e (Predicted)
p (Actual),3100,816
e (Actual),256,3952


The accuracy is 0.8680452978828164.
The delta average is 5.689489822847807e-06



Iteration 8


Unnamed: 0,p (Predicted),e (Predicted)
p (Actual),3100,816
e (Actual),256,3952


The accuracy is 0.8680452978828164.
The delta average is 6.808677195276554e-07



Iteration 9


Unnamed: 0,p (Predicted),e (Predicted)
p (Actual),3100,816
e (Actual),256,3952


The accuracy is 0.8680452978828164.
The delta average is 8.17713743951293e-08



Iteration 10


Unnamed: 0,p (Predicted),e (Predicted)
p (Actual),3100,816
e (Actual),256,3952


The accuracy is 0.8680452978828164.
The delta average is 9.81002583757774e-09



