# Project 1: What is labelled data worth to Naive Bayes?
---

## Initialisation

In [1]:
# Library
import pandas as pd
import numpy as np
import random
from IPython.display import display

In [2]:
# Data Path Constant
BREAST_CANCER = "2018S1-proj1_data/breast-cancer-dos.csv"
CAR = "2018S1-proj1_data/car-dos.csv"
HYPOTHYROID = "2018S1-proj1_data/hypothyroid-dos.csv"
MUSHROOM = "2018S1-proj1_data/mushroom-dos.csv"

# Column name for each data set
BREAST_CANCER_COLUMN = ["age", "menopause", "tumor-size", "inv-nodes", "node-caps", "deg-malig", "breast", "breast-quad", "irradiat", "class"]
CAR_COLUMN = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"]
HYPOTHYROID_COLUMN = ["sex", "on_thyroxine", "query_on_thyroxine", "on_antithyroid_medication", "thyroid_surgery", "query_hypothyroid", "query_hyperthyroid", "pregnant", "sick", "tumor", "lithium", "goitre", "TSH_measured", "T3_measured", "TT4_measured", "T4U_measured", "FTI_measured", "TBG_measured", "class"]
MUSHROOM_COLUMN = ["cap-shape", "cap-surface", "cap-color", "bruises", "odor", "gill-attachment", "gill-spacing", "gill-size", "gill-color", "stalk-shape", "stalk-root", "stalk-surface-above-ring", "stalk-surface-below-ring", "stalk-color-above-ring", "stalk-color-below-ring", "veil-type", "veil-color", "ring-number", "ring-type", "spore-print-color", "population", "habitat", "class"]

# Other Constant
PRIOR_INDEX = 0
POSTERIOR_INDEX = 1
EPSILON = 0.000001 # Epsilon smoothing
ITERATION = 10 # Number of iteration in unsupervised naive bayes

## Preprocess

In [3]:
'''
This function should open a data file in csv, and transform it into a usable format 
@param data = csv data that will be opened
@param columns = new column name for header
@param eliminate = eliminate the missing/ ? instances (recommended if there are only few missing instances)
@return df = clean pandas dataframe object
'''
def preprocess(data, columns, eliminate=True):
    # Read and add a header to the data frame
    df = pd.read_csv(data, header=None)
    df.columns = columns
    
    # If the parameter ignore is set to be false then we don't ignore
    if (eliminate):
        # Iterate through the dataframe and only append without missing value
        # Capture the index of one with the missing values
        for index, row in df.iterrows():
            for att in row:
                # If encounter missing values in the data, don't use that
                if (att == "?"):
                    df.drop(index, inplace=True)
                    break
    
    # Return the clean data
    return df

## Train Supervised

In [4]:
'''
This function should build a supervised NB model and return a dictionary count
@param train_data = training data that are used to create the supervised NB classifier
@param class_label = column name of the class that we want to classify (class) in this case
@return count_prior = dictionary describing prior count of the class in training data
@return count_posterior = dictionary of dictionaries posterior count
'''
def train_count_supervised(train_data, class_label):
    # Calculate prior (dictionary_prior)
    # Initiate python dictionary with the number of class in the training data as it's key
    count_prior = {}
    for unique_class in train_data[class_label].unique():
        count_prior[unique_class] = 0
    
    # Loop through the training data and get how many for every classes instance.
    # Now we have the count prior class that are used for prediction
    for index, row in train_data.iterrows():
        count_prior[row[class_label]] += 1
    
    # Calculate count posterior (dictionary_posterior), the data structure used are dictionary
    # of dictionary of dictionaries
    count_posterior = {}
    
    # Setup the dictionary component (initialise)
    column_name = list(train_data.columns)
    column_name.remove(class_label)
    for col in column_name:
        count_posterior[col] = {}
        for unique_class in train_data[class_label].unique():
            count_posterior[col][unique_class] = {}
            for unique_col in train_data[col].unique():
                count_posterior[col][unique_class][unique_col] = 0
    
    # Now use the training data to perform count calculation
    for index, row in train_data.iterrows():
        for col in column_name:
            count_posterior[col][row[class_label]][row[col]] += 1
            
    return((count_prior, count_posterior))

In [5]:
'''
This function should build supervised NB model and return a dictionary of probability
@param train_data = training data that are used to create the supervised NB classifier
@param class_label = column name of the class that we want to classify
@return probability_prior = dictionary describing prior probability of the class in training data,
@return probability_posterior = dictionary of dictionaries posterior probability
'''
def train_probability_supervised(train_data, class_label):
    (count_prior, count_posterior) = train_count_supervised(train_data, class_label)
    
    # Now calculate the probability of each instances, (i.e. 'Cough': {'flu': {'yes': 3, 'no': 0}, 'cold': {'yes': 1, 'no': 1}}
    # will have P(cough = yes | flu) = 3/3, P(cough = no | flu) = 0/3 and P(cough = yes | cold) = 1/2, P(cough = no | cold) = 1/2
    # First calculate the prior probability of the class P(c)
    probability_prior = {}
    sum_instance = sum(count_prior.values())
    for unique_class in train_data[class_label].unique():
        probability_prior[unique_class] = count_prior[unique_class] / sum_instance
        
        # Perform epsilon smoothing
        if (count_prior[unique_class] == 0):
            probability_prior[unique_class] = EPSILON
    
    # Calculate the posterior probability
    probability_posterior = count_posterior
    column_name = list(train_data.columns)
    column_name.remove(class_label)
                
    # Now calculate the posterior probability
    for col in column_name:
        for unique_class in train_data[class_label].unique():
            sum_instance = sum(probability_posterior[col][unique_class].values())
            for unique_col in train_data[col].unique():
                probability_posterior[col][unique_class][unique_col] /= sum_instance
                
                # Perform epsilon smoothing
                if (probability_posterior[col][unique_class][unique_col] == 0):
                    probability_posterior[col][unique_class][unique_col] = EPSILON
                
            
    return((probability_prior, probability_posterior))

## Predict Supervised

In [6]:
'''
This function should predict the class for a set of instances, based on a trained model 
@param test_data = data to be tested
@param train_data = data used for setup such as finding all possible classes
@param class_label = attribute that we want to classify using naive bayes
@param model = tuple consisting probability_prior and probability_posterior. 
Mainly use the train_probability_supervised instead of train_count_supervised
@return test_class = array containing the class predicted by the naive bayes classifier
'''
def predict_supervised(test_data, train_data, class_label, model):
    prior_probability = model[PRIOR_INDEX]
    posterior_probability = model[POSTERIOR_INDEX]
    test_class = [] # used to capture test result
    
    # Used for calculation purposes
    column_name = list(train_data.columns)
    column_name.remove(class_label)
    
    # Get the answer for every test instance
    for index, row in test_data.iterrows():
        # Initiate dictionary capturing the values calculated by naive bayes model
        test_value = {}
        for unique_class in train_data[class_label].unique():
            test_value[unique_class] = 0

        # Calculate for each class using the naive bayes model (log model for multiplication)
        for unique_class in train_data[class_label].unique():
            test_value[unique_class] = np.log(prior_probability[unique_class])
            for col in column_name:
                test_value[unique_class] += np.log(posterior_probability[col][unique_class][row[col]])
            
        # After calculating all of the possible class, we want to choose the maximum
        maximum_class = (train_data[class_label].unique())[0]
        maximum_value = test_value[maximum_class]
        for key, value in test_value.items():
            if (value > maximum_value):
                maximum_value = value
                maximum_class = key
    
        # Append result
        test_class.append(maximum_class)
    
    # Return the classifier for the class
    return test_class

# Evaluate Supervised

In [7]:
'''
This function should evaluate a set of predictions, in a supervised context.
@param true_test_result = array class actual
@param predicted_test_result = array class predicted
@return accuracy = (TP+TN) / (TP+TN+FP+FN)
'''
def evaluate_supervised(true_test_result, predicted_test_result):
    if (len(true_test_result) != len(predicted_test_result)):
        print("Error, different length.")
    else:
        # Measure accuracy
        correct = 0
        for i in range(len(true_test_result)):
            if (true_test_result[i] == predicted_test_result[i]):
                correct += 1
            
        accuracy = correct / len(true_test_result)
        
    return accuracy

In [8]:
'''
Create confusion matrix based on the actual and predicted class
@param true_test_result = array class actual
@param predicted_test_result = array class predicted
@param class_column = all possible classes in the dataset
@return confusion_df = confusion matrix (dataframe)
'''
def confusion_matrix_supervised(true_test_result, predicted_test_result, class_column):
    if (len(true_test_result) != len(predicted_test_result)):
        print("Error, different length.")
    else:
        # Create a pandas dataframe actual is the row, predicted is the column
        confusion_df = pd.DataFrame()
        
        for unique_class in class_column:
            confusion_df[unique_class] = [0 for i in range(len(class_column))]
        
        # Change index for df
        confusion_df.index = class_column
        
        # Calculate the confusion matrix
        for i in range(len(true_test_result)):
            confusion_df.loc[true_test_result[i], predicted_test_result[i]] += 1
            
        # Add actual and predicted description on the table to make it easier to see
        predicted_column = []
        for string in confusion_df.columns:
            string += " predicted"
            predicted_column.append(string.title())
       
        actual_row = []
        for string in class_column:
            string += " actual"
            actual_row.append(string.title())
        
        confusion_df.columns = predicted_column
        confusion_df.index = actual_row
        
        return confusion_df

## Main Program

In [9]:
# Using the breast cancer data
df_breast_cancer = preprocess(BREAST_CANCER, BREAST_CANCER_COLUMN, eliminate=True)
model_main = train_probability_supervised(df_breast_cancer, "class")
predicted_test_result = predict_supervised(df_breast_cancer, df_breast_cancer, "class", model_main)
confusion_df = confusion_matrix_supervised(list(df_breast_cancer["class"]), predicted_test_result, df_breast_cancer["class"].unique())
display(confusion_df)
print("The accuracy for breast cancer dataset is {}.".format(evaluate_supervised(list(df_breast_cancer["class"]), predicted_test_result)))

Unnamed: 0,Recurrence-Events Predicted,No-Recurrence-Events Predicted
Recurrence-Events Actual,48,33
No-Recurrence-Events Actual,31,165


The accuracy for breast cancer dataset is 0.7689530685920578.


In [10]:
# Using car data
df_car = preprocess(CAR, CAR_COLUMN, eliminate=True)
model_main = train_probability_supervised(df_car, "class")
predicted_test_result = predict_supervised(df_car, df_car, "class", model_main)
confusion_df = confusion_matrix_supervised(list(df_car["class"]), predicted_test_result, df_car["class"].unique())
display(confusion_df)
print("The accuracy for car dataset is {}.".format(evaluate_supervised(list(df_car["class"]), predicted_test_result)))

Unnamed: 0,Unacc Predicted,Acc Predicted,Vgood Predicted,Good Predicted
Unacc Actual,1161,47,0,2
Acc Actual,85,289,0,10
Vgood Actual,0,26,39,0
Good Actual,0,46,2,21


The accuracy for car dataset is 0.8738425925925926.


In [11]:
# Using the hypothyroid data
df_hypo = preprocess(HYPOTHYROID, HYPOTHYROID_COLUMN, eliminate=True)
model_main = train_probability_supervised(df_hypo, "class")
predicted_test_result = predict_supervised(df_hypo, df_hypo, "class", model_main)
confusion_df = confusion_matrix_supervised(list(df_hypo["class"]), predicted_test_result, df_hypo["class"].unique())
display(confusion_df)
print("The accuracy for hypothyroid dataset is {}.".format(evaluate_supervised(list(df_hypo["class"]), predicted_test_result)))

Unnamed: 0,Hypothyroid Predicted,Negative Predicted
Hypothyroid Actual,0,149
Negative Actual,0,2941


The accuracy for hypothyroid dataset is 0.9517799352750809.


In [12]:
# Using the mushroom data
df_mushroom = preprocess(MUSHROOM, MUSHROOM_COLUMN, eliminate=True)
model_main = train_probability_supervised(df_mushroom, "class")
predicted_test_result = predict_supervised(df_mushroom, df_mushroom, "class", model_main)
confusion_df = confusion_matrix_supervised(list(df_mushroom["class"]), predicted_test_result, df_mushroom["class"].unique())
display(confusion_df)
print("The accuracy for mushroom dataset is {}.".format(evaluate_supervised(list(df_mushroom["class"]), predicted_test_result)))

Unnamed: 0,P Predicted,E Predicted
P Actual,2156,0
E Actual,16,3472


The accuracy for mushroom dataset is 0.997165131112686.


## Train Unsupervised

In [13]:
'''
Initialise the dataset with random distribution
@param dataset = dataframe of the dataset
@param class_label = column name that we want to classify
@return unsupervised_dataset = dataset that we have added random distribution
'''
def initialise_unsupervised_naive_bayes(dataset, class_label):
        row_number = dataset.shape[0]
        class_number = len(dataset[class_label].unique())
        unsupervised_dataset = dataset.drop(["class"], axis=1)

        # sample from uniform distribution
        sample_matrix = np.zeros((row_number, class_number))
        for i in range(row_number):
            samples = np.random.uniform(0, 1, class_number)
            samples /= sum(samples)  # normalise so it sums to 1
            sample_matrix[i] = samples
        
        # Add a column to the dataset according to random distribution (initialisation phase)
        row_instance = unsupervised_dataset.shape[0]
        for unique_class in dataset[class_label].unique():
            unsupervised_dataset[unique_class] = [0 for i in range(row_instance)]
        
        matrix_counter = 0
        # Iterate through the matrix and assign to the dataframe
        for index, row in unsupervised_dataset.iterrows():
            unsupervised_dataset.loc[index, -class_number:] = sample_matrix[matrix_counter]
            matrix_counter += 1
        
        return(unsupervised_dataset)

In [14]:
'''
Initialise the dataset deterministically
@param dataset = dataframe of the dataset
@param class_label = column name that we want to classify
@return unsupervised_dataset = dataset that we have added random distribution
'''
def initialise_unsupervised_naive_bayes_deterministically(dataset, class_label):
        row_number = dataset.shape[0]
        class_column = dataset[class_label].unique()
        class_length = len(class_column)
        unsupervised_dataset = dataset.drop(["class"], axis=1)
        
        # Add a column to the dataset according to random distribution (initialisation phase)
        row_instance = unsupervised_dataset.shape[0]
        for unique_class in class_column:
            unsupervised_dataset[unique_class] = [0 for i in range(row_instance)]
        
        # Add deterministically
#         for index, row in unsupervised_dataset.iterrows():
#             random_number = random.randint(0, class_length-1)
#             unsupervised_dataset.set_value(index, class_column[random_number], 1)
    
        for index, row in unsupervised_dataset.iterrows():
            for unique_class in class_column:
                unsupervised_dataset.loc[index, unique_class] = 1/class_length
                
        return(unsupervised_dataset)

In [15]:
'''
This function should build an unsupervised NB model and return a count
@param class_column = possible class name (weak unsupervised model)
@param attribute_column = attributes that are used for calculation
@param dataset = data that are used to create the unsupervised NB classifier (format after running initialise_unsupervised_naive_bayes function)
@param class_label = column name of the class that we want to classify
@return count_prior = dictionary describing prior count of the class in training data
@return count_posterior = dictionary of dictionaries posterior count
'''
def train_count_unsupervised(class_column, attribute_column, dataset, class_label):
    # Calculate prior (dictionary_prior)
    # Initiate python dictionary with the number of class in the training data as it's key
    count_prior = {}
    for unique_class in class_column:
        count_prior[unique_class] = 0
    
    # Loop through the training data and sum the probability
    for index, row in dataset.iterrows():
        for unique_class in class_column:
            count_prior[unique_class] += row[unique_class]
    
    # Calculate count posterior (dictionary_posterior), the data structure used are dictionary
    # of dictionary of dictionaries
    count_posterior = {}
    
    # Setup the dictionary component
    for col in attribute_column:
        count_posterior[col] = {}
        for unique_class in class_column:
            count_posterior[col][unique_class] = {}
            for unique_col in dataset[col].unique():
                count_posterior[col][unique_class][unique_col] = 0
    
    # Now use the training data to perform calculation
    for index, row in dataset.iterrows():
        for col in attribute_column:
            for unique_class in class_column:
                count_posterior[col][unique_class][row[col]] += row[unique_class]
    print(count_posterior)
   
    return((count_prior, count_posterior))

In [16]:
'''
This function should build unsupervised NB model and return a probability
@param class_column = possible class name (weak unsupervised model)
@param attribute_column = attributes that are used for calculation
@param dataset = data that are used to create the unsupervised NB classifier (format after running initialise_unsupervised_naive_bayes function)
@param class_label = column name of the class that we want to classify
@return probability_prior = dictionary describing prior probability of the class in training data,
@return probability_posterior = dictionary of dictionaries posterior probability
'''
def train_probability_unsupervised(class_column, attribute_column, dataset, class_label):
    (count_prior, count_posterior) = train_count_unsupervised(class_column, attribute_column, dataset, class_label)
    
    # Now calculate the probability of each instances, (i.e. 'Cough': {'flu': {'yes': 0.3, 'no': 0}, 'cold': {'yes': 0.1, 'no': 0.1}}
    # will have P(cough = yes | flu) = 0.3/0.3, P(cough = no | flu) = 0/0.3 and P(cough = yes | cold) = 0.1/0.2, P(cough = no | cold) = 0.1/0.2
    
    # First calculate the prior probability of the class P(c)
    probability_prior = {}
    sum_instance = sum(count_prior.values())
    for unique_class in class_column:
        probability_prior[unique_class] = count_prior[unique_class] / sum_instance
        
        # Perform epsilon smoothing
        if (count_prior[unique_class] == 0.0):
            probability_prior[unique_class] = EPSILON
    
    # Calculate the posterior probability
    probability_posterior = count_posterior
                
    # Now calculate the posterior probability
    for col in attribute_column:
        for unique_class in class_column:
            sum_instance = sum(probability_posterior[col][unique_class].values())
            for unique_col in dataset[col].unique():
                probability_posterior[col][unique_class][unique_col] /= sum_instance
                
                # Perform epsilon smoothing
                if (probability_posterior[col][unique_class][unique_col] == 0):
                    probability_posterior[col][unique_class][unique_col] = EPSILON
    
    return((probability_prior, probability_posterior))

## Predict Unsupervised

In [17]:
'''
This function should predict the class for a set of instances, based on a trained model 
@param class_column = possible class name (weak unsupervised model)
@param attribute_column = attributes that are used for calculation
@param dataset = data that are used to calculate prediction
@param class_label = attribute that we want to classify using naive bayes
@param model = tuple consisting probability_prior and probability_posterior. 
@return test_class = the class predicted by the naive bayes classifier. The predict class will change the structure of dataset to be used for the next iteration.
'''
def predict_unsupervised(class_column, attribute_column, dataset, class_label, model):
    prior_probability = model[PRIOR_INDEX]
    posterior_probability = model[POSTERIOR_INDEX]
    test_class = [] # used to capture test result
    
    # Get the answer for every test instance
    for index, row in dataset.iterrows():
        # Initiate dictionary capturing the values calculated by naive bayes model
        test_value = {}
        for unique_class in class_column:
            test_value[unique_class] = 0

        # Calculate for each class using the naive bayes model (log model for multiplication)
        for unique_class in class_column:
            test_value[unique_class] = np.log(prior_probability[unique_class])
            for col in attribute_column:
                test_value[unique_class] += np.log(posterior_probability[col][unique_class][row[col]])
            
        # After calculating all of the possible class, we want to choose the maximum
        maximum_class = class_column[0]
        maximum_value = test_value[maximum_class]
        for key, value in test_value.items():
            if (value > maximum_value):
                maximum_value = value
                maximum_class = key
    
        # Append result
        test_class.append(maximum_class)
        # Change the dataset structure for the instance to prepare for the next iteration
        # First take the exponent of that to get the real probability calculation value
        for unique_class in class_column:
            test_value[unique_class] = np.exp(test_value[unique_class])
        
        # Calculate the new probability
        denominator_new = sum(test_value.values())
        
        for unique_class in class_column:
            dataset.loc[index, unique_class] = test_value[unique_class] / denominator_new
    
    # Return the classifier for the class
    return test_class

## Evaluate Unsupervised

In [18]:
'''
This function calculate the accuracy based on the confusion matrix that are given
@param confusion_matrix = the confusion matrix for unsupervised
@return accuracy = accuracy of the unsupervised
'''
def evaluate_unsupervised(confusion_matrix):
    total_instance = 0
    true_positive = 0
    columns = list(confusion_matrix.columns)
    rows = list(confusion_matrix.index)
    
    for col in columns:
        current_max = 0
        for row in rows:
            if (confusion_matrix.loc[row, col] > current_max):
                current_max = confusion_matrix.loc[row, col]
            total_instance += confusion_matrix.loc[row, col]
        
        true_positive += current_max
    
    return true_positive/total_instance

In [19]:
'''
This function create a confusion matrix for unsupervised
@param true_test_result = list displaying the real value of the test result
@param predicted_test_result = list displaying the prediction
@param class_column = all possible classes
'''
def confusion_matrix_unsupervised(true_test_result, predicted_test_result, class_column):
    if (len(true_test_result) != len(predicted_test_result)):
        print("Error, different length.")
    else:
        # Create a pandas dataframe actual is the row, predicted is the column
        confusion_df = pd.DataFrame()
        
        for unique_class in class_column:
            confusion_df[unique_class] = [0 for i in range(len(class_column))]
        
        # Change index for df
        confusion_df.index = class_column
        
        # Calculate the confusion matrix
        for i in range(len(true_test_result)):
            confusion_df.loc[true_test_result[i], predicted_test_result[i]] += 1
            
        # Add actual and predicted description on the table to make it easier to see
        predicted_column = []
        for string in confusion_df.columns:
            string += " predicted"
            predicted_column.append(string.title())
       
        actual_row = []
        for string in class_column:
            string += " actual"
            actual_row.append(string.title())
        
        confusion_df.columns = predicted_column
        confusion_df.index = actual_row
        
        return confusion_df

## Main Program

In [20]:
# Using the breast cancer dataset
print("Breast cancer dataset".title())
attribute_column = BREAST_CANCER_COLUMN
breast_df = preprocess(BREAST_CANCER, BREAST_CANCER_COLUMN)
print(breast_df["class"].value_counts())
unsupervised_df = initialise_unsupervised_naive_bayes_deterministically(breast_df, "class")
display(unsupervised_df)
for i in range(ITERATION):
    # Train and give prediction and calculate accuracy
    print("Iteration {}".format(i+1))
    model = train_probability_unsupervised(breast_df["class"].unique(), attribute_column[:-1], unsupervised_df, "class")
    predicted_test_result = predict_unsupervised(breast_df["class"].unique(), attribute_column[:-1], unsupervised_df, "class", model)
    confusion_matrix = confusion_matrix_unsupervised(list(breast_df["class"]), predicted_test_result, breast_df["class"].unique())
    display(confusion_matrix)
    display(unsupervised_df)
    print("The accuracy of breast cancer dataset based on confusion matrix is {}.".format(evaluate_unsupervised(confusion_matrix)))
    print("\n\n")

Breast Cancer Dataset
no-recurrence-events    196
recurrence-events        81
Name: class, dtype: int64


Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,recurrence-events,no-recurrence-events
0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,0.5,0.5
1,50-59,ge40,15-19,0-2,no,1,right,central,no,0.5,0.5
2,50-59,ge40,35-39,0-2,no,2,left,left_low,no,0.5,0.5
3,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,0.5,0.5
4,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,0.5,0.5
5,50-59,premeno,25-29,3-5,no,2,right,left_up,yes,0.5,0.5
6,50-59,ge40,40-44,0-2,no,3,left,left_up,no,0.5,0.5
7,40-49,premeno,10-14,0-2,no,2,left,left_up,no,0.5,0.5
8,40-49,premeno,0-4,0-2,no,2,right,right_low,no,0.5,0.5
9,40-49,ge40,40-44,15-17,yes,2,right,left_up,yes,0.5,0.5


Iteration 1
{'age': {'recurrence-events': {'40-49': 44.5, '50-59': 45.5, '60-69': 27.5, '30-39': 18.0, '70-79': 2.5, '20-29': 0.5}, 'no-recurrence-events': {'40-49': 44.5, '50-59': 45.5, '60-69': 27.5, '30-39': 18.0, '70-79': 2.5, '20-29': 0.5}}, 'menopause': {'recurrence-events': {'premeno': 74.5, 'ge40': 61.5, 'lt40': 2.5}, 'no-recurrence-events': {'premeno': 74.5, 'ge40': 61.5, 'lt40': 2.5}}, 'tumor-size': {'recurrence-events': {'15-19': 14.5, '35-39': 9.5, '30-34': 28.5, '25-29': 25.5, '40-44': 11.0, '10-14': 14.0, '0-4': 4.0, '20-24': 24.0, '45-49': 1.5, '50-54': 4.0, '5-9': 2.0}, 'no-recurrence-events': {'15-19': 14.5, '35-39': 9.5, '30-34': 28.5, '25-29': 25.5, '40-44': 11.0, '10-14': 14.0, '0-4': 4.0, '20-24': 24.0, '45-49': 1.5, '50-54': 4.0, '5-9': 2.0}}, 'inv-nodes': {'recurrence-events': {'0-2': 104.5, '3-5': 17.0, '15-17': 3.0, '6-8': 8.5, '9-11': 3.5, '24-26': 0.5, '12-14': 1.5}, 'no-recurrence-events': {'0-2': 104.5, '3-5': 17.0, '15-17': 3.0, '6-8': 8.5, '9-11': 3.5, '2

Unnamed: 0,Recurrence-Events Predicted,No-Recurrence-Events Predicted
Recurrence-Events Actual,81,0
No-Recurrence-Events Actual,196,0


Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,recurrence-events,no-recurrence-events
0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,0.5,0.5
1,50-59,ge40,15-19,0-2,no,1,right,central,no,0.5,0.5
2,50-59,ge40,35-39,0-2,no,2,left,left_low,no,0.5,0.5
3,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,0.5,0.5
4,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,0.5,0.5
5,50-59,premeno,25-29,3-5,no,2,right,left_up,yes,0.5,0.5
6,50-59,ge40,40-44,0-2,no,3,left,left_up,no,0.5,0.5
7,40-49,premeno,10-14,0-2,no,2,left,left_up,no,0.5,0.5
8,40-49,premeno,0-4,0-2,no,2,right,right_low,no,0.5,0.5
9,40-49,ge40,40-44,15-17,yes,2,right,left_up,yes,0.5,0.5


The accuracy of breast cancer dataset based on confusion matrix is 0.7075812274368231.



Iteration 2
{'age': {'recurrence-events': {'40-49': 44.5, '50-59': 45.5, '60-69': 27.5, '30-39': 18.0, '70-79': 2.5, '20-29': 0.5}, 'no-recurrence-events': {'40-49': 44.5, '50-59': 45.5, '60-69': 27.5, '30-39': 18.0, '70-79': 2.5, '20-29': 0.5}}, 'menopause': {'recurrence-events': {'premeno': 74.5, 'ge40': 61.5, 'lt40': 2.5}, 'no-recurrence-events': {'premeno': 74.5, 'ge40': 61.5, 'lt40': 2.5}}, 'tumor-size': {'recurrence-events': {'15-19': 14.5, '35-39': 9.5, '30-34': 28.5, '25-29': 25.5, '40-44': 11.0, '10-14': 14.0, '0-4': 4.0, '20-24': 24.0, '45-49': 1.5, '50-54': 4.0, '5-9': 2.0}, 'no-recurrence-events': {'15-19': 14.5, '35-39': 9.5, '30-34': 28.5, '25-29': 25.5, '40-44': 11.0, '10-14': 14.0, '0-4': 4.0, '20-24': 24.0, '45-49': 1.5, '50-54': 4.0, '5-9': 2.0}}, 'inv-nodes': {'recurrence-events': {'0-2': 104.5, '3-5': 17.0, '15-17': 3.0, '6-8': 8.5, '9-11': 3.5, '24-26': 0.5, '12-14': 1.5}, 'no

Unnamed: 0,Recurrence-Events Predicted,No-Recurrence-Events Predicted
Recurrence-Events Actual,81,0
No-Recurrence-Events Actual,196,0


Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,recurrence-events,no-recurrence-events
0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,0.5,0.5
1,50-59,ge40,15-19,0-2,no,1,right,central,no,0.5,0.5
2,50-59,ge40,35-39,0-2,no,2,left,left_low,no,0.5,0.5
3,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,0.5,0.5
4,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,0.5,0.5
5,50-59,premeno,25-29,3-5,no,2,right,left_up,yes,0.5,0.5
6,50-59,ge40,40-44,0-2,no,3,left,left_up,no,0.5,0.5
7,40-49,premeno,10-14,0-2,no,2,left,left_up,no,0.5,0.5
8,40-49,premeno,0-4,0-2,no,2,right,right_low,no,0.5,0.5
9,40-49,ge40,40-44,15-17,yes,2,right,left_up,yes,0.5,0.5


The accuracy of breast cancer dataset based on confusion matrix is 0.7075812274368231.



Iteration 3
{'age': {'recurrence-events': {'40-49': 44.5, '50-59': 45.5, '60-69': 27.5, '30-39': 18.0, '70-79': 2.5, '20-29': 0.5}, 'no-recurrence-events': {'40-49': 44.5, '50-59': 45.5, '60-69': 27.5, '30-39': 18.0, '70-79': 2.5, '20-29': 0.5}}, 'menopause': {'recurrence-events': {'premeno': 74.5, 'ge40': 61.5, 'lt40': 2.5}, 'no-recurrence-events': {'premeno': 74.5, 'ge40': 61.5, 'lt40': 2.5}}, 'tumor-size': {'recurrence-events': {'15-19': 14.5, '35-39': 9.5, '30-34': 28.5, '25-29': 25.5, '40-44': 11.0, '10-14': 14.0, '0-4': 4.0, '20-24': 24.0, '45-49': 1.5, '50-54': 4.0, '5-9': 2.0}, 'no-recurrence-events': {'15-19': 14.5, '35-39': 9.5, '30-34': 28.5, '25-29': 25.5, '40-44': 11.0, '10-14': 14.0, '0-4': 4.0, '20-24': 24.0, '45-49': 1.5, '50-54': 4.0, '5-9': 2.0}}, 'inv-nodes': {'recurrence-events': {'0-2': 104.5, '3-5': 17.0, '15-17': 3.0, '6-8': 8.5, '9-11': 3.5, '24-26': 0.5, '12-14': 1.5}, 'no

Unnamed: 0,Recurrence-Events Predicted,No-Recurrence-Events Predicted
Recurrence-Events Actual,81,0
No-Recurrence-Events Actual,196,0


Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,recurrence-events,no-recurrence-events
0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,0.5,0.5
1,50-59,ge40,15-19,0-2,no,1,right,central,no,0.5,0.5
2,50-59,ge40,35-39,0-2,no,2,left,left_low,no,0.5,0.5
3,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,0.5,0.5
4,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,0.5,0.5
5,50-59,premeno,25-29,3-5,no,2,right,left_up,yes,0.5,0.5
6,50-59,ge40,40-44,0-2,no,3,left,left_up,no,0.5,0.5
7,40-49,premeno,10-14,0-2,no,2,left,left_up,no,0.5,0.5
8,40-49,premeno,0-4,0-2,no,2,right,right_low,no,0.5,0.5
9,40-49,ge40,40-44,15-17,yes,2,right,left_up,yes,0.5,0.5


The accuracy of breast cancer dataset based on confusion matrix is 0.7075812274368231.



Iteration 4
{'age': {'recurrence-events': {'40-49': 44.5, '50-59': 45.5, '60-69': 27.5, '30-39': 18.0, '70-79': 2.5, '20-29': 0.5}, 'no-recurrence-events': {'40-49': 44.5, '50-59': 45.5, '60-69': 27.5, '30-39': 18.0, '70-79': 2.5, '20-29': 0.5}}, 'menopause': {'recurrence-events': {'premeno': 74.5, 'ge40': 61.5, 'lt40': 2.5}, 'no-recurrence-events': {'premeno': 74.5, 'ge40': 61.5, 'lt40': 2.5}}, 'tumor-size': {'recurrence-events': {'15-19': 14.5, '35-39': 9.5, '30-34': 28.5, '25-29': 25.5, '40-44': 11.0, '10-14': 14.0, '0-4': 4.0, '20-24': 24.0, '45-49': 1.5, '50-54': 4.0, '5-9': 2.0}, 'no-recurrence-events': {'15-19': 14.5, '35-39': 9.5, '30-34': 28.5, '25-29': 25.5, '40-44': 11.0, '10-14': 14.0, '0-4': 4.0, '20-24': 24.0, '45-49': 1.5, '50-54': 4.0, '5-9': 2.0}}, 'inv-nodes': {'recurrence-events': {'0-2': 104.5, '3-5': 17.0, '15-17': 3.0, '6-8': 8.5, '9-11': 3.5, '24-26': 0.5, '12-14': 1.5}, 'no

Unnamed: 0,Recurrence-Events Predicted,No-Recurrence-Events Predicted
Recurrence-Events Actual,81,0
No-Recurrence-Events Actual,196,0


Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,recurrence-events,no-recurrence-events
0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,0.5,0.5
1,50-59,ge40,15-19,0-2,no,1,right,central,no,0.5,0.5
2,50-59,ge40,35-39,0-2,no,2,left,left_low,no,0.5,0.5
3,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,0.5,0.5
4,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,0.5,0.5
5,50-59,premeno,25-29,3-5,no,2,right,left_up,yes,0.5,0.5
6,50-59,ge40,40-44,0-2,no,3,left,left_up,no,0.5,0.5
7,40-49,premeno,10-14,0-2,no,2,left,left_up,no,0.5,0.5
8,40-49,premeno,0-4,0-2,no,2,right,right_low,no,0.5,0.5
9,40-49,ge40,40-44,15-17,yes,2,right,left_up,yes,0.5,0.5


The accuracy of breast cancer dataset based on confusion matrix is 0.7075812274368231.



Iteration 5
{'age': {'recurrence-events': {'40-49': 44.5, '50-59': 45.5, '60-69': 27.5, '30-39': 18.0, '70-79': 2.5, '20-29': 0.5}, 'no-recurrence-events': {'40-49': 44.5, '50-59': 45.5, '60-69': 27.5, '30-39': 18.0, '70-79': 2.5, '20-29': 0.5}}, 'menopause': {'recurrence-events': {'premeno': 74.5, 'ge40': 61.5, 'lt40': 2.5}, 'no-recurrence-events': {'premeno': 74.5, 'ge40': 61.5, 'lt40': 2.5}}, 'tumor-size': {'recurrence-events': {'15-19': 14.5, '35-39': 9.5, '30-34': 28.5, '25-29': 25.5, '40-44': 11.0, '10-14': 14.0, '0-4': 4.0, '20-24': 24.0, '45-49': 1.5, '50-54': 4.0, '5-9': 2.0}, 'no-recurrence-events': {'15-19': 14.5, '35-39': 9.5, '30-34': 28.5, '25-29': 25.5, '40-44': 11.0, '10-14': 14.0, '0-4': 4.0, '20-24': 24.0, '45-49': 1.5, '50-54': 4.0, '5-9': 2.0}}, 'inv-nodes': {'recurrence-events': {'0-2': 104.5, '3-5': 17.0, '15-17': 3.0, '6-8': 8.5, '9-11': 3.5, '24-26': 0.5, '12-14': 1.5}, 'no

Unnamed: 0,Recurrence-Events Predicted,No-Recurrence-Events Predicted
Recurrence-Events Actual,81,0
No-Recurrence-Events Actual,196,0


Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,recurrence-events,no-recurrence-events
0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,0.5,0.5
1,50-59,ge40,15-19,0-2,no,1,right,central,no,0.5,0.5
2,50-59,ge40,35-39,0-2,no,2,left,left_low,no,0.5,0.5
3,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,0.5,0.5
4,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,0.5,0.5
5,50-59,premeno,25-29,3-5,no,2,right,left_up,yes,0.5,0.5
6,50-59,ge40,40-44,0-2,no,3,left,left_up,no,0.5,0.5
7,40-49,premeno,10-14,0-2,no,2,left,left_up,no,0.5,0.5
8,40-49,premeno,0-4,0-2,no,2,right,right_low,no,0.5,0.5
9,40-49,ge40,40-44,15-17,yes,2,right,left_up,yes,0.5,0.5


The accuracy of breast cancer dataset based on confusion matrix is 0.7075812274368231.



Iteration 6
{'age': {'recurrence-events': {'40-49': 44.5, '50-59': 45.5, '60-69': 27.5, '30-39': 18.0, '70-79': 2.5, '20-29': 0.5}, 'no-recurrence-events': {'40-49': 44.5, '50-59': 45.5, '60-69': 27.5, '30-39': 18.0, '70-79': 2.5, '20-29': 0.5}}, 'menopause': {'recurrence-events': {'premeno': 74.5, 'ge40': 61.5, 'lt40': 2.5}, 'no-recurrence-events': {'premeno': 74.5, 'ge40': 61.5, 'lt40': 2.5}}, 'tumor-size': {'recurrence-events': {'15-19': 14.5, '35-39': 9.5, '30-34': 28.5, '25-29': 25.5, '40-44': 11.0, '10-14': 14.0, '0-4': 4.0, '20-24': 24.0, '45-49': 1.5, '50-54': 4.0, '5-9': 2.0}, 'no-recurrence-events': {'15-19': 14.5, '35-39': 9.5, '30-34': 28.5, '25-29': 25.5, '40-44': 11.0, '10-14': 14.0, '0-4': 4.0, '20-24': 24.0, '45-49': 1.5, '50-54': 4.0, '5-9': 2.0}}, 'inv-nodes': {'recurrence-events': {'0-2': 104.5, '3-5': 17.0, '15-17': 3.0, '6-8': 8.5, '9-11': 3.5, '24-26': 0.5, '12-14': 1.5}, 'no

Unnamed: 0,Recurrence-Events Predicted,No-Recurrence-Events Predicted
Recurrence-Events Actual,81,0
No-Recurrence-Events Actual,196,0


Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,recurrence-events,no-recurrence-events
0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,0.5,0.5
1,50-59,ge40,15-19,0-2,no,1,right,central,no,0.5,0.5
2,50-59,ge40,35-39,0-2,no,2,left,left_low,no,0.5,0.5
3,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,0.5,0.5
4,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,0.5,0.5
5,50-59,premeno,25-29,3-5,no,2,right,left_up,yes,0.5,0.5
6,50-59,ge40,40-44,0-2,no,3,left,left_up,no,0.5,0.5
7,40-49,premeno,10-14,0-2,no,2,left,left_up,no,0.5,0.5
8,40-49,premeno,0-4,0-2,no,2,right,right_low,no,0.5,0.5
9,40-49,ge40,40-44,15-17,yes,2,right,left_up,yes,0.5,0.5


The accuracy of breast cancer dataset based on confusion matrix is 0.7075812274368231.



Iteration 7
{'age': {'recurrence-events': {'40-49': 44.5, '50-59': 45.5, '60-69': 27.5, '30-39': 18.0, '70-79': 2.5, '20-29': 0.5}, 'no-recurrence-events': {'40-49': 44.5, '50-59': 45.5, '60-69': 27.5, '30-39': 18.0, '70-79': 2.5, '20-29': 0.5}}, 'menopause': {'recurrence-events': {'premeno': 74.5, 'ge40': 61.5, 'lt40': 2.5}, 'no-recurrence-events': {'premeno': 74.5, 'ge40': 61.5, 'lt40': 2.5}}, 'tumor-size': {'recurrence-events': {'15-19': 14.5, '35-39': 9.5, '30-34': 28.5, '25-29': 25.5, '40-44': 11.0, '10-14': 14.0, '0-4': 4.0, '20-24': 24.0, '45-49': 1.5, '50-54': 4.0, '5-9': 2.0}, 'no-recurrence-events': {'15-19': 14.5, '35-39': 9.5, '30-34': 28.5, '25-29': 25.5, '40-44': 11.0, '10-14': 14.0, '0-4': 4.0, '20-24': 24.0, '45-49': 1.5, '50-54': 4.0, '5-9': 2.0}}, 'inv-nodes': {'recurrence-events': {'0-2': 104.5, '3-5': 17.0, '15-17': 3.0, '6-8': 8.5, '9-11': 3.5, '24-26': 0.5, '12-14': 1.5}, 'no

Unnamed: 0,Recurrence-Events Predicted,No-Recurrence-Events Predicted
Recurrence-Events Actual,81,0
No-Recurrence-Events Actual,196,0


Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,recurrence-events,no-recurrence-events
0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,0.5,0.5
1,50-59,ge40,15-19,0-2,no,1,right,central,no,0.5,0.5
2,50-59,ge40,35-39,0-2,no,2,left,left_low,no,0.5,0.5
3,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,0.5,0.5
4,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,0.5,0.5
5,50-59,premeno,25-29,3-5,no,2,right,left_up,yes,0.5,0.5
6,50-59,ge40,40-44,0-2,no,3,left,left_up,no,0.5,0.5
7,40-49,premeno,10-14,0-2,no,2,left,left_up,no,0.5,0.5
8,40-49,premeno,0-4,0-2,no,2,right,right_low,no,0.5,0.5
9,40-49,ge40,40-44,15-17,yes,2,right,left_up,yes,0.5,0.5


The accuracy of breast cancer dataset based on confusion matrix is 0.7075812274368231.



Iteration 8
{'age': {'recurrence-events': {'40-49': 44.5, '50-59': 45.5, '60-69': 27.5, '30-39': 18.0, '70-79': 2.5, '20-29': 0.5}, 'no-recurrence-events': {'40-49': 44.5, '50-59': 45.5, '60-69': 27.5, '30-39': 18.0, '70-79': 2.5, '20-29': 0.5}}, 'menopause': {'recurrence-events': {'premeno': 74.5, 'ge40': 61.5, 'lt40': 2.5}, 'no-recurrence-events': {'premeno': 74.5, 'ge40': 61.5, 'lt40': 2.5}}, 'tumor-size': {'recurrence-events': {'15-19': 14.5, '35-39': 9.5, '30-34': 28.5, '25-29': 25.5, '40-44': 11.0, '10-14': 14.0, '0-4': 4.0, '20-24': 24.0, '45-49': 1.5, '50-54': 4.0, '5-9': 2.0}, 'no-recurrence-events': {'15-19': 14.5, '35-39': 9.5, '30-34': 28.5, '25-29': 25.5, '40-44': 11.0, '10-14': 14.0, '0-4': 4.0, '20-24': 24.0, '45-49': 1.5, '50-54': 4.0, '5-9': 2.0}}, 'inv-nodes': {'recurrence-events': {'0-2': 104.5, '3-5': 17.0, '15-17': 3.0, '6-8': 8.5, '9-11': 3.5, '24-26': 0.5, '12-14': 1.5}, 'no

Unnamed: 0,Recurrence-Events Predicted,No-Recurrence-Events Predicted
Recurrence-Events Actual,81,0
No-Recurrence-Events Actual,196,0


Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,recurrence-events,no-recurrence-events
0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,0.5,0.5
1,50-59,ge40,15-19,0-2,no,1,right,central,no,0.5,0.5
2,50-59,ge40,35-39,0-2,no,2,left,left_low,no,0.5,0.5
3,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,0.5,0.5
4,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,0.5,0.5
5,50-59,premeno,25-29,3-5,no,2,right,left_up,yes,0.5,0.5
6,50-59,ge40,40-44,0-2,no,3,left,left_up,no,0.5,0.5
7,40-49,premeno,10-14,0-2,no,2,left,left_up,no,0.5,0.5
8,40-49,premeno,0-4,0-2,no,2,right,right_low,no,0.5,0.5
9,40-49,ge40,40-44,15-17,yes,2,right,left_up,yes,0.5,0.5


The accuracy of breast cancer dataset based on confusion matrix is 0.7075812274368231.



Iteration 9
{'age': {'recurrence-events': {'40-49': 44.5, '50-59': 45.5, '60-69': 27.5, '30-39': 18.0, '70-79': 2.5, '20-29': 0.5}, 'no-recurrence-events': {'40-49': 44.5, '50-59': 45.5, '60-69': 27.5, '30-39': 18.0, '70-79': 2.5, '20-29': 0.5}}, 'menopause': {'recurrence-events': {'premeno': 74.5, 'ge40': 61.5, 'lt40': 2.5}, 'no-recurrence-events': {'premeno': 74.5, 'ge40': 61.5, 'lt40': 2.5}}, 'tumor-size': {'recurrence-events': {'15-19': 14.5, '35-39': 9.5, '30-34': 28.5, '25-29': 25.5, '40-44': 11.0, '10-14': 14.0, '0-4': 4.0, '20-24': 24.0, '45-49': 1.5, '50-54': 4.0, '5-9': 2.0}, 'no-recurrence-events': {'15-19': 14.5, '35-39': 9.5, '30-34': 28.5, '25-29': 25.5, '40-44': 11.0, '10-14': 14.0, '0-4': 4.0, '20-24': 24.0, '45-49': 1.5, '50-54': 4.0, '5-9': 2.0}}, 'inv-nodes': {'recurrence-events': {'0-2': 104.5, '3-5': 17.0, '15-17': 3.0, '6-8': 8.5, '9-11': 3.5, '24-26': 0.5, '12-14': 1.5}, 'no

Unnamed: 0,Recurrence-Events Predicted,No-Recurrence-Events Predicted
Recurrence-Events Actual,81,0
No-Recurrence-Events Actual,196,0


Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,recurrence-events,no-recurrence-events
0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,0.5,0.5
1,50-59,ge40,15-19,0-2,no,1,right,central,no,0.5,0.5
2,50-59,ge40,35-39,0-2,no,2,left,left_low,no,0.5,0.5
3,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,0.5,0.5
4,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,0.5,0.5
5,50-59,premeno,25-29,3-5,no,2,right,left_up,yes,0.5,0.5
6,50-59,ge40,40-44,0-2,no,3,left,left_up,no,0.5,0.5
7,40-49,premeno,10-14,0-2,no,2,left,left_up,no,0.5,0.5
8,40-49,premeno,0-4,0-2,no,2,right,right_low,no,0.5,0.5
9,40-49,ge40,40-44,15-17,yes,2,right,left_up,yes,0.5,0.5


The accuracy of breast cancer dataset based on confusion matrix is 0.7075812274368231.



Iteration 10
{'age': {'recurrence-events': {'40-49': 44.5, '50-59': 45.5, '60-69': 27.5, '30-39': 18.0, '70-79': 2.5, '20-29': 0.5}, 'no-recurrence-events': {'40-49': 44.5, '50-59': 45.5, '60-69': 27.5, '30-39': 18.0, '70-79': 2.5, '20-29': 0.5}}, 'menopause': {'recurrence-events': {'premeno': 74.5, 'ge40': 61.5, 'lt40': 2.5}, 'no-recurrence-events': {'premeno': 74.5, 'ge40': 61.5, 'lt40': 2.5}}, 'tumor-size': {'recurrence-events': {'15-19': 14.5, '35-39': 9.5, '30-34': 28.5, '25-29': 25.5, '40-44': 11.0, '10-14': 14.0, '0-4': 4.0, '20-24': 24.0, '45-49': 1.5, '50-54': 4.0, '5-9': 2.0}, 'no-recurrence-events': {'15-19': 14.5, '35-39': 9.5, '30-34': 28.5, '25-29': 25.5, '40-44': 11.0, '10-14': 14.0, '0-4': 4.0, '20-24': 24.0, '45-49': 1.5, '50-54': 4.0, '5-9': 2.0}}, 'inv-nodes': {'recurrence-events': {'0-2': 104.5, '3-5': 17.0, '15-17': 3.0, '6-8': 8.5, '9-11': 3.5, '24-26': 0.5, '12-14': 1.5}, 'n

Unnamed: 0,Recurrence-Events Predicted,No-Recurrence-Events Predicted
Recurrence-Events Actual,81,0
No-Recurrence-Events Actual,196,0


Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,recurrence-events,no-recurrence-events
0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,0.5,0.5
1,50-59,ge40,15-19,0-2,no,1,right,central,no,0.5,0.5
2,50-59,ge40,35-39,0-2,no,2,left,left_low,no,0.5,0.5
3,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,0.5,0.5
4,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,0.5,0.5
5,50-59,premeno,25-29,3-5,no,2,right,left_up,yes,0.5,0.5
6,50-59,ge40,40-44,0-2,no,3,left,left_up,no,0.5,0.5
7,40-49,premeno,10-14,0-2,no,2,left,left_up,no,0.5,0.5
8,40-49,premeno,0-4,0-2,no,2,right,right_low,no,0.5,0.5
9,40-49,ge40,40-44,15-17,yes,2,right,left_up,yes,0.5,0.5


The accuracy of breast cancer dataset based on confusion matrix is 0.7075812274368231.





In [21]:
# Using the car dataset
print("car dataset".title())
attribute_column = CAR_COLUMN
car_df = preprocess(CAR, CAR_COLUMN)
unsupervised_df = initialise_unsupervised_naive_bayes_deterministically(car_df, "class")
display(unsupervised_df)
for i in range(ITERATION):
    # Train and give prediction and calculate accuracy
    print("Iteration {}".format(i+1))
    model = train_probability_unsupervised(car_df["class"].unique(), attribute_column[:-1], unsupervised_df, "class")
    predicted_test_result = predict_unsupervised(car_df["class"].unique(), attribute_column[:-1], unsupervised_df, "class", model)
    confusion_matrix = confusion_matrix_unsupervised(list(car_df["class"]), predicted_test_result, car_df["class"].unique())
    display(confusion_matrix)
    display(unsupervised_df)
    print("The accuracy of car dataset based on confusion matrix is {}.".format(evaluate_unsupervised(confusion_matrix)))
    print("\n\n")

Car Dataset


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,unacc,acc,vgood,good
0,vhigh,vhigh,2,2,small,low,0.25,0.25,0.25,0.25
1,vhigh,vhigh,2,2,small,med,0.25,0.25,0.25,0.25
2,vhigh,vhigh,2,2,small,high,0.25,0.25,0.25,0.25
3,vhigh,vhigh,2,2,med,low,0.25,0.25,0.25,0.25
4,vhigh,vhigh,2,2,med,med,0.25,0.25,0.25,0.25
5,vhigh,vhigh,2,2,med,high,0.25,0.25,0.25,0.25
6,vhigh,vhigh,2,2,big,low,0.25,0.25,0.25,0.25
7,vhigh,vhigh,2,2,big,med,0.25,0.25,0.25,0.25
8,vhigh,vhigh,2,2,big,high,0.25,0.25,0.25,0.25
9,vhigh,vhigh,2,4,small,low,0.25,0.25,0.25,0.25


Iteration 1
{'buying': {'unacc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'acc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'vgood': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'good': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}}, 'maint': {'unacc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'acc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'vgood': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'good': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}}, 'doors': {'unacc': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'acc': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'vgood': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'good': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}}, 'persons': {'unacc': {'2': 144.0, '4': 144.0, 'more': 144.0}, 'acc': {'2': 144.0, '4': 144.0, 'more': 144.0}, 'vgood': {'2': 144.0, '4': 144.0, 'more'

Unnamed: 0,Unacc Predicted,Acc Predicted,Vgood Predicted,Good Predicted
Unacc Actual,1210,0,0,0
Acc Actual,384,0,0,0
Vgood Actual,65,0,0,0
Good Actual,69,0,0,0


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,unacc,acc,vgood,good
0,vhigh,vhigh,2,2,small,low,0.25,0.25,0.25,0.25
1,vhigh,vhigh,2,2,small,med,0.25,0.25,0.25,0.25
2,vhigh,vhigh,2,2,small,high,0.25,0.25,0.25,0.25
3,vhigh,vhigh,2,2,med,low,0.25,0.25,0.25,0.25
4,vhigh,vhigh,2,2,med,med,0.25,0.25,0.25,0.25
5,vhigh,vhigh,2,2,med,high,0.25,0.25,0.25,0.25
6,vhigh,vhigh,2,2,big,low,0.25,0.25,0.25,0.25
7,vhigh,vhigh,2,2,big,med,0.25,0.25,0.25,0.25
8,vhigh,vhigh,2,2,big,high,0.25,0.25,0.25,0.25
9,vhigh,vhigh,2,4,small,low,0.25,0.25,0.25,0.25


The accuracy of car dataset based on confusion matrix is 0.7002314814814815.



Iteration 2
{'buying': {'unacc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'acc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'vgood': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'good': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}}, 'maint': {'unacc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'acc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'vgood': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'good': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}}, 'doors': {'unacc': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'acc': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'vgood': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'good': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}}, 'persons': {'unacc': {'2': 144.0, '4': 144.0, 'more': 144.0}, 'acc': {

Unnamed: 0,Unacc Predicted,Acc Predicted,Vgood Predicted,Good Predicted
Unacc Actual,1210,0,0,0
Acc Actual,384,0,0,0
Vgood Actual,65,0,0,0
Good Actual,69,0,0,0


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,unacc,acc,vgood,good
0,vhigh,vhigh,2,2,small,low,0.25,0.25,0.25,0.25
1,vhigh,vhigh,2,2,small,med,0.25,0.25,0.25,0.25
2,vhigh,vhigh,2,2,small,high,0.25,0.25,0.25,0.25
3,vhigh,vhigh,2,2,med,low,0.25,0.25,0.25,0.25
4,vhigh,vhigh,2,2,med,med,0.25,0.25,0.25,0.25
5,vhigh,vhigh,2,2,med,high,0.25,0.25,0.25,0.25
6,vhigh,vhigh,2,2,big,low,0.25,0.25,0.25,0.25
7,vhigh,vhigh,2,2,big,med,0.25,0.25,0.25,0.25
8,vhigh,vhigh,2,2,big,high,0.25,0.25,0.25,0.25
9,vhigh,vhigh,2,4,small,low,0.25,0.25,0.25,0.25


The accuracy of car dataset based on confusion matrix is 0.7002314814814815.



Iteration 3
{'buying': {'unacc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'acc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'vgood': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'good': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}}, 'maint': {'unacc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'acc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'vgood': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'good': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}}, 'doors': {'unacc': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'acc': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'vgood': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'good': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}}, 'persons': {'unacc': {'2': 144.0, '4': 144.0, 'more': 144.0}, 'acc': {

Unnamed: 0,Unacc Predicted,Acc Predicted,Vgood Predicted,Good Predicted
Unacc Actual,1210,0,0,0
Acc Actual,384,0,0,0
Vgood Actual,65,0,0,0
Good Actual,69,0,0,0


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,unacc,acc,vgood,good
0,vhigh,vhigh,2,2,small,low,0.25,0.25,0.25,0.25
1,vhigh,vhigh,2,2,small,med,0.25,0.25,0.25,0.25
2,vhigh,vhigh,2,2,small,high,0.25,0.25,0.25,0.25
3,vhigh,vhigh,2,2,med,low,0.25,0.25,0.25,0.25
4,vhigh,vhigh,2,2,med,med,0.25,0.25,0.25,0.25
5,vhigh,vhigh,2,2,med,high,0.25,0.25,0.25,0.25
6,vhigh,vhigh,2,2,big,low,0.25,0.25,0.25,0.25
7,vhigh,vhigh,2,2,big,med,0.25,0.25,0.25,0.25
8,vhigh,vhigh,2,2,big,high,0.25,0.25,0.25,0.25
9,vhigh,vhigh,2,4,small,low,0.25,0.25,0.25,0.25


The accuracy of car dataset based on confusion matrix is 0.7002314814814815.



Iteration 4
{'buying': {'unacc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'acc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'vgood': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'good': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}}, 'maint': {'unacc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'acc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'vgood': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'good': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}}, 'doors': {'unacc': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'acc': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'vgood': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'good': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}}, 'persons': {'unacc': {'2': 144.0, '4': 144.0, 'more': 144.0}, 'acc': {

Unnamed: 0,Unacc Predicted,Acc Predicted,Vgood Predicted,Good Predicted
Unacc Actual,1210,0,0,0
Acc Actual,384,0,0,0
Vgood Actual,65,0,0,0
Good Actual,69,0,0,0


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,unacc,acc,vgood,good
0,vhigh,vhigh,2,2,small,low,0.25,0.25,0.25,0.25
1,vhigh,vhigh,2,2,small,med,0.25,0.25,0.25,0.25
2,vhigh,vhigh,2,2,small,high,0.25,0.25,0.25,0.25
3,vhigh,vhigh,2,2,med,low,0.25,0.25,0.25,0.25
4,vhigh,vhigh,2,2,med,med,0.25,0.25,0.25,0.25
5,vhigh,vhigh,2,2,med,high,0.25,0.25,0.25,0.25
6,vhigh,vhigh,2,2,big,low,0.25,0.25,0.25,0.25
7,vhigh,vhigh,2,2,big,med,0.25,0.25,0.25,0.25
8,vhigh,vhigh,2,2,big,high,0.25,0.25,0.25,0.25
9,vhigh,vhigh,2,4,small,low,0.25,0.25,0.25,0.25


The accuracy of car dataset based on confusion matrix is 0.7002314814814815.



Iteration 5
{'buying': {'unacc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'acc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'vgood': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'good': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}}, 'maint': {'unacc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'acc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'vgood': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'good': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}}, 'doors': {'unacc': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'acc': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'vgood': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'good': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}}, 'persons': {'unacc': {'2': 144.0, '4': 144.0, 'more': 144.0}, 'acc': {

Unnamed: 0,Unacc Predicted,Acc Predicted,Vgood Predicted,Good Predicted
Unacc Actual,1210,0,0,0
Acc Actual,384,0,0,0
Vgood Actual,65,0,0,0
Good Actual,69,0,0,0


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,unacc,acc,vgood,good
0,vhigh,vhigh,2,2,small,low,0.25,0.25,0.25,0.25
1,vhigh,vhigh,2,2,small,med,0.25,0.25,0.25,0.25
2,vhigh,vhigh,2,2,small,high,0.25,0.25,0.25,0.25
3,vhigh,vhigh,2,2,med,low,0.25,0.25,0.25,0.25
4,vhigh,vhigh,2,2,med,med,0.25,0.25,0.25,0.25
5,vhigh,vhigh,2,2,med,high,0.25,0.25,0.25,0.25
6,vhigh,vhigh,2,2,big,low,0.25,0.25,0.25,0.25
7,vhigh,vhigh,2,2,big,med,0.25,0.25,0.25,0.25
8,vhigh,vhigh,2,2,big,high,0.25,0.25,0.25,0.25
9,vhigh,vhigh,2,4,small,low,0.25,0.25,0.25,0.25


The accuracy of car dataset based on confusion matrix is 0.7002314814814815.



Iteration 6
{'buying': {'unacc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'acc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'vgood': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'good': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}}, 'maint': {'unacc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'acc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'vgood': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'good': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}}, 'doors': {'unacc': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'acc': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'vgood': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'good': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}}, 'persons': {'unacc': {'2': 144.0, '4': 144.0, 'more': 144.0}, 'acc': {

Unnamed: 0,Unacc Predicted,Acc Predicted,Vgood Predicted,Good Predicted
Unacc Actual,1210,0,0,0
Acc Actual,384,0,0,0
Vgood Actual,65,0,0,0
Good Actual,69,0,0,0


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,unacc,acc,vgood,good
0,vhigh,vhigh,2,2,small,low,0.25,0.25,0.25,0.25
1,vhigh,vhigh,2,2,small,med,0.25,0.25,0.25,0.25
2,vhigh,vhigh,2,2,small,high,0.25,0.25,0.25,0.25
3,vhigh,vhigh,2,2,med,low,0.25,0.25,0.25,0.25
4,vhigh,vhigh,2,2,med,med,0.25,0.25,0.25,0.25
5,vhigh,vhigh,2,2,med,high,0.25,0.25,0.25,0.25
6,vhigh,vhigh,2,2,big,low,0.25,0.25,0.25,0.25
7,vhigh,vhigh,2,2,big,med,0.25,0.25,0.25,0.25
8,vhigh,vhigh,2,2,big,high,0.25,0.25,0.25,0.25
9,vhigh,vhigh,2,4,small,low,0.25,0.25,0.25,0.25


The accuracy of car dataset based on confusion matrix is 0.7002314814814815.



Iteration 7
{'buying': {'unacc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'acc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'vgood': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'good': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}}, 'maint': {'unacc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'acc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'vgood': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'good': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}}, 'doors': {'unacc': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'acc': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'vgood': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'good': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}}, 'persons': {'unacc': {'2': 144.0, '4': 144.0, 'more': 144.0}, 'acc': {

Unnamed: 0,Unacc Predicted,Acc Predicted,Vgood Predicted,Good Predicted
Unacc Actual,1210,0,0,0
Acc Actual,384,0,0,0
Vgood Actual,65,0,0,0
Good Actual,69,0,0,0


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,unacc,acc,vgood,good
0,vhigh,vhigh,2,2,small,low,0.25,0.25,0.25,0.25
1,vhigh,vhigh,2,2,small,med,0.25,0.25,0.25,0.25
2,vhigh,vhigh,2,2,small,high,0.25,0.25,0.25,0.25
3,vhigh,vhigh,2,2,med,low,0.25,0.25,0.25,0.25
4,vhigh,vhigh,2,2,med,med,0.25,0.25,0.25,0.25
5,vhigh,vhigh,2,2,med,high,0.25,0.25,0.25,0.25
6,vhigh,vhigh,2,2,big,low,0.25,0.25,0.25,0.25
7,vhigh,vhigh,2,2,big,med,0.25,0.25,0.25,0.25
8,vhigh,vhigh,2,2,big,high,0.25,0.25,0.25,0.25
9,vhigh,vhigh,2,4,small,low,0.25,0.25,0.25,0.25


The accuracy of car dataset based on confusion matrix is 0.7002314814814815.



Iteration 8
{'buying': {'unacc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'acc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'vgood': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'good': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}}, 'maint': {'unacc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'acc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'vgood': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'good': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}}, 'doors': {'unacc': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'acc': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'vgood': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'good': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}}, 'persons': {'unacc': {'2': 144.0, '4': 144.0, 'more': 144.0}, 'acc': {

Unnamed: 0,Unacc Predicted,Acc Predicted,Vgood Predicted,Good Predicted
Unacc Actual,1210,0,0,0
Acc Actual,384,0,0,0
Vgood Actual,65,0,0,0
Good Actual,69,0,0,0


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,unacc,acc,vgood,good
0,vhigh,vhigh,2,2,small,low,0.25,0.25,0.25,0.25
1,vhigh,vhigh,2,2,small,med,0.25,0.25,0.25,0.25
2,vhigh,vhigh,2,2,small,high,0.25,0.25,0.25,0.25
3,vhigh,vhigh,2,2,med,low,0.25,0.25,0.25,0.25
4,vhigh,vhigh,2,2,med,med,0.25,0.25,0.25,0.25
5,vhigh,vhigh,2,2,med,high,0.25,0.25,0.25,0.25
6,vhigh,vhigh,2,2,big,low,0.25,0.25,0.25,0.25
7,vhigh,vhigh,2,2,big,med,0.25,0.25,0.25,0.25
8,vhigh,vhigh,2,2,big,high,0.25,0.25,0.25,0.25
9,vhigh,vhigh,2,4,small,low,0.25,0.25,0.25,0.25


The accuracy of car dataset based on confusion matrix is 0.7002314814814815.



Iteration 9
{'buying': {'unacc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'acc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'vgood': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'good': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}}, 'maint': {'unacc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'acc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'vgood': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'good': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}}, 'doors': {'unacc': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'acc': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'vgood': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'good': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}}, 'persons': {'unacc': {'2': 144.0, '4': 144.0, 'more': 144.0}, 'acc': {

Unnamed: 0,Unacc Predicted,Acc Predicted,Vgood Predicted,Good Predicted
Unacc Actual,1210,0,0,0
Acc Actual,384,0,0,0
Vgood Actual,65,0,0,0
Good Actual,69,0,0,0


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,unacc,acc,vgood,good
0,vhigh,vhigh,2,2,small,low,0.25,0.25,0.25,0.25
1,vhigh,vhigh,2,2,small,med,0.25,0.25,0.25,0.25
2,vhigh,vhigh,2,2,small,high,0.25,0.25,0.25,0.25
3,vhigh,vhigh,2,2,med,low,0.25,0.25,0.25,0.25
4,vhigh,vhigh,2,2,med,med,0.25,0.25,0.25,0.25
5,vhigh,vhigh,2,2,med,high,0.25,0.25,0.25,0.25
6,vhigh,vhigh,2,2,big,low,0.25,0.25,0.25,0.25
7,vhigh,vhigh,2,2,big,med,0.25,0.25,0.25,0.25
8,vhigh,vhigh,2,2,big,high,0.25,0.25,0.25,0.25
9,vhigh,vhigh,2,4,small,low,0.25,0.25,0.25,0.25


The accuracy of car dataset based on confusion matrix is 0.7002314814814815.



Iteration 10
{'buying': {'unacc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'acc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'vgood': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'good': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}}, 'maint': {'unacc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'acc': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'vgood': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}, 'good': {'vhigh': 108.0, 'high': 108.0, 'med': 108.0, 'low': 108.0}}, 'doors': {'unacc': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'acc': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'vgood': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}, 'good': {'2': 108.0, '3': 108.0, '4': 108.0, '5more': 108.0}}, 'persons': {'unacc': {'2': 144.0, '4': 144.0, 'more': 144.0}, 'acc': 

Unnamed: 0,Unacc Predicted,Acc Predicted,Vgood Predicted,Good Predicted
Unacc Actual,1210,0,0,0
Acc Actual,384,0,0,0
Vgood Actual,65,0,0,0
Good Actual,69,0,0,0


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,unacc,acc,vgood,good
0,vhigh,vhigh,2,2,small,low,0.25,0.25,0.25,0.25
1,vhigh,vhigh,2,2,small,med,0.25,0.25,0.25,0.25
2,vhigh,vhigh,2,2,small,high,0.25,0.25,0.25,0.25
3,vhigh,vhigh,2,2,med,low,0.25,0.25,0.25,0.25
4,vhigh,vhigh,2,2,med,med,0.25,0.25,0.25,0.25
5,vhigh,vhigh,2,2,med,high,0.25,0.25,0.25,0.25
6,vhigh,vhigh,2,2,big,low,0.25,0.25,0.25,0.25
7,vhigh,vhigh,2,2,big,med,0.25,0.25,0.25,0.25
8,vhigh,vhigh,2,2,big,high,0.25,0.25,0.25,0.25
9,vhigh,vhigh,2,4,small,low,0.25,0.25,0.25,0.25


The accuracy of car dataset based on confusion matrix is 0.7002314814814815.





In [None]:
# Using the hypothyroid dataset
print("hypothyroid dataset".title())
attribute_column = HYPOTHYROID_COLUMN
hypo_df = preprocess(HYPOTHYROID, HYPOTHYROID_COLUMN)
unsupervised_df = initialise_unsupervised_naive_bayes_deterministically(hypo_df, "class")
for i in range(ITERATION):
    # Train and give prediction and calculate accuracy
    print("Iteration {}".format(i+1))
    model = train_probability_unsupervised(hypo_df["class"].unique(), attribute_column[:-1], unsupervised_df, "class")
    predicted_test_result = predict_unsupervised(hypo_df["class"].unique(), attribute_column[:-1], unsupervised_df, "class", model)
    confusion_matrix = confusion_matrix_unsupervised(list(hypo_df["class"]), predicted_test_result, hypo_df["class"].unique())
    display(confusion_matrix)
    print("The accuracy of hypothyroid dataset based on confusion matrix is {}.".format(evaluate_unsupervised(confusion_matrix)))
    print("\n\n")

Hypothyroid Dataset
Iteration 1
{'sex': {'hypothyroid': {'M': 454.0, 'F': 1091.0}, 'negative': {'M': 454.0, 'F': 1091.0}}, 'on_thyroxine': {'hypothyroid': {'f': 1317.0, 't': 228.0}, 'negative': {'f': 1317.0, 't': 228.0}}, 'query_on_thyroxine': {'hypothyroid': {'f': 1517.5, 't': 27.5}, 'negative': {'f': 1517.5, 't': 27.5}}, 'on_antithyroid_medication': {'hypothyroid': {'f': 1524.0, 't': 21.0}, 'negative': {'f': 1524.0, 't': 21.0}}, 'thyroid_surgery': {'hypothyroid': {'f': 1494.0, 't': 51.0}, 'negative': {'f': 1494.0, 't': 51.0}}, 'query_hypothyroid': {'hypothyroid': {'f': 1425.0, 't': 120.0}, 'negative': {'f': 1425.0, 't': 120.0}}, 'query_hyperthyroid': {'hypothyroid': {'f': 1425.0, 't': 120.0}, 'negative': {'f': 1425.0, 't': 120.0}}, 'pregnant': {'hypothyroid': {'f': 1514.0, 't': 31.0}, 'negative': {'f': 1514.0, 't': 31.0}}, 'sick': {'hypothyroid': {'f': 1496.0, 't': 49.0}, 'negative': {'f': 1496.0, 't': 49.0}}, 'tumor': {'hypothyroid': {'f': 1525.0, 't': 20.0}, 'negative': {'f': 1525.

Unnamed: 0,Hypothyroid Predicted,Negative Predicted
Hypothyroid Actual,149,0
Negative Actual,2941,0


The accuracy of hypothyroid dataset based on confusion matrix is 0.9517799352750809.



Iteration 2
{'sex': {'hypothyroid': {'M': 454.0, 'F': 1091.0}, 'negative': {'M': 454.0, 'F': 1091.0}}, 'on_thyroxine': {'hypothyroid': {'f': 1317.0, 't': 228.0}, 'negative': {'f': 1317.0, 't': 228.0}}, 'query_on_thyroxine': {'hypothyroid': {'f': 1517.5, 't': 27.5}, 'negative': {'f': 1517.5, 't': 27.5}}, 'on_antithyroid_medication': {'hypothyroid': {'f': 1524.0, 't': 21.0}, 'negative': {'f': 1524.0, 't': 21.0}}, 'thyroid_surgery': {'hypothyroid': {'f': 1494.0, 't': 51.0}, 'negative': {'f': 1494.0, 't': 51.0}}, 'query_hypothyroid': {'hypothyroid': {'f': 1425.0, 't': 120.0}, 'negative': {'f': 1425.0, 't': 120.0}}, 'query_hyperthyroid': {'hypothyroid': {'f': 1425.0, 't': 120.0}, 'negative': {'f': 1425.0, 't': 120.0}}, 'pregnant': {'hypothyroid': {'f': 1514.0, 't': 31.0}, 'negative': {'f': 1514.0, 't': 31.0}}, 'sick': {'hypothyroid': {'f': 1496.0, 't': 49.0}, 'negative': {'f': 1496.0, 't': 49.0}}, 'tumor

Unnamed: 0,Hypothyroid Predicted,Negative Predicted
Hypothyroid Actual,149,0
Negative Actual,2941,0


The accuracy of hypothyroid dataset based on confusion matrix is 0.9517799352750809.



Iteration 3
{'sex': {'hypothyroid': {'M': 454.0, 'F': 1091.0}, 'negative': {'M': 454.0, 'F': 1091.0}}, 'on_thyroxine': {'hypothyroid': {'f': 1317.0, 't': 228.0}, 'negative': {'f': 1317.0, 't': 228.0}}, 'query_on_thyroxine': {'hypothyroid': {'f': 1517.5, 't': 27.5}, 'negative': {'f': 1517.5, 't': 27.5}}, 'on_antithyroid_medication': {'hypothyroid': {'f': 1524.0, 't': 21.0}, 'negative': {'f': 1524.0, 't': 21.0}}, 'thyroid_surgery': {'hypothyroid': {'f': 1494.0, 't': 51.0}, 'negative': {'f': 1494.0, 't': 51.0}}, 'query_hypothyroid': {'hypothyroid': {'f': 1425.0, 't': 120.0}, 'negative': {'f': 1425.0, 't': 120.0}}, 'query_hyperthyroid': {'hypothyroid': {'f': 1425.0, 't': 120.0}, 'negative': {'f': 1425.0, 't': 120.0}}, 'pregnant': {'hypothyroid': {'f': 1514.0, 't': 31.0}, 'negative': {'f': 1514.0, 't': 31.0}}, 'sick': {'hypothyroid': {'f': 1496.0, 't': 49.0}, 'negative': {'f': 1496.0, 't': 49.0}}, 'tumor

Unnamed: 0,Hypothyroid Predicted,Negative Predicted
Hypothyroid Actual,149,0
Negative Actual,2941,0


The accuracy of hypothyroid dataset based on confusion matrix is 0.9517799352750809.



Iteration 4
{'sex': {'hypothyroid': {'M': 454.0, 'F': 1091.0}, 'negative': {'M': 454.0, 'F': 1091.0}}, 'on_thyroxine': {'hypothyroid': {'f': 1317.0, 't': 228.0}, 'negative': {'f': 1317.0, 't': 228.0}}, 'query_on_thyroxine': {'hypothyroid': {'f': 1517.5, 't': 27.5}, 'negative': {'f': 1517.5, 't': 27.5}}, 'on_antithyroid_medication': {'hypothyroid': {'f': 1524.0, 't': 21.0}, 'negative': {'f': 1524.0, 't': 21.0}}, 'thyroid_surgery': {'hypothyroid': {'f': 1494.0, 't': 51.0}, 'negative': {'f': 1494.0, 't': 51.0}}, 'query_hypothyroid': {'hypothyroid': {'f': 1425.0, 't': 120.0}, 'negative': {'f': 1425.0, 't': 120.0}}, 'query_hyperthyroid': {'hypothyroid': {'f': 1425.0, 't': 120.0}, 'negative': {'f': 1425.0, 't': 120.0}}, 'pregnant': {'hypothyroid': {'f': 1514.0, 't': 31.0}, 'negative': {'f': 1514.0, 't': 31.0}}, 'sick': {'hypothyroid': {'f': 1496.0, 't': 49.0}, 'negative': {'f': 1496.0, 't': 49.0}}, 'tumor

Unnamed: 0,Hypothyroid Predicted,Negative Predicted
Hypothyroid Actual,149,0
Negative Actual,2941,0


The accuracy of hypothyroid dataset based on confusion matrix is 0.9517799352750809.



Iteration 5
{'sex': {'hypothyroid': {'M': 454.0, 'F': 1091.0}, 'negative': {'M': 454.0, 'F': 1091.0}}, 'on_thyroxine': {'hypothyroid': {'f': 1317.0, 't': 228.0}, 'negative': {'f': 1317.0, 't': 228.0}}, 'query_on_thyroxine': {'hypothyroid': {'f': 1517.5, 't': 27.5}, 'negative': {'f': 1517.5, 't': 27.5}}, 'on_antithyroid_medication': {'hypothyroid': {'f': 1524.0, 't': 21.0}, 'negative': {'f': 1524.0, 't': 21.0}}, 'thyroid_surgery': {'hypothyroid': {'f': 1494.0, 't': 51.0}, 'negative': {'f': 1494.0, 't': 51.0}}, 'query_hypothyroid': {'hypothyroid': {'f': 1425.0, 't': 120.0}, 'negative': {'f': 1425.0, 't': 120.0}}, 'query_hyperthyroid': {'hypothyroid': {'f': 1425.0, 't': 120.0}, 'negative': {'f': 1425.0, 't': 120.0}}, 'pregnant': {'hypothyroid': {'f': 1514.0, 't': 31.0}, 'negative': {'f': 1514.0, 't': 31.0}}, 'sick': {'hypothyroid': {'f': 1496.0, 't': 49.0}, 'negative': {'f': 1496.0, 't': 49.0}}, 'tumor

Unnamed: 0,Hypothyroid Predicted,Negative Predicted
Hypothyroid Actual,149,0
Negative Actual,2941,0


The accuracy of hypothyroid dataset based on confusion matrix is 0.9517799352750809.



Iteration 6
{'sex': {'hypothyroid': {'M': 454.0, 'F': 1091.0}, 'negative': {'M': 454.0, 'F': 1091.0}}, 'on_thyroxine': {'hypothyroid': {'f': 1317.0, 't': 228.0}, 'negative': {'f': 1317.0, 't': 228.0}}, 'query_on_thyroxine': {'hypothyroid': {'f': 1517.5, 't': 27.5}, 'negative': {'f': 1517.5, 't': 27.5}}, 'on_antithyroid_medication': {'hypothyroid': {'f': 1524.0, 't': 21.0}, 'negative': {'f': 1524.0, 't': 21.0}}, 'thyroid_surgery': {'hypothyroid': {'f': 1494.0, 't': 51.0}, 'negative': {'f': 1494.0, 't': 51.0}}, 'query_hypothyroid': {'hypothyroid': {'f': 1425.0, 't': 120.0}, 'negative': {'f': 1425.0, 't': 120.0}}, 'query_hyperthyroid': {'hypothyroid': {'f': 1425.0, 't': 120.0}, 'negative': {'f': 1425.0, 't': 120.0}}, 'pregnant': {'hypothyroid': {'f': 1514.0, 't': 31.0}, 'negative': {'f': 1514.0, 't': 31.0}}, 'sick': {'hypothyroid': {'f': 1496.0, 't': 49.0}, 'negative': {'f': 1496.0, 't': 49.0}}, 'tumor

Unnamed: 0,Hypothyroid Predicted,Negative Predicted
Hypothyroid Actual,149,0
Negative Actual,2941,0


The accuracy of hypothyroid dataset based on confusion matrix is 0.9517799352750809.



Iteration 7
{'sex': {'hypothyroid': {'M': 454.0, 'F': 1091.0}, 'negative': {'M': 454.0, 'F': 1091.0}}, 'on_thyroxine': {'hypothyroid': {'f': 1317.0, 't': 228.0}, 'negative': {'f': 1317.0, 't': 228.0}}, 'query_on_thyroxine': {'hypothyroid': {'f': 1517.5, 't': 27.5}, 'negative': {'f': 1517.5, 't': 27.5}}, 'on_antithyroid_medication': {'hypothyroid': {'f': 1524.0, 't': 21.0}, 'negative': {'f': 1524.0, 't': 21.0}}, 'thyroid_surgery': {'hypothyroid': {'f': 1494.0, 't': 51.0}, 'negative': {'f': 1494.0, 't': 51.0}}, 'query_hypothyroid': {'hypothyroid': {'f': 1425.0, 't': 120.0}, 'negative': {'f': 1425.0, 't': 120.0}}, 'query_hyperthyroid': {'hypothyroid': {'f': 1425.0, 't': 120.0}, 'negative': {'f': 1425.0, 't': 120.0}}, 'pregnant': {'hypothyroid': {'f': 1514.0, 't': 31.0}, 'negative': {'f': 1514.0, 't': 31.0}}, 'sick': {'hypothyroid': {'f': 1496.0, 't': 49.0}, 'negative': {'f': 1496.0, 't': 49.0}}, 'tumor

Unnamed: 0,Hypothyroid Predicted,Negative Predicted
Hypothyroid Actual,149,0
Negative Actual,2941,0


The accuracy of hypothyroid dataset based on confusion matrix is 0.9517799352750809.



Iteration 8
{'sex': {'hypothyroid': {'M': 454.0, 'F': 1091.0}, 'negative': {'M': 454.0, 'F': 1091.0}}, 'on_thyroxine': {'hypothyroid': {'f': 1317.0, 't': 228.0}, 'negative': {'f': 1317.0, 't': 228.0}}, 'query_on_thyroxine': {'hypothyroid': {'f': 1517.5, 't': 27.5}, 'negative': {'f': 1517.5, 't': 27.5}}, 'on_antithyroid_medication': {'hypothyroid': {'f': 1524.0, 't': 21.0}, 'negative': {'f': 1524.0, 't': 21.0}}, 'thyroid_surgery': {'hypothyroid': {'f': 1494.0, 't': 51.0}, 'negative': {'f': 1494.0, 't': 51.0}}, 'query_hypothyroid': {'hypothyroid': {'f': 1425.0, 't': 120.0}, 'negative': {'f': 1425.0, 't': 120.0}}, 'query_hyperthyroid': {'hypothyroid': {'f': 1425.0, 't': 120.0}, 'negative': {'f': 1425.0, 't': 120.0}}, 'pregnant': {'hypothyroid': {'f': 1514.0, 't': 31.0}, 'negative': {'f': 1514.0, 't': 31.0}}, 'sick': {'hypothyroid': {'f': 1496.0, 't': 49.0}, 'negative': {'f': 1496.0, 't': 49.0}}, 'tumor

Unnamed: 0,Hypothyroid Predicted,Negative Predicted
Hypothyroid Actual,149,0
Negative Actual,2941,0


The accuracy of hypothyroid dataset based on confusion matrix is 0.9517799352750809.



Iteration 9
{'sex': {'hypothyroid': {'M': 454.0, 'F': 1091.0}, 'negative': {'M': 454.0, 'F': 1091.0}}, 'on_thyroxine': {'hypothyroid': {'f': 1317.0, 't': 228.0}, 'negative': {'f': 1317.0, 't': 228.0}}, 'query_on_thyroxine': {'hypothyroid': {'f': 1517.5, 't': 27.5}, 'negative': {'f': 1517.5, 't': 27.5}}, 'on_antithyroid_medication': {'hypothyroid': {'f': 1524.0, 't': 21.0}, 'negative': {'f': 1524.0, 't': 21.0}}, 'thyroid_surgery': {'hypothyroid': {'f': 1494.0, 't': 51.0}, 'negative': {'f': 1494.0, 't': 51.0}}, 'query_hypothyroid': {'hypothyroid': {'f': 1425.0, 't': 120.0}, 'negative': {'f': 1425.0, 't': 120.0}}, 'query_hyperthyroid': {'hypothyroid': {'f': 1425.0, 't': 120.0}, 'negative': {'f': 1425.0, 't': 120.0}}, 'pregnant': {'hypothyroid': {'f': 1514.0, 't': 31.0}, 'negative': {'f': 1514.0, 't': 31.0}}, 'sick': {'hypothyroid': {'f': 1496.0, 't': 49.0}, 'negative': {'f': 1496.0, 't': 49.0}}, 'tumor

Unnamed: 0,Hypothyroid Predicted,Negative Predicted
Hypothyroid Actual,149,0
Negative Actual,2941,0


The accuracy of hypothyroid dataset based on confusion matrix is 0.9517799352750809.



Iteration 10
{'sex': {'hypothyroid': {'M': 454.0, 'F': 1091.0}, 'negative': {'M': 454.0, 'F': 1091.0}}, 'on_thyroxine': {'hypothyroid': {'f': 1317.0, 't': 228.0}, 'negative': {'f': 1317.0, 't': 228.0}}, 'query_on_thyroxine': {'hypothyroid': {'f': 1517.5, 't': 27.5}, 'negative': {'f': 1517.5, 't': 27.5}}, 'on_antithyroid_medication': {'hypothyroid': {'f': 1524.0, 't': 21.0}, 'negative': {'f': 1524.0, 't': 21.0}}, 'thyroid_surgery': {'hypothyroid': {'f': 1494.0, 't': 51.0}, 'negative': {'f': 1494.0, 't': 51.0}}, 'query_hypothyroid': {'hypothyroid': {'f': 1425.0, 't': 120.0}, 'negative': {'f': 1425.0, 't': 120.0}}, 'query_hyperthyroid': {'hypothyroid': {'f': 1425.0, 't': 120.0}, 'negative': {'f': 1425.0, 't': 120.0}}, 'pregnant': {'hypothyroid': {'f': 1514.0, 't': 31.0}, 'negative': {'f': 1514.0, 't': 31.0}}, 'sick': {'hypothyroid': {'f': 1496.0, 't': 49.0}, 'negative': {'f': 1496.0, 't': 49.0}}, 'tumo

Unnamed: 0,Hypothyroid Predicted,Negative Predicted
Hypothyroid Actual,149,0
Negative Actual,2941,0


The accuracy of hypothyroid dataset based on confusion matrix is 0.9517799352750809.





In [None]:
# Using the mushroom dataset
print("mushroom dataset".title())
attribute_column = MUSHROOM_COLUMN
mushroom_df = preprocess(MUSHROOM, MUSHROOM_COLUMN)
unsupervised_df = initialise_unsupervised_naive_bayes(mushroom_df, "class")
for i in range(ITERATION):
    # Train and give prediction and calculate accuracy
    print("Iteration {}".format(i+1))
    model = train_probability_unsupervised(mushroom_df["class"].unique(), attribute_column[:-1], unsupervised_df, "class")
    predicted_test_result = predict_unsupervised(mushroom_df["class"].unique(), attribute_column[:-1], unsupervised_df, "class", model)
    confusion_matrix = confusion_matrix_unsupervised(list(mushroom_df["class"]), predicted_test_result, mushroom_df["class"].unique())
    display(confusion_matrix)
    print("The accuracy of mushroom dataset based on confusion matrix is {}.".format(evaluate_unsupervised(confusion_matrix)))
    print("\n\n")

Mushroom Dataset
Iteration 1
{'cap-shape': {'p': {'x': 1420.6155420843263, 'b': 150.4152262917357, 's': 16.609308393185845, 'f': 1194.3050599309106, 'k': 18.535754251839727, 'c': 2.06407334071164}, 'e': {'x': 1419.384457915674, 'b': 149.58477370826435, 's': 15.390691606814155, 'f': 1237.69494006909, 'k': 17.46424574816027, 'c': 1.93592665928836}}, 'cap-surface': {'p': {'s': 631.578738024158, 'y': 1100.1425642616489, 'f': 1069.0368400005023, 'g': 1.786822006399646}, 'e': {'s': 628.421261975842, 'y': 1119.8574357383511, 'f': 1090.9631599995, 'g': 2.213177993600354}}, 'cap-color': {'p': {'n': 583.1511895248801, 'y': 523.9432765941298, 'w': 434.7669598293011, 'g': 840.7978563794533, 'e': 289.813485089077, 'p': 47.13769812801269, 'b': 61.16359935001326, 'c': 21.770899397842026}, 'e': {'n': 580.84881047512, 'y': 532.0567234058701, 'w': 445.2330401706989, 'g': 855.2021436205471, 'e': 298.186514910923, 'p': 48.86230187198732, 'b': 58.83640064998674, 'c': 22.22910060215798}}, 'bruises': {'p': {

Unnamed: 0,P Predicted,E Predicted
P Actual,1528,628
E Actual,1089,2399


The accuracy of mushroom dataset based on confusion matrix is 0.6957831325301205.



Iteration 2
