<a href="https://colab.research.google.com/github/peng741521840/123/blob/main/31005.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [67]:
import numpy as np
import pandas as pd
import math, sys, os

In [68]:
from google.colab import drive
drive.mount('/content/drive')
# Authorization code removed: one-time OAuth codes are credentials and must never be saved in the notebook.

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [69]:
# NOTE(review): `data` is not referenced by any later cell visible here, and this
# relative path presumably only resolves when the working directory is /content
# (the parent of the Drive mount point) — confirm this cell is still needed.
data = pd.read_csv('drive/My Drive/Colab Notebooks/iris.data')

In [70]:
def dataProcess(source_fpath, target_fpath):
    """Convert a headerless comma-separated sample file into a CSV with a header row.

    Parameters
    ----------
    source_fpath : str
        Path of the raw text file; every non-blank line is split on ",".
    target_fpath : str
        Path the resulting CSV is written to (header row, no index column).

    Notes
    -----
    Whitespace-only lines are skipped so that trailing blank lines in the
    source do not turn into rows of empty/NaN cells. An empty source yields a
    header-only CSV instead of raising.
    """
    column_names = ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth", "Class"]
    with open(source_fpath) as source_f:
        # Keep the fields as strings; the CSV writer emits them verbatim.
        sample_list = [line.strip().split(",") for line in source_f if line.strip()]
    # Naming the columns at construction time keeps the empty-input case valid.
    csvdf = pd.DataFrame(sample_list, columns=column_names)
    csvdf.to_csv(target_fpath, index=False)

In [71]:
def informationEntropy(dataset):
    """Return the base-2 Shannon entropy of the "Class" column of ``dataset``.

    Each distinct class contributes ``p * log2(p)`` where ``p`` is its share of
    the rows; the negated sum is returned.
    """
    labels = list(dataset["Class"])
    n_rows = len(dataset)
    # Share of rows held by every distinct class label.
    proportions = (labels.count(label) / n_rows for label in set(dataset["Class"]))
    return (-1) * sum(p * math.log(p, 2) for p in proportions)

In [72]:
def informationDiscreteGain(dataset, attribute):
    """Return the information gain of splitting ``dataset`` on a discrete ``attribute``.

    The gain is the entropy of the whole dataset minus the row-weighted entropy
    of the subsets obtained by grouping rows on each distinct attribute value.
    """
    base_entropy = informationEntropy(dataset)
    values = list(dataset[attribute])
    n_rows = len(dataset)
    # Weighted entropy of the partitions, one partition per distinct value.
    conditional_entropy = sum(
        (values.count(value) / n_rows)
        * informationEntropy(dataset[dataset[attribute] == value])
        for value in set(dataset[attribute])
    )
    return base_entropy - conditional_entropy

In [73]:
def informationContinuousGain(dataset, attribute):
    """Find the best binary threshold for a continuous ``attribute``.

    Candidate thresholds are the midpoints between consecutive distinct
    attribute values (or the single value itself when only one exists). For
    each candidate the rows are split into ``<= threshold`` and ``> threshold``
    and the row-weighted entropy of the two parts is computed.

    Returns
    -------
    tuple
        ``(threshold, gain)`` for the candidate with the lowest weighted
        entropy; on ties the smallest threshold wins.
    """
    base_entropy = informationEntropy(dataset)
    distinct_values = sorted(set(dataset[attribute]))
    if len(distinct_values) == 1:
        candidates = [distinct_values[0]]
    else:
        candidates = [(low + high) / 2 for low, high in zip(distinct_values, distinct_values[1:])]

    n_rows = len(dataset)
    split_entropy = {}
    for cut in candidates:
        below = dataset[dataset[attribute] <= cut]
        above = dataset[dataset[attribute] > cut]
        split_entropy[cut] = (
            (len(below) / n_rows) * informationEntropy(below)
            + (len(above) / n_rows) * informationEntropy(above)
        )

    # Stable ascending sort: the first entry is the lowest-entropy candidate.
    best_cut, best_entropy = sorted(split_entropy.items(), key=lambda pair: pair[1])[0]
    return best_cut, base_entropy - best_entropy

In [74]:
def maxNumOutcome(dataset):
    """Return the most frequent value of the "Class" column (majority vote)."""
    labels = list(dataset["Class"])
    tally = {label: labels.count(label) for label in set(dataset["Class"])}
    # Sorting on the negated count is a stable "largest first" ordering.
    ranked = sorted(tally.items(), key=lambda pair: -pair[1])
    return ranked[0][0]

In [75]:
def treeNodeGenerate(dataset, attribute_list):
    """Recursively build a binary decision tree over continuous attributes.

    Parameters
    ----------
    dataset : DataFrame with a "Class" column plus the columns in ``attribute_list``.
    attribute_list : list of attribute column names still available for splitting.

    Returns
    -------
    Either a class label (leaf) or a one-entry dict of the form
    ``{(attribute, threshold): {"<=": subtree, ">": subtree}}``.
    """
    classes = set(dataset["Class"])
    if len(classes) == 1:
        # Pure node: every row carries the same label.
        return list(classes)[0]

    no_attributes_left = len(attribute_list) == 0
    if no_attributes_left or sum(len(set(dataset[name])) - 1 for name in attribute_list) == 0:
        # Nothing left that can separate the rows: fall back to the majority class.
        return maxNumOutcome(dataset)

    # Score every remaining attribute; max() keeps the first one on ties.
    scored = []
    for name in attribute_list:
        cut, gain = informationContinuousGain(dataset, name)
        scored.append((name, cut, gain))
    best_attribute, best_threshold, _ = max(scored, key=lambda entry: entry[2])

    # An attribute is used at most once along any root-to-leaf path.
    remaining = attribute_list.copy()
    remaining.remove(best_attribute)

    def grow(subset):
        # An empty side inherits the majority class of the parent rows.
        if len(subset) == 0:
            return maxNumOutcome(dataset)
        return treeNodeGenerate(subset, remaining)

    left_child = grow(dataset[dataset[best_attribute] <= best_threshold])
    right_child = grow(dataset[dataset[best_attribute] > best_threshold])

    if left_child == right_child:
        # Both sides agree, so the split carries no information: collapse it.
        return left_child
    return {(best_attribute, best_threshold): {"<=": left_child, ">": right_child}}

In [76]:
def predictOne(tree_train_model, testdata):
    """Classify one sample by walking the decision tree.

    Parameters
    ----------
    tree_train_model : a leaf label, or a one-entry dict of the form
        ``{(attribute, threshold): {"<=": subtree, ">": subtree}}``.
    testdata : mapping-like object (dict, pandas Series, ...) indexable by
        attribute name.

    Returns
    -------
    The label stored at the leaf that the sample reaches.

    Any non-dict node is treated as a leaf, so labels that are not plain
    ``str`` (integers, ``numpy.str_``, ...) are returned instead of failing
    with an unbound local variable.
    """
    node = tree_train_model
    while isinstance(node, dict):
        key = list(node)[0]  # each internal node holds exactly one split
        attribute, threshold = key[0], key[1]
        branch = "<=" if testdata[attribute] <= threshold else ">"
        node = node[key][branch]
    return node

In [77]:
def predict(tree_train_model, testdataset):
    """Predict the class of every row of ``testdataset``.

    Parameters
    ----------
    tree_train_model : tree as produced by ``treeNodeGenerate``.
    testdataset : DataFrame holding the attribute columns and a "Class" column.

    Returns
    -------
    list of ``(actual_label, predicted_label)`` tuples in row order.
    """
    predict_list = []
    for position in range(len(testdataset)):
        # Positional access: works whatever the frame's index labels are
        # (shuffled, filtered, non-integer), unlike label-based .loc[i].
        row = testdataset.iloc[position]
        predict_list.append((row["Class"], predictOne(tree_train_model, row)))
    return predict_list

In [78]:
def predictAccuracy(predict_list):
    """Return the fraction of ``(actual, predicted)`` pairs whose members are equal.

    Parameters
    ----------
    predict_list : sequence of pairs; element 0 is the actual label and
        element 1 the predicted one.

    Returns
    -------
    float in [0, 1]; ``0.0`` for an empty sequence (instead of dividing by zero).
    """
    total = len(predict_list)
    if total == 0:
        return 0.0
    predict_true_num = sum(1 for bigram in predict_list if bigram[0] == bigram[1])
    return predict_true_num / total

In [79]:
def subdatasetPartitioning(dataset):
    """Split ``dataset`` into an 80 % training part and a 20 % test part.

    Row positions are shuffled with a fixed seed (2), so the split is
    reproducible and identical to the one produced by seeding the global NumPy
    generator, but without touching that global state.

    Parameters
    ----------
    dataset : DataFrame to split; any index is accepted because rows are
        selected by position.

    Returns
    -------
    tuple ``(traindataset, testdataset)``.
    """
    index = list(range(len(dataset)))
    # A private RandomState yields the same permutation as np.random.seed(2)
    # followed by np.random.shuffle, while leaving the process-wide RNG alone.
    np.random.RandomState(2).shuffle(index)

    traindatasetlen = int(len(dataset) * 0.8)
    # The shuffled values are positions, so select positionally (.iloc).
    traindataset = dataset.iloc[index[:traindatasetlen]]
    testdataset = dataset.iloc[index[traindatasetlen:]]

    return traindataset, testdataset

In [80]:
def datasetPartitioning(dataset):
    """Split the dataset into train/test parts, block by block.

    The rows labelled 0-49, 50-99 and 100-149 are each split independently by
    ``subdatasetPartitioning``; the three training parts and the three test
    parts are then concatenated with a fresh 0..n-1 index.

    Returns
    -------
    tuple ``(traindataset, testdataset)``.
    """
    splits = []
    for start in range(0, 150, 50):
        # .loc slices are inclusive on both ends: 50 labels per block.
        # reset_index() keeps the previous labels as an extra "index" column.
        block = dataset.loc[start : start + 49].reset_index()
        splits.append(subdatasetPartitioning(block))

    traindataset = pd.concat([train_part for train_part, _ in splits], ignore_index=True)
    testdataset = pd.concat([test_part for _, test_part in splits], ignore_index=True)

    return traindataset, testdataset

In [81]:
if __name__ == "__main__":
    # Convert the raw, headerless data file into a CSV with named columns.
    source_fpath = "iris.data"
    # Swap only the extension; str.replace("data", "csv") would also rewrite
    # any "data" occurring elsewhere in the path.
    target_fpath = os.path.splitext(source_fpath)[0] + ".csv"
    dataProcess(source_fpath, target_fpath)

    # Read back exactly the file that was just written.
    dataset = pd.read_csv(target_fpath)

    traindataset, testdataset =  datasetPartitioning(dataset)

    # Train the tree on the four measurement columns.
    attribute_list = ["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"]
    tree_train_model = treeNodeGenerate(traindataset, attribute_list)
    print("The Dict of Trained Model:")
    print(tree_train_model, "\n")

    # Evaluate on the held-out rows.
    predict_list = predict(tree_train_model, testdataset)
    print("The List of Predicting Outcomes (Actual Label, Predicted Value) :")
    print(predict_list, "\n")

    print("The Accuracy of Model Prediction: ", predictAccuracy(predict_list))

The Dict of Trained Model:
{('PetalLength', 2.45): {'<=': 'Iris-setosa', '>': {('PetalWidth', 1.75): {'<=': {('SepalLength', 4.95): {'<=': 'Iris-virginica', '>': 'Iris-versicolor'}}, '>': {('SepalLength', 5.95): {'<=': {('SepalWidth', 3.0): {'<=': 'Iris-virginica', '>': 'Iris-versicolor'}}, '>': 'Iris-virginica'}}}}}} 

The List of Predicting Outcomes (Actual Label, Predicted Value) :
[('Iris-setosa', 'Iris-setosa'), ('Iris-setosa', 'Iris-setosa'), ('Iris-setosa', 'Iris-setosa'), ('Iris-setosa', 'Iris-setosa'), ('Iris-setosa', 'Iris-setosa'), ('Iris-setosa', 'Iris-setosa'), ('Iris-setosa', 'Iris-setosa'), ('Iris-setosa', 'Iris-setosa'), ('Iris-setosa', 'Iris-setosa'), ('Iris-setosa', 'Iris-setosa'), ('Iris-versicolor', 'Iris-virginica'), ('Iris-versicolor', 'Iris-versicolor'), ('Iris-versicolor', 'Iris-versicolor'), ('Iris-versicolor', 'Iris-versicolor'), ('Iris-versicolor', 'Iris-versicolor'), ('Iris-versicolor', 'Iris-versicolor'), ('Iris-versicolor', 'Iris-versicolor'), ('Iris-versi