<font color='blue'>Cell 1
Importing libraries

In [1]:
import numpy as np
import pandas as pd

<font color='blue'>Cell 2
Reading the data

In [2]:
data = pd.read_csv("./Data/data1.csv")
data.loc[np.r_[0:3, 51:53, 101:103], :]

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
51,6.4,3.2,4.5,1.5,Versicolor
52,6.9,3.1,4.9,1.5,Versicolor
101,5.8,2.7,5.1,1.9,Virginica
102,7.1,3.0,5.9,2.1,Virginica


<font color='blue'>Cell 3
Dataset verification

In [18]:
def verify_dataset(data):
    # 결측치 확인을 위한 더미 변수 사용
    data_found = True

    for each_column in data.columns:
        if data[each_column].isnull().any():
            print("Data missing in Column " + each_column)
            data_found = False

    if data_found:
        print("Dataset is complete. No missing value")
    else:
        print("Dataset has missing values.")


# verify_dataset 함수 호출하여 데이터 검증
verify_dataset(data)

Dataset is complete. No missing value


<font color='blue'>Cell 4
Creating testing and training data sets

In [19]:
def split_dataset_train_test(data):
    """
    데이터셋을 훈련 세트와 테스트 세트로 나누는 함수

    Parameters:
    - data: 나눌 데이터셋

    Returns:
    - 훈련 세트와 테스트 세트를 담은 리스트
    """
    # 데이터셋을 섞습니다.
    data = data.sample(frac=1).reset_index(drop=True)

    # 훈련 세트와 테스트 세트로 나눕니다.
    training_data = data.iloc[: int(0.7 * len(data))].reset_index(drop=True)
    testing_data = data.iloc[int(0.7 * len(data)) :].reset_index(drop=True)

    return [training_data, testing_data]


# split_dataset_train_test 함수를 호출하여 훈련 세트와 테스트 세트를 반환합니다.
train_test_data = split_dataset_train_test(data)
print(train_test_data)

[     sepal.length  sepal.width  petal.length  petal.width     variety
0             7.7          3.0           6.1          2.3   Virginica
1             5.4          3.0           4.5          1.5  Versicolor
2             6.3          2.8           5.1          1.5   Virginica
3             5.1          3.4           1.5          0.2      Setosa
4             5.4          3.9           1.3          0.4      Setosa
..            ...          ...           ...          ...         ...
100           4.6          3.1           1.5          0.2      Setosa
101           5.0          3.0           1.6          0.2      Setosa
102           5.3          3.7           1.5          0.2      Setosa
103           5.6          3.0           4.5          1.5  Versicolor
104           5.5          2.3           4.0          1.3  Versicolor

[105 rows x 5 columns],     sepal.length  sepal.width  petal.length  petal.width     variety
0            4.8          3.0           1.4          0.1      Set

<font color='blue'>Cell 5
Calculate gini index for a given split 

In [20]:
def gini_index(data, target_col):
    """
    데이터셋의 지니 지수를 계산하는 함수

    Parameters:
    - data: 데이터셋
    - target_col: 타겟 열 (클래스 레이블)

    Returns:
    - 지니 지수
    """
    # 타겟 열의 고유한 값과 각 값의 빈도수를 계산합니다.
    elements, counts = np.unique(data[target_col], return_counts=True)

    # 전체 빈도수를 구합니다.
    total_counts = np.sum(counts)

    # 지니 지수를 계산합니다.
    sum_prob = np.sum((counts / total_counts) ** 2)
    gini_index = 1 - sum_prob

    return gini_index

<font color='blue'>Cell 6
Information gain

In [21]:
def information_gain(data, target_col, threshold, target_class="variety"):
    """
    데이터셋에서 특정 임계값을 기준으로 나눴을 때의 정보 이득을 계산하는 함수

    Parameters:
    - data: 데이터셋
    - target_col: 나눌 열
    - threshold: 임계값
    - target_class: 타겟 클래스 (기본값은 "variety")

    Returns:
    - 정보 이득
    """
    # 전체 데이터셋의 지니 지수를 계산합니다.
    total_gini_index = gini_index(data, target_class)

    # 임계값을 기준으로 데이터를 나눕니다.
    data_left = data[data[target_col] < threshold]
    data_right = data[data[target_col] >= threshold]

    # 나눈 후의 지니 지수를 계산합니다.
    weight_left = len(data_left) / len(data)
    weight_right = len(data_right) / len(data)
    gini_index_after_split = weight_left * gini_index(
        data_left, target_class
    ) + weight_right * gini_index(data_right, target_class)

    # 정보 이득을 계산합니다.
    info_gain = total_gini_index - gini_index_after_split

    return info_gain

<font color='blue'>Cell 7
Establish optimal splits based on the best features, best cutoffs, and best information gains

In [22]:
def select_best_feature_and_cutoff(data, target_class="variety"):
    """
    데이터셋에서 가장 좋은 특징과 해당 특징을 나눌 최적의 임계값을 선택하는 함수

    Parameters:
    - data: 데이터셋
    - target_class: 타겟 클래스 (기본값은 "variety")

    Returns:
    - 최적의 특징, 최적의 임계값, 최대 정보 이득을 담은 리스트
    """
    feature_list = list(data.columns)[:4]  # 첫 4개의 열만 고려합니다.
    best_feature = None
    best_cutoff = 0.0
    best_info_gain = 0.0
    
    for feature in feature_list:
        max_value = data[feature].max()
        min_value = data[feature].min()
        for cutoff in np.arange(min_value, max_value, 0.1):
            info_gain = information_gain(data, feature, cutoff, target_class)
            if info_gain > best_info_gain:
                best_info_gain = info_gain
                best_cutoff = cutoff
                best_feature = feature

    return [best_feature, best_cutoff, best_info_gain]

In [7]:
def selectBestFeatureAndCutoff(data, target_class="variety"):
    featureList = list(data)[0:4]
    best_feature = "None"
    best_cutoff = 0.0
    best_info_gain = 0.0
    for feature in featureList:
        max_value = data[feature].max()
        min_value = data[feature].min()
        for cutoff in np.arange(min_value, max_value, 0.1):
            if best_info_gain < information_gain(data, feature, cutoff):
                best_info_gain = information_gain(data, feature, cutoff)
                best_cutoff = cutoff
                best_feature = feature

    return [best_feature, best_cutoff, best_info_gain]

<font color='blue'>Cell 8
Define the decision tree root (ie the first node), create the associated recursive splitting function, and create the associated prediction function


In [23]:
class Node:
    def __init__(self, feature, cut_off, label=None, is_leaf=False):
        """
        의사 결정 트리의 노드를 나타내는 클래스

        Parameters:
        - feature: 현재 노드의 특징
        - cut_off: 현재 노드의 임계값
        - label: 리프 노드의 클래스 레이블
        - is_leaf: 리프 노드 여부
        """
        self.feature = feature
        self.cut_off = cut_off
        self.left_child = None
        self.right_child = None
        self.is_leaf = is_leaf
        self.label = label


class DTree:
    def train(self, data):
        """
        의사 결정 트리를 훈련시키는 메서드

        Parameters:
        - data: 훈련 데이터셋
        """
        self.root = self.build_tree(data)

    def build_tree(self, data):
        """
        의사 결정 트리를 구축하는 메서드

        Parameters:
        - data: 현재 노드의 훈련 데이터셋

        Returns:
        - 구축된 의사 결정 트리의 루트 노드
        """
        best_feature, best_cutoff, best_info_gain = select_best_feature_and_cutoff(data)
        
        # 모든 데이터가 같은 레이블을 가지고 있으면 리프 노드를 반환합니다.
        if len(np.unique(data["variety"])) == 1:
            return Node(best_feature, best_cutoff, data["variety"].iloc[0], True)

        # 데이터를 분할합니다.
        data_left = data[data[best_feature] < best_cutoff]
        data_right = data[data[best_feature] >= best_cutoff]

        # 현재 노드를 생성합니다.
        current_node = Node(best_feature, best_cutoff)
        # 왼쪽 자식 노드를 추가합니다.
        current_node.left_child = self.build_tree(data_left)
        # 오른쪽 자식 노드를 추가합니다.
        current_node.right_child = self.build_tree(data_right)

        return current_node

    def predict(self, data):
        """
        의사 결정 트리를 사용하여 예측하는 메서드

        Parameters:
        - data: 예측할 데이터

        Returns:
        - 예측 결과
        """
        current_node = self.root
        while True:
            if current_node.is_leaf:
                return current_node.label
            
            # 다음 노드로 이동합니다.
            feature = current_node.feature
            cutoff = current_node.cut_off
            if data[feature] < cutoff:
                current_node = current_node.left_child
            else:
                current_node = current_node.right_child

In [8]:
class Node:
    def __init__(self, feature, cut_off, label=None, is_leaf=False):
        self.feature = feature
        self.cut_off = cut_off
        self.left_child = None
        self.right_child = None
        self.is_leaf = is_leaf
        self.label = label
        # print("node's label: ")
        # print(self.label)


class DTree:
    # method to train a decision tree
    def train(self, data):
        self.root = self.build_tree(data)

    # method to build decision tree
    def build_tree(self, data):
        best_feature, best_cutoff, best_info_gain = selectBestFeatureAndCutoff(data)
        # if all data has the same label , we are at a leaf node
        if len(np.unique(data["variety"])) == 1:
            # print(data["variety"].iloc[0])
            return Node(best_feature, best_cutoff, data["variety"].iloc[0], True)

        # if we are not the leaf
        # first lets split data
        data_left = data[data[best_feature] < best_cutoff]
        data_right = data[data[best_feature] >= best_cutoff]

        # build current node
        current_node = Node(best_feature, best_cutoff)
        # add left node
        current_node.left_child = self.build_tree(data_left)
        # add right node
        current_node.right_child = self.build_tree(data_right)

        return current_node

    # Make a prediction with a decision tree
    def predict(self, data):
        current_node = self.root
        while True:
            # if we are at the leaf node , return label
            if current_node.is_leaf == True:
                return current_node.label
            # otherwise we need figure out where to go next
            feature = current_node.feature
            cutoff = current_node.cut_off
            if data[feature] < cutoff:
                current_node = current_node.left_child
            else:
                current_node = current_node.right_child

<font color='blue'>Cell 9
Train the decision tree

In [24]:
d_tree = DTree()  # 의사 결정 트리 객체 생성
training_data = testtrain[0]  # 훈련 데이터 가져오기
d_tree.train(training_data)  # 의사 결정 트리 훈련


<font color='blue'>Cell 10
Define the confusion matrix

In [26]:
def print_confusion_matrix(result):
    count_SS, count_SVi, count_SVe, count_ViVi, count_ViVe, count_ViS, count_VeVe, count_VeVi, count_VeS = result[:9]
    
    # 각 클래스별 정확도 계산
    observed_setosa = count_SS / (count_SS + count_ViS + count_VeS)
    observed_virginica = count_VeVi / (count_SVi + count_ViVi + count_VeVi)
    observed_versicolor = count_ViVi / (count_SVe + count_ViVe + count_VeVe)
    
    # 데이터프레임 생성
    data = {
        "predict_Observe": ["Setosa (predict)", "Virginica (predict)", "Versicolor (predict)"],
        "Setosa (observed)": [observed_setosa, count_SVi / (count_SVi + count_ViVi + count_VeVi), count_SVe / (count_SVe + count_ViVe + count_VeVe)],
        "Virginica (observed)": [count_ViS / (count_SS + count_ViS + count_VeS), observed_virginica, count_ViVe / (count_SVe + count_ViVe + count_VeVe)],
        "Versicolor (observed)": [count_VeS / (count_SS + count_ViS + count_VeS), count_VeVi / (count_SVi + count_ViVi + count_VeVi), observed_versicolor],
    }

    output = pd.DataFrame(data)
    return output


In [17]:
# def print_ConfusionMatrix(result):
#     count_SS = result[0]
#     count_SVi = result[1]
#     count_SVe = result[2]
#     count_ViVi = result[3]
#     count_ViVe = result[4]
#     count_ViS = result[5]
#     count_VeVe = result[6]
#     count_VeVi = result[7]
#     count_VeS = result[8]
#     data = {
#         "predict_Observe": ["Setosa (predict)", "Virginica (predict)", "Versicolor (predict)"],
#         "Setosa (observed)": [
#             count_SS / (count_SS + count_ViS + count_VeS),
#             count_SVi / (count_SVi + count_ViVi + count_VeVi),
#             count_SVe / (count_SVe + count_ViVe + count_VeVe),
#         ],
#         "Virginica (observed)": [
#             count_ViS / (count_SS + count_ViS + count_VeS),
#             count_ViVi / (count_SVi + count_ViVi + count_VeVi),
#             count_ViVe / (count_SVe + count_ViVe + count_VeVe),
#         ],
#         "Versicolor (observed)": [
#             count_VeS / (count_SS + count_ViS + count_VeS),
#             count_VeVi / (count_SVi + count_ViVi + count_VeVi),
#             count_VeVe / (count_SVe + count_ViVe + count_VeVe),
#         ],
#     }

#     output = pd.DataFrame(
#         data,
#         columns=[
#             "predict_Observe",
#             "Setosa (observed)",
#             "Virginica (observed)",
#             "Versicolor (observed)",
#         ],
#     )
#     return output

<font color='blue'>Cell 11
Create the confusion matrix

In [25]:
# def predict_batch(data, training_data):
#     """
#     데이터셋에 대한 배치 예측을 수행하고, 정확도와 오분류된 샘플 수를 계산하는 함수

#     Parameters:
#     - data: 예측할 데이터셋
#     - training_data: 훈련 데이터셋

#     Returns:
#     - 각 클래스별 정확도와 총 정확도, 오분류된 샘플 수를 담은 리스트
#     """
#     d_tree = DTree()  # 의사 결정 트리 객체 생성
#     d_tree.train(training_data)  # 훈련 데이터로 트리 훈련

#     # 각 클래스별 정확도와 총 정확도, 오분류된 샘플 수를 저장할 변수 초기화
#     counts = {
#         "Setosa": {"SS": 0, "SVi": 0, "SVe": 0},
#         "Versicolor": {"ViVi": 0, "ViVe": 0, "ViS": 0},
#         "Virginica": {"VeVe": 0, "VeVi": 0, "VeS": 0},
#         "total_T": 0,
#         "total_F": 0,
#     }

#     for i in range(data.shape[0]):
#         instance = data.iloc[i]
#         true_label = instance["variety"]
#         predict_label = d_tree.predict(instance)

#         if true_label == predict_label:
#             counts[true_label]["total_T"] += 1
#             counts["total_T"] += 1
#         else:
#             counts[true_label][f"{true_label[0]}{predict_label}"] += 1
#             counts["total_F"] += 1
#     return [
#         counts["Setosa"]["SS"],
#         counts["Setosa"]["SVi"],
#         counts["Setosa"]["SVe"],
#         counts["Versicolor"]["ViVi"],
#         counts["Versicolor"]["ViVe"],
#         counts["Versicolor"]["ViS"],
#         counts["Virginica"]["VeVe"],
#         counts["Virginica"]["VeVi"],
#         counts["Virginica"]["VeS"],
#         counts["total_T"],
#         counts["total_F"],
#     ]


In [30]:
def predict_batch(data):
    d_tree = DTree()
    d_tree.train(training_data)
    count_SS = 0
    count_SVi = 0
    count_SVe = 0
    count_ViVi = 0
    count_ViS = 0
    count_ViVe = 0
    count_VeVe = 0
    count_VeS = 0
    count_VeVi = 0
    count_total_T = 0
    count_total_F = 0

    for i in range(data.shape[0]):
        instance = data.iloc[i]
        true_label = instance["variety"]
        predict_label = d_tree.predict(data.iloc[i])
        if true_label == predict_label:
            count_total_T = count_total_T + 1
            if true_label == "Setosa":
                count_SS = count_SS + 1
            elif true_label == "Versicolor":
                count_ViVi = count_ViVi + 1
            elif true_label == "Virginica":
                count_VeVe = count_VeVe + 1
        else:
            count_total_F = count_total_F + 1
            if true_label == "Setosa" and predict_label == "Virginica":
                count_SVi = count_SVi + 1
            elif true_label == "Setosa" and predict_label == "Versicolor":
                count_SVe = count_SVe + 1
            elif true_label == "Versicolor" and predict_label == "Virginica":
                count_VeVi = count_VeVi + 1
            elif true_label == "Versicolor" and predict_label == "Setosa":
                count_VeS = count_VeS + 1
            elif true_label == "Virginica" and predict_label == "Versicolor":
                count_ViVe = count_ViVe + 1
            elif true_label == "Virginica" and predict_label == "Setosa":
                count_ViS = count_ViS + 1

    return [
        count_SS,
        count_SVi,
        count_SVe,
        count_ViVi,
        count_ViVe,
        count_ViS,
        count_VeVe,
        count_VeVi,
        count_VeS,
        count_total_T,
        count_total_F,
    ]

<font color='blue'>Cell 12
Look at the confusion matrix for training data

In [31]:
training_data = testtrain[0]  # 훈련 데이터 가져오기
confusion_matrix = print_ConfusionMatrix(predict_batch(training_data))  # 혼동 행렬 생성
print(confusion_matrix)  # 혼동 행렬 출력

        predict_Observe  Setosa (observed)  Virginica (observed)  \
0      Setosa (predict)                1.0                   0.0   
1   Virginica (predict)                0.0                   1.0   
2  Versicolor (predict)                0.0                   0.0   

   Versicolor (observed)  
0                    0.0  
1                    0.0  
2                    1.0  


<font color='blue'>Cell 13
Look at the confusion matrix for testing data

In [13]:
testing_data = testtrain[1]
print_ConfusionMatrix(predict_batch(testing_data))

Unnamed: 0,predict\Observe,Setosa (observed),Virginica (observed),Versicolor (observed)
0,Setosa (predict),1.0,0.0,0.0
1,Virginica (predict),0.0,0.882353,0.117647
2,Versicolor (predict),0.0,0.083333,0.916667


<font color='blue'>Cell 14
Function to make predictions

In [14]:
# method that run prediction
def predict(d_tree, sepal_length, sepal_width, petal_length, petal_width):
    test_data = pd.Series(
        [sepal_length, sepal_width, petal_length, petal_width],
        index=["sepal.length", "sepal.width", "petal.length", "petal.width"],
    )
    return d_tree.predict(test_data)

<font color='blue'> Exercise 2.1


In [16]:
def input_test_seq():
    sepal_length = float(input("Enter the Sepal length in cm :"))
    while True:
        if float(sepal_length) < 0 or float(sepal_length) > 10:
            print("Inalid Entry. Please enter value less than 10")
            sepal_length = float(input("Enter the sepal length in cm :"))
            continue
        else:
            break

    sepal_width = float(input("Enter the Sepal width in cm :"))
    while True:
        if float(sepal_width) < 0 or float(sepal_width) > 10:
            print("Inalid Entry. Please enter value less than 10")
            sepal_width = float(input("Enter the sepal width in cm :"))
            continue
        else:
            break

    petal_length = float(input("Enter the petal length in cm :"))
    while True:
        if float(petal_length) < 0 or float(petal_length) > 10:
            print("Inalid Entry. Please enter value less than 10")
            petal_length = float(input("Enter the petal length in cm :"))
            continue
        else:
            break

    petal_width = float(input("Enter the petal width in cm :"))
    while True:
        if float(petal_width) < 0 or float(petal_width) > 10:
            print("Inalid Entry. Please enter value less than 10")
            petal_width = float(input("Enter the petal width in cm :"))
            continue
        else:
            break

    predict_features = [sepal_length, sepal_width, petal_length, petal_width]
    result_category = predict(
        d_tree, predict_features[0], predict_features[1], predict_features[2], predict_features[3]
    )
    print("This flower is a ", result_category)

    return


input_test_seq()

This flower is a  Setosa
