In [13]:
import pandas as pd
import numpy as np
import math
from anytree import Node, RenderTree, find, Walker,DoubleStyle,LevelOrderIter,findall

In [2]:
class Feature:
    def __init__(self, name=None, unique=None,info=0.0,
                 df=None,gain=0.0,split_info=0.0):
        
        self.name = name
        self.unique = unique
        self.info = info
        self.gain = gain
        self.split_info = split_info
        self.gain_ratio = 0.0
        self.dataset = df

In [4]:
# class FeatureVector:
#     def __init__(feature=None,label=None,count=0):
#         self.feature = feature
#         self.label = label
#         self.count = count

In [86]:
class C45:
    def __init__(self):
        self.except_features = []
        self.feature_list={}
        self.selected_feature=[]
        self.tree = None
        self.ROOT = 'root'
        self.LABEL = 'label'
        self.DECISION = 'class'
        self.VALUE = 'value'        
        
    def read_csv(self,filename):
        df = pd.read_table(filename, sep=';', engine='python')
        self.label_name = df.columns[-1]
        self.number_of_entries = len(df)
        self.df = df
        
    def remove_feature(self,feature):
        if feature not in self.except_features:
            self.except_features.append(feature)
            self.features = [item for item in self.df.columns if item not in self.except_features]
#             self.df = self.df.drop(feature,axis=1)
        else:
            print(f'{feature} is removed!')
        
    def identify_feature(self):
        except_features = self.except_features
        df = self.df
        for col in df:
            if col not in except_features:
                feature = Feature(name=col,unique=df[col].unique())
                self.feature_list[col] = feature
#         for key in self.feature_list:
#             subdf = self.df[[key,self.label_name]]
#             self.feature_list[key].dataset = subdf
    
    def log2(self,x):
        if x == 0:
            return 0
        else:
            return math.log(x,2)
        
    def calc_info(self,featureObj,labelObj,df):
#         print(f'Processing {featureObj.name}')
        number_of_entries = len(df)
        sum_info = 0.0
        classP = 0.0
        if featureObj == labelObj:
            info = 0.0
            for label_value in labelObj.unique:
                idxs = df[(df[labelObj.name]==label_value)].index
                occur = len(idxs)
#                 print('occur:',occur)
                valueP = float(occur)/number_of_entries
                info = info - (valueP * ( self.log2(valueP) ) )
            return info
        
        for feature_value in featureObj.unique:
            info = 0.0
            idxs = df[(df[featureObj.name]==feature_value)].index
            Dj = len(idxs)
#             print(f'Occurance: {Dj}, ClassP: {classP}')
            classP = float(Dj)/number_of_entries
            for label_value in labelObj.unique:
                idxs = df[(df[featureObj.name]==feature_value) & (df[labelObj.name]==label_value)].index
                occur = len(idxs)
#                 print(f'{feature_value} {label_value} {occur}/{Dj}')
                valueP = float(occur)/Dj
                info = info - (valueP * ( self.log2(valueP) ) )
            split_info = classP * info
            sum_info = sum_info + split_info
#         print(f'Info {featureObj.name}(D) = {sum_info}')
#         print('========================================================')
        return sum_info

    def find_feature(self,dataset):
        feature_list = {}
        for col in dataset:
            if col not in self.except_features:
                feature = Feature(name=col,unique=dataset[col].unique())
                feature_list[col] = feature
        return feature_list

    def best_feature(self,feature_list):
        bestGain = 0.0
        for key in feature_list: 
            featureObj = feature_list[key]
            if featureObj.gain_ratio > bestGain:
                bestFeature = featureObj
                bestGain = featureObj.gain_ratio
        if bestGain == 0.0:
            return -99
        return bestFeature
    
    def find_best_features(self,feature_list,df):
        labelObj = feature_list[self.label_name]
        for key in feature_list:
            featureObj = feature_list[key]
            featureObj.info = self.calc_info(featureObj,labelObj,df)
            featureObj.split_info = self.calc_info(featureObj,featureObj,df)
        labelObj = feature_list[self.label_name]
        for key in feature_list:
            featureObj = feature_list[key]
            if featureObj == labelObj:
                continue
            featureObj.gain = labelObj.info - featureObj.info
            if(featureObj.gain != 0.0):
                featureObj.gain_ratio = featureObj.gain / featureObj.split_info
            else:
                featureObj.gain_ratio = 0.0
            print(f'{featureObj.name} info={featureObj.info:.4f} gain={featureObj.gain:.4f} split_info={featureObj.split_info:.4f} gain_ratio={featureObj.gain_ratio:.4f}')
        return feature_list

    def find_best_label(self,labelObj,df):
        count = 0
        bestLabel = labelObj.unique[0]
        for value in labelObj.unique:
            idxs = df[(df[labelObj.name]==value)].index
            newCount = len(idxs)
            if newCount > count:
                bestLabel = value
        return bestLabel
    
    def split_dataset(self,name,value,dataset):
        dataset = dataset.loc[(dataset[name]==value)]
        dataset = dataset.drop(name,axis=1)
        return dataset
    
    def create_value_node(self,feature,df,currentNode):
        for value in feature.unique:
            dataset = self.split_dataset(feature.name,value,df)
            newNode = Node(value,parent=currentNode,dataset=dataset,type=self.VALUE)
            
    def create_tree(self):
        # feature_list = model.feature_list
        print('Identifing first feature...')
        feature_list = self.find_feature(self.df)
        feature_list = self.find_best_features(feature_list,self.df)
        bestFeature = self.best_feature(feature_list)
        root = Node(bestFeature.name,type=self.ROOT)
        print(f'Best feature: {bestFeature.name}')
        for value in bestFeature.unique:
            dataset = self.split_dataset(bestFeature.name,value,self.df)
        #     dataset = model.df.loc[(model.df[bestFeature.name]==value)]
        #     dataset = dataset.drop(bestFeature.name,axis=1)
            newNode = Node(value,parent=root,dataset=dataset,type=self.VALUE)
        self.tree = root
        self.display_tree()
        for node in LevelOrderIter(root):
            print('=================================')
            print(f'Node: {node.name} Type:{node.type}')
            if node != root and node.type != self.LABEL and node.type != self.DECISION:
        #         print(f'Node: {node.name}')
                print(node.dataset)
                feature_list = self.find_feature(node.dataset)
#                 print(f'Length: {len(feature_list)}')
                feature_list = self.find_best_features(feature_list,node.dataset)
                bestFeature = self.best_feature(feature_list)
                if(bestFeature != -99):
                    print(f"Best feature: {bestFeature.name}")
                    newNode = Node(bestFeature.name,parent=node,type=self.DECISION)
                    self.create_value_node(bestFeature,node.dataset,newNode)
                else:
                    labelObj = feature_list[model.label_name]
                    best_label = self.find_best_label(labelObj,node.dataset)
                    print(f'Selected label: {best_label}')
                    newNode = Node(best_label,parent=node,type=self.LABEL)
                self.display_tree()
            else:
                print(f'Skip {node.name} {node.type}')
                continue
        return root

    def display_tree(self):
        for pre,_,node in RenderTree(self.tree,DoubleStyle):
                print("%s%s" % (pre, node.name))    
        
    def display_feature_list(self):
        for key in feature_list:
            print('======================================')
            print(f'Feature name: {feature_list[key].name}')
            print(f'Unique: {feature_list[key].unique}')
            print(f'Info Value: {feature_list[key].entropy}')
            print(f'Dataset: {feature_list[key].dataset}')
            print('======================================')
                
    def info(self):
        self.features = [item for item in self.df.columns if item not in self.except_features]
        print(f'Remove feature: {self.except_features}')
        print(f'Available feature: {self.features}')
        print(f'Number of entries: {self.number_of_entries}')

In [87]:
model = C45()
model.read_csv('student_train.txt')
model.remove_feature('RID')
# model.identify_feature()
model.info()
print('============================================')
model.create_tree()
model.display_tree()

Remove feature: ['RID']
Available feature: ['age', 'income', 'student', 'credit_rating', 'buys_computer']
Number of entries: 10
Identifing first feature...
age info=0.6490 gain=0.3219 split_info=1.5219 gain_ratio=0.2115
income info=0.8755 gain=0.0955 split_info=1.5710 gain_ratio=0.0608
student info=0.8464 gain=0.1245 split_info=1.0000 gain_ratio=0.1245
credit_rating info=0.8797 gain=0.0913 split_info=0.8813 gain_ratio=0.1036
Best feature: age
age
╠══ youth
╠══ middle_aged
╚══ senior
Node: age Type:root
Skip age root
Node: youth Type:value
   RID  income student credit_rating buys_computer
0    1    high      no          fair            no
1    2    high      no     excellent            no
7    8  medium      no          fair            no
8    9     low     yes          fair           yes
income info=0.0000 gain=0.8113 split_info=1.5000 gain_ratio=0.5409
student info=0.0000 gain=0.8113 split_info=0.8113 gain_ratio=1.0000
credit_rating info=0.6887 gain=0.1226 split_info=0.8113 gain_rati

In [96]:
def read_testset(file):
    df = pd.read_table(file, sep=';', engine='python')
    label_name = df.columns[-1]
    number_of_entries = len(df)
    features = [item for item in df.columns if item != label_name]
    return label_name,number_of_entries,df,features

In [102]:
label_name,number_of_entries,dataset,features = read_testset('student_test.txt')
print(f'Number of entries: {number_of_entries}')
print(f'Label: {label_name}')
print(f'Features: {features}')
print(dataset)

Number of entries: 4
Label: buys_computer
Features: ['RID', 'age', 'income', 'student', 'credit_rating']
   RID          age  income student credit_rating buys_computer
0   11        youth  medium     yes     excellent           yes
1   12  middle_aged  medium      no     excellent           yes
2   13  middle_aged    high     yes          fair           yes
3   14       senior  medium      no     excellent            no


In [591]:
def best_feature(feature_list):
    bestGain = 0.0
    for key in feature_list: 
        featureObj = feature_list[key]
        if featureObj.gain_ratio > bestGain:
            bestFeature = featureObj
            bestGain = featureObj.gain_ratio
    if bestGain == 0.0:
        return -99
    return bestFeature

In [593]:
def find_best_features(feature_list,df):
    labelObj = feature_list[model.label_name]
    for key in feature_list:
        featureObj = feature_list[key]
        featureObj.info = model.calc_info(featureObj,labelObj,df)
        featureObj.split_info = model.calc_info(featureObj,featureObj,df)
    labelObj = feature_list[model.label_name]
    for key in feature_list:
        featureObj = feature_list[key]
        if featureObj == labelObj:
            continue
        featureObj.gain = labelObj.info - featureObj.info
        featureObj.gain_ratio = featureObj.gain / featureObj.split_info
        print(featureObj.name,featureObj.info,featureObj.gain,featureObj.split_info,featureObj.gain_ratio)
    return feature_list

In [615]:
ROOT = 'root'
LABEL = 'label'
DECISION = 'class'
VALUE = 'value'

In [595]:
def find_feature(dataset):
    feature_list = {}
    for col in dataset:
        if col not in model.except_features:
            feature = Feature(name=col,unique=dataset[col].unique())
            feature_list[col] = feature
    return feature_list

In [596]:
def find_best_label(labelObj,df):
    count = 0
    bestLabel = labelObj.unique[0]
    for value in labelObj.unique:
        idxs = df[(df[labelObj.name]==value)].index
        newCount = len(idxs)
        if newCount > count:
            bestLabel = value
    return bestLabel

In [597]:
def split_dataset(name,value,dataset):
    dataset = dataset.loc[(dataset[name]==value)]
    dataset = dataset.drop(name,axis=1)
    return dataset

In [598]:
def create_value_node(feature,df,currentNode):
    for value in feature.unique:
        dataset = split_dataset(feature.name,value,df)
        newNode = Node(value,parent=currentNode,dataset=dataset,type=VALUE)

In [626]:
for pre,_,node in RenderTree(root,DoubleStyle):
        print("%s%s" % (pre, node.name))

age
╠══ youth
║   ╚══ student
║       ╠══ no
║       ║   ╚══ no
║       ╚══ yes
║           ╚══ yes
╠══ middle_aged
║   ╚══ yes
╚══ senior
    ╚══ credit_rating
        ╠══ fair
        ║   ╚══ yes
        ╚══ excellent
            ╚══ no


In [625]:
# feature_list = model.feature_list
feature_list = find_feature(model.df)
feature_list = find_best_features(feature_list,model.df)
bestFeature = best_feature(feature_list)
root = Node(bestFeature.name,type=ROOT)
for value in bestFeature.unique:
    dataset = split_dataset(bestFeature.name,value,model.df)
#     dataset = model.df.loc[(model.df[bestFeature.name]==value)]
#     dataset = dataset.drop(bestFeature.name,axis=1)
    newNode = Node(value,parent=root,dataset=dataset,type=VALUE)
for node in LevelOrderIter(root):
    print(f'Node: {node.name} Type:{node.type}')
    if node != root and node.type != LABEL and node.type != DECISION:
#         print(f'Node: {node.name}')
        print(node.dataset,node.type)
        feature_list = find_feature(node.dataset)
        print(f'Length: {len(feature_list)}')
        feature_list = find_best_features(feature_list,node.dataset)
        bestFeature = best_feature(feature_list)
        if(bestFeature != -99):
            print(f"Best feature: {bestFeature.name}")
            newNode = Node(bestFeature.name,parent=node,type=DECISION)
            create_value_node(bestFeature,node.dataset,newNode)
        else:
            labelObj = feature_list[model.label_name]
            best_label = find_best_label(labelObj,node.dataset)
            print(f'Selected label: {best_label}')
            newNode = Node(best_label,parent=node,type=LABEL)
    print('=================================')

age 0.6935361388961918 0.2467498197744391 1.5774062828523452 0.15642756242117517
income 0.9110633930116763 0.029222565658954647 1.5566567074628228 0.01877264622241867
student 0.7884504573082896 0.15183550136234136 1.0 0.15183550136234136
credit_rating 0.8921589282623617 0.04812703040826927 0.9852281360342516 0.048848615511520595
Node: age Type:root
Node: youth Type:value
    income student credit_rating buys_computer
0     high      no          fair            no
1     high      no     excellent            no
7   medium      no          fair            no
8      low     yes          fair           yes
10  medium     yes     excellent           yes value
Length: 4
income 0.4 0.5709505944546686 1.5219280948873621 0.37514952012034747
student 0.0 0.9709505944546686 0.9709505944546686 1.0
credit_rating 0.9509775004326937 0.01997309402197489 0.9709505944546686 0.020570659450692974
Best feature: student
Node: middle_aged Type:value
    income student credit_rating buys_computer
2     high    

In [None]:
## developed for find_best_featrues method
# feature_list = model.feature_list
# labelObj = feature_list[model.label_name]
# selected_features = []
# for key in feature_list:
#     featureObj = feature_list[key]
#     featureObj.info = model.calc_info(featureObj,labelObj,model.df)
#     featureObj.split_info = model.calc_info(featureObj,featureObj,model.df)
# labelObj = feature_list[model.label_name]
# for key in feature_list:
#     featureObj = feature_list[key]
#     if featureObj == labelObj:
#         continue
#     featureObj.gain = labelObj.info - featureObj.info
#     featureObj.gain_ratio = featureObj.gain / featureObj.split_info
#     print(featureObj.name,featureObj.info,featureObj.gain,featureObj.split_info,featureObj.gain_ratio)
# bestFeature = best_feature(feature_list)
# selected_features.append(bestFeature)
# print('Best feature:',bestFeature.name)

In [274]:
## old find best features method
# def find_best_features(feature_list,labelObj,df):
#     for key in feature_list:
#         featureObj = feature_list[key]
#         featureObj.info = model.calc_info(featureObj,labelObj,df)
#         featureObj.split_info = model.calc_info(featureObj,featureObj,df)
#     for key in feature_list:
#         featureObj = feature_list[key]
#         if featureObj == labelObj:
#             continue
#         featureObj.gain = labelObj.info - featureObj.info
#         featureObj.gain_ratio = featureObj.gain / featureObj.split_info
#         print(featureObj.name,featureObj.info,featureObj.gain,featureObj.split_info,featureObj.gain_ratio)
#     bestFeature = best_feature(feature_list)
#     return bestFeature

In [463]:
# feature_list = model.feature_list
# feature_list = find_best_features(feature_list,model.df)
# bestFeature = best_feature(feature_list)
# print('Best feature:',bestFeature.name)

age 0.6935361388961918 0.2467498197744391 1.5774062828523452 0.15642756242117517
income 0.9110633930116763 0.029222565658954647 1.5566567074628228 0.01877264622241867
student 0.7884504573082896 0.15183550136234136 1.0 0.15183550136234136
credit_rating 0.8921589282623617 0.04812703040826927 0.9852281360342516 0.048848615511520595
Best feature: age


In [572]:
for pre,_,node in RenderTree(root,DoubleStyle):
    try:
        print("%s%s\n%s" % (pre, node.name,node.dataset))
    except:
        print("%s%s" % (pre, node.name))

Outlook
╠══ Sunny
   Temp. Humidity    Wind Decision
0    Hot     High    Weak       No
1    Hot     High  Strong       No
7   Mild     High    Weak       No
8   Cool   Normal    Weak      Yes
10  Mild   Normal  Strong      Yes
║   ╚══ Humidity
║       ╠══ High
  Temp.    Wind Decision
0   Hot    Weak       No
1   Hot  Strong       No
7  Mild    Weak       No
║       ║   ╚══ No
║       ╚══ Normal
   Temp.    Wind Decision
8   Cool    Weak      Yes
10  Mild  Strong      Yes
║           ╚══ Yes
╠══ Overcast
   Temp. Humidity    Wind Decision
2    Hot     High    Weak      Yes
6   Cool   Normal  Strong      Yes
11  Mild     High  Strong      Yes
12   Hot   Normal    Weak      Yes
║   ╚══ Yes
╚══ Rain
   Temp. Humidity    Wind Decision
3   Mild     High    Weak      Yes
4   Cool   Normal    Weak      Yes
5   Cool   Normal  Strong       No
9   Mild   Normal    Weak      Yes
13  Mild     High  Strong       No
    ╚══ Wind
        ╠══ Weak
  Temp. Humidity Decision
3  Mild     High      Yes
4

In [573]:
for value in bestFeature.unique:
    print(value)
    dataset = model.df.loc[(model.df[bestFeature.name]==value)]
    print(dataset)

AttributeError: 'int' object has no attribute 'unique'

In [20]:
bestFeature.name

'age'

In [14]:
print(feature_list)

{'income': <__main__.Feature object at 0x000001D6F20F4B50>, 'student': <__main__.Feature object at 0x000001D6CFF6AB80>, 'credit_rating': <__main__.Feature object at 0x000001D6F20F4F70>, 'buys_computer': <__main__.Feature object at 0x000001D6F20F42E0>}


In [917]:
for key in feature_list:
    print(feature_list[key].name)

age
income
student
credit_rating
buys_computer


In [733]:
df = model.df
number_of_entries = len(df)
info = 0.0
for label_value in labelObj.unique:
    idxs = df[(df[labelObj.name]==label_value)].index
    occur = len(idxs)
    print('occur:',occur)
    valueP = float(occur)/number_of_entries
    info = info - (valueP * ( log2(valueP) ) )
print(info)

14
occur: 5
occur: 9
0.9402859586706309


In [9]:
def log2(x):
    if x == 0:
        return 0
    else:
        return math.log(x,2)

In [664]:
def calc_info(featureObj,labelObj):
#     featureObj = model.feature_list['age']
#     labelObj = model.feature_list['buys_computer']
    print(f'Processing {featureObj.name}')
    df = model.df
    sum_info = 0.0
    classP = 0.0
    for feature_value in featureObj.unique:
        info = 0.0
        idxs = df[(df[featureObj.name]==feature_value)].index
        Dj = len(idxs)
        print(f'Occurance: {Dj}, ClassP: {classP}')
        classP = float(Dj)/model.number_of_entries
    #     print(f'{Dj}/{model.number_of_entries}')
        for label_value in labelObj.unique:
            idxs = df[(df[featureObj.name]==feature_value) & (df[labelObj.name]==label_value)].index
            occur = len(idxs)
            print(f'{feature_value} {label_value} {occur}/{Dj}')
            valueP = float(occur)/Dj
            info = info - (valueP * ( log2(valueP) ) )
        split_info = classP * info
        sum_info = sum_info + split_info
    print(f'Info {featureObj.name}(D) = {sum_info}')
    print('========================================================')
    return info

In [666]:
labelObj = feature_list[model.label_name]
for key in feature_list:
    featureObj = feature_list[key]
    featureObj.info = model.calc_info(featureObj,labelObj,model.df)

Processing age
Occurance: 5, ClassP: 0.0
youth no 3/5
youth yes 2/5
Occurance: 4, ClassP: 0.35714285714285715
middle_aged no 0/4
middle_aged yes 4/4
Occurance: 5, ClassP: 0.2857142857142857
senior no 2/5
senior yes 3/5
Info age(D) = 0.6935361388961918
Processing income
Occurance: 4, ClassP: 0.0
high no 2/4
high yes 2/4
Occurance: 6, ClassP: 0.2857142857142857
medium no 2/6
medium yes 4/6
Occurance: 4, ClassP: 0.42857142857142855
low no 1/4
low yes 3/4
Info income(D) = 0.9110633930116763
Processing student
Occurance: 7, ClassP: 0.0
no no 4/7
no yes 3/7
Occurance: 7, ClassP: 0.5
yes no 1/7
yes yes 6/7
Info student(D) = 0.7884504573082896
Processing credit_rating
Occurance: 8, ClassP: 0.0
fair no 2/8
fair yes 6/8
Occurance: 6, ClassP: 0.5714285714285714
excellent no 3/6
excellent yes 3/6
Info credit_rating(D) = 0.8921589282623617
Processing buys_computer
Occurance: 5, ClassP: 0.0
no no 5/5
no yes 0/5
Occurance: 9, ClassP: 0.35714285714285715
yes no 0/9
yes yes 9/9
Info buys_computer(D) = 

In [596]:
featureObj = model.feature_list['age']
labelObj = model.feature_list['buys_computer']
print(f'Processing {featureObj.name}')
sum_info = 0.0
for feature_value in featureObj.unique:
    info = 0.0
    idxs = df[(df[feature_obj.name]==feature_value)].index
    Dj = len(idxs)
    print(f'Occurance: {Dj}, ClassP: {classP}')
    classP = float(Dj)/model.number_of_entries
#     print(f'{Dj}/{model.number_of_entries}')
    for label_value in labelObj.unique:
        idxs = df[(df[feature_obj.name]==feature_value) & (df[label_name]==label_value)].index
        occur = len(idxs)
        valueP = float(occur)/Dj
        info = info - (valueP * ( log2(valueP) ) )
        print(feature_value,label_value,occur)
    split_info = classP * info
    sum_info = sum_info + split_info
print(f'Info {featureObj.name}(D) = {sum_info}')
print('========================================================')

Processing age
Occurance: 5, ClassP: 0.35714285714285715
youth no 3
youth yes 2
Occurance: 4, ClassP: 0.35714285714285715
middle_aged no 0
middle_aged yes 4
Occurance: 5, ClassP: 0.2857142857142857
senior no 2
senior yes 3
Info age(D) = 0.6935361388961918


In [509]:
df = model.df
feature_list = model.feature_list
for key in feature_list:
    print(key)
    subdf = df[[key,label_name]]
    feature_list[key].dataset = subdf

age
income
student
credit_rating
buys_computer


In [487]:
feaVec=[]
feature_list = model.feature_list
label_name = model.label_name
df = model.df
for key in feature_list:
    if key != label_name:
        print(f'---{key}---')
        feature_obj = feature_list[key]
        for feature_value in feature_obj.unique:
                print(feature_value)
#                 idxs = df[(df[feature_obj.name]==feature_value) & (df[label_name]==label_value)].index
#                 subdf = df.loc[(df[feature_obj.name]==feature_value) & (df[label_name]==label_value)]
                subdf = df[[feature_obj.name,label_name]]
                print(subdf)
#                 count = len(dataset)
#                 probability = float(count/model.number_of_entries)
#                 print(feature_value,label_value,probability)

---age---
youth


KeyError: "['youth'] not in index"

Developed for read file

In [272]:
except_features = []
df = pd.read_table('student_train.txt', sep=';', engine='python')
label_name = df.columns[-1]
except_features.append('RID')
number_of_entries = len(df)

In [287]:
print(except_features)
print(number_of_features,label_name)
print(df.columns)
print(number_of_entries)

['RID']
6 buys_computer
Index(['RID', 'age', 'income', 'student', 'credit_rating', 'buys_computer'], dtype='object')
14


Identify Feature

In [276]:
feature_list={}
for col in df:
    if col not in except_features:
        feature = Feature(name=col,unique=df[col].unique())
        feature_list[col] = feature
#     feature_list[col] = df[col].unique()

In [427]:
for key in feature_list:
    print(feature_list[key].unique)

['youth' 'middle_aged' 'senior']
['high' 'medium' 'low']
['no' 'yes']
['fair' 'excellent']
['no' 'yes']


Develop feature vector to count the gain

In [454]:
feaVec=[]
for key in feature_list:
    if key != label_name:
        feature_obj = feature_list[key]
        for feature_value in feature_obj.unique:
            for label_value in feature_list[label_name].unique:
                print(feature_value,label_value)

youth no
youth yes
middle_aged no
middle_aged yes
senior no
senior yes
high no
high yes
medium no
medium yes
low no
low yes
no no
no yes
yes no
yes yes
fair no
fair yes
excellent no
excellent yes


https://www.listendata.com/2019/07/how-to-filter-pandas-dataframe.html

solution
https://discuss.analyticsvidhya.com/t/how-to-resolve-python-error-cannot-compare-a-dtyped-int64-array-with-a-scalar-of-type-bool/73065

In [270]:
feature_obj = feature_list['age']
df[(df[feature_obj.name]=="youth") & (df[label_name]=='no')].index
## use to split data
# df.loc[(df[feature_obj.name]=="youth") & (df[label_name]=='no')]

Int64Index([0, 1, 7], dtype='int64')