In [1]:
import pandas as pd
import numpy as np
import math
from anytree import Node, RenderTree, find, Walker,DoubleStyle,LevelOrderIter,findall

In [2]:
class Feature:
    """Record of one dataset column and the measures computed for it.

    Attributes:
        name: column name.
        unique: the distinct values seen in that column.
        info: information (entropy) value assigned by the tree builder.
        gain: information gain relative to the label column.
        split_info: entropy of the column's own value distribution.
        gain_ratio: gain divided by split_info; always starts at 0.0.
        dataset: optional frame attached to the feature (the ``df`` argument).
    """

    def __init__(self, name=None, unique=None,info=0.0,
                 df=None,gain=0.0,split_info=0.0):
        # Identity of the column.
        self.name, self.unique = name, unique
        # Measures; gain_ratio is not a constructor argument and starts at zero.
        self.info, self.gain, self.split_info = info, gain, split_info
        self.gain_ratio = 0.0
        # Optional data attached by the caller.
        self.dataset = df

In [None]:
# class FeatureVector:
#     def __init__(feature=None,label=None,count=0):
#         self.feature = feature
#         self.label = label
#         self.count = count

In [78]:
class C45:
    """C4.5-style decision tree over categorical columns, stored as an anytree tree.

    Layout built by :meth:`create_tree`: the root and every ``DECISION`` node
    are named after a feature; their children are ``VALUE`` nodes named after
    one value of that feature and carrying the matching sub-dataset; every
    ``VALUE`` node gets exactly one child, either a ``DECISION`` node or a
    ``LABEL`` leaf named after the predicted class. When no feature has a
    positive gain ratio on the full training set, the tree is a single
    ``LABEL`` node.
    """

    def __init__(self):
        self.except_features = []   # column names excluded from training
        self.feature_list = {}      # column name -> Feature (see identify_feature)
        self.selected_feature = []
        self.tree = None            # root node, set by create_tree()
        # Tags stored in each node's ``type`` attribute.
        self.ROOT = 'root'
        self.LABEL = 'label'
        self.DECISION = 'class'
        self.VALUE = 'value'

    def read_csv(self,filename):
        """Load the ';'-separated training file; the last column is the label."""
        df = pd.read_table(filename, sep=';', engine='python')
        self.label_name = df.columns[-1]
        self.number_of_entries = len(df)
        self.df = df

    def remove_feature(self,feature):
        """Exclude ``feature`` from training (the column itself stays in ``self.df``)."""
        if feature in self.except_features:
            print(f'{feature} is removed!')
            return
        self.except_features.append(feature)
        self.features = [item for item in self.df.columns if item not in self.except_features]

    def identify_feature(self):
        """Add a Feature for every non-excluded column of ``self.df`` to ``self.feature_list``."""
        self.feature_list.update(self.find_feature(self.df))

    def log2(self,x):
        """Base-2 logarithm with the entropy convention that log2(0) is 0."""
        if x == 0:
            return 0
        return math.log(x,2)

    def calc_info(self,featureObj,labelObj,df):
        """Return an entropy measure computed on ``df``.

        When ``featureObj`` and ``labelObj`` are the same object, the result is
        the entropy of that column's value distribution. Otherwise it is the
        weighted entropy of ``labelObj`` after partitioning ``df`` by the
        values of ``featureObj``. An empty ``df`` yields 0.0.
        """
        number_of_entries = len(df)
        if number_of_entries == 0:
            # Nothing to measure; also avoids dividing by zero below.
            return 0.0

        if featureObj is labelObj:
            info = 0.0
            for label_value in labelObj.unique:
                occur = int((df[labelObj.name] == label_value).sum())
                valueP = float(occur)/number_of_entries
                info = info - (valueP * self.log2(valueP))
            return info

        sum_info = 0.0
        for feature_value in featureObj.unique:
            in_partition = df[featureObj.name] == feature_value
            Dj = int(in_partition.sum())
            if Dj == 0:
                # An empty partition has weight 0 and contributes nothing.
                continue
            classP = float(Dj)/number_of_entries
            info = 0.0
            for label_value in labelObj.unique:
                occur = int((in_partition & (df[labelObj.name] == label_value)).sum())
                valueP = float(occur)/Dj
                info = info - (valueP * self.log2(valueP))
            sum_info = sum_info + classP * info
        return sum_info

    def find_feature(self,dataset):
        """Return ``{column: Feature}`` for every non-excluded column of ``dataset``."""
        return {
            col: Feature(name=col,unique=dataset[col].unique())
            for col in dataset
            if col not in self.except_features
        }

    def best_feature(self,feature_list):
        """Return the feature with the highest positive gain ratio, or -99 if none.

        The -99 sentinel is kept because callers compare the result against it.
        On ties the first feature in iteration order wins.
        """
        bestFeature = None
        bestGain = 0.0
        for featureObj in feature_list.values():
            if featureObj.gain_ratio > bestGain:
                bestFeature = featureObj
                bestGain = featureObj.gain_ratio
        if bestFeature is None:
            return -99
        return bestFeature

    def find_best_features(self,feature_list,df):
        """Fill info, split_info, gain and gain_ratio on every feature, in place.

        ``feature_list`` must contain the label column under ``self.label_name``.
        Prints one summary line per non-label feature and returns the same dict.
        """
        labelObj = feature_list[self.label_name]
        for featureObj in feature_list.values():
            featureObj.info = self.calc_info(featureObj,labelObj,df)
            featureObj.split_info = self.calc_info(featureObj,featureObj,df)
        for featureObj in feature_list.values():
            if featureObj is labelObj:
                continue
            featureObj.gain = labelObj.info - featureObj.info
            # A zero split_info (single-valued column) would divide by zero.
            if featureObj.gain != 0.0 and featureObj.split_info != 0.0:
                featureObj.gain_ratio = featureObj.gain / featureObj.split_info
            else:
                featureObj.gain_ratio = 0.0
            print(f'{featureObj.name} info={featureObj.info:.4f} gain={featureObj.gain:.4f} split_info={featureObj.split_info:.4f} gain_ratio={featureObj.gain_ratio:.4f}')
        return feature_list

    def find_best_label(self,labelObj,df):
        """Return the most frequent label value in ``df`` (first one wins a tie)."""
        bestLabel = labelObj.unique[0]
        bestCount = 0
        for value in labelObj.unique:
            newCount = int((df[labelObj.name] == value).sum())
            if newCount > bestCount:
                # Track the running maximum so the majority class wins.
                bestLabel = value
                bestCount = newCount
        return bestLabel

    def split_dataset(self,name,value,dataset):
        """Return the rows where column ``name`` equals ``value``, without that column."""
        dataset = dataset.loc[(dataset[name]==value)]
        dataset = dataset.drop(name,axis=1)
        return dataset

    def create_value_node(self,feature,df,currentNode):
        """Attach one VALUE child per value of ``feature`` under ``currentNode``."""
        for value in feature.unique:
            dataset = self.split_dataset(feature.name,value,df)
            Node(value,parent=currentNode,dataset=dataset,type=self.VALUE)

    def create_tree(self):
        """Grow the tree from ``self.df``, store it in ``self.tree`` and return the root.

        Progress (nodes, sub-datasets, chosen features, intermediate trees) is
        printed while the tree is grown.
        """
        print('Identifing first feature...')
        feature_list = self.find_feature(self.df)
        feature_list = self.find_best_features(feature_list,self.df)
        bestFeature = self.best_feature(feature_list)
        if bestFeature == -99:
            # No feature separates the classes: the tree is one majority leaf.
            labelObj = feature_list[self.label_name]
            best_label = self.find_best_label(labelObj,self.df)
            print(f'Selected label: {best_label}')
            root = Node(best_label,type=self.LABEL)
            self.tree = root
            self.display_tree()
            return root

        root = Node(bestFeature.name,type=self.ROOT)
        print(f'Best feature: {bestFeature.name}')
        self.create_value_node(bestFeature,self.df,root)
        self.tree = root
        self.display_tree()
        # Children added while iterating are picked up by the level-order walk,
        # so every VALUE node created below is eventually expanded too.
        for node in LevelOrderIter(root):
            print('=================================')
            print(f'Node: {node.name} Type:{node.type}')
            if node is root or node.type in (self.LABEL, self.DECISION):
                print(f'Skip {node.name} {node.type}')
                continue
            print(node.dataset)
            feature_list = self.find_feature(node.dataset)
            feature_list = self.find_best_features(feature_list,node.dataset)
            bestFeature = self.best_feature(feature_list)
            if(bestFeature != -99):
                print(f"Best feature: {bestFeature.name}")
                newNode = Node(bestFeature.name,parent=node,type=self.DECISION)
                self.create_value_node(bestFeature,node.dataset,newNode)
            else:
                labelObj = feature_list[self.label_name]
                best_label = self.find_best_label(labelObj,node.dataset)
                print(f'Selected label: {best_label}')
                Node(best_label,parent=node,type=self.LABEL)
            self.display_tree()
        return root

    def read_testset(self,file):
        """Read a ';'-separated test file.

        Returns ``(label_name, number_of_entries, df, features)`` where the
        label is the last column and ``features`` is every other column.
        """
        df = pd.read_table(file, sep=';', engine='python')
        label_name = df.columns[-1]
        number_of_entries = len(df)
        features = [item for item in df.columns if item != label_name]
        return label_name,number_of_entries,df,features

    def check_value(self,currentNode,data):
        """Return the child of ``currentNode`` matching the row's value for that feature.

        A LABEL node is returned unchanged; ``None`` is returned when no child
        matches. Values are compared as strings.
        """
        if currentNode.type == self.LABEL:
            return currentNode
        for child in currentNode.children:
            if(str(data[currentNode.name]) == str(child.name)):
                return child
        return None

    def get_label(self,data):
        """Walk the tree for one row and return the predicted label.

        Raises:
            RuntimeError: if the tree has not been built yet.
            LookupError: if the row holds a value with no branch in the tree
                (KeyError, a LookupError subclass, if the row lacks a column).
        """
        if self.tree is None:
            raise RuntimeError('create_tree() must be called before predicting')
        currentNode = self.tree
        while True:
            # A LABEL node is a leaf: its name is the prediction.
            if currentNode.type == self.LABEL:
                return currentNode.name
            valueNode = self.check_value(currentNode,data)
            if valueNode is None:
                raise LookupError(f'no branch under {currentNode.name!r} matches this row')
            # Each VALUE node has a single child (next decision or a leaf).
            currentNode = valueNode.children[0]

    def predict_file(self,file):
        """Predict every row of a test file.

        Returns ``(predictions, dataset)``. Rows that cannot be routed through
        the tree (unseen value or missing column) get the string 'No label'.
        """
        label_name,number_of_entries,dataset,features = self.read_testset(file)
        predictions = []
        print(f'Number of entries: {number_of_entries}')
        print(f'Label: {label_name}')
        print(f'Features: {features}')
        for index in dataset.index:
            data = dataset.loc[index]
            try:
                label = self.get_label(data)
            except LookupError:
                # Only routing failures are best-effort; other errors propagate.
                label = 'No label'
            predictions.append(label)
        return predictions,dataset

    def display_predictions(self,predictions,dataset):
        """Print the first column, the actual label and the prediction side by side."""
        first_col = dataset.columns[0]
        df = pd.DataFrame(columns=[first_col,self.label_name,'Predictions'])
        df['Predictions'] = predictions
        df[first_col]=dataset[first_col]
        df[self.label_name]=dataset[self.label_name]
        print(df)

    def display_tree(self):
        """Print the current tree, one node name per line."""
        for pre,_,node in RenderTree(self.tree,DoubleStyle):
            print("%s%s" % (pre, node.name))

    def display_feature_list(self):
        """Print every feature stored in ``self.feature_list``."""
        for feature in self.feature_list.values():
            print('======================================')
            print(f'Feature name: {feature.name}')
            print(f'Unique: {feature.unique}')
            print(f'Info Value: {feature.info}')
            print(f'Dataset: {feature.dataset}')
            print('======================================')

    def info(self):
        """Refresh ``self.features`` and print a summary of the loaded training data."""
        self.features = [item for item in self.df.columns if item not in self.except_features]
        print(f'Remove feature: {self.except_features}')
        print(f'Available feature: {self.features}')
        print(f'Number of entries: {self.number_of_entries}')

In [166]:
def check_value(currentNode,data):
    """Debug variant: pick the child of ``currentNode`` matching the row's value.

    Prints every comparison. A LABEL node is returned as-is; when no child
    matches, ``currentNode`` itself is returned so the caller can detect it.
    Relies on the notebook-level ``model`` for the LABEL tag.
    """
    if currentNode.type == model.LABEL:
        return currentNode
    for child in currentNode.children:
        observed = str(data[currentNode.name])
        candidate = str(child.name)
        print(f'Data:{observed} type({type(observed)}) compare child {candidate} type({type(candidate)})')
        if observed == candidate:
            return child
    # Fell through every child without a match.
    return currentNode

In [171]:
def get_label(data):
    """Debug variant of the prediction walk, printing each visited node.

    Returns the leaf's name when a LABEL node is reached. When the row's value
    has no matching branch, the node where the walk stopped is returned instead
    (check_value hands back the same node in that case).
    """
    node = model.tree
    print(node.name,node.type)
    while node.type != model.LABEL:
        branch = check_value(node,data)
        if(node == branch):
            # No child matched: stop here and hand the node back.
            return node
        # Step past the value node to its single child.
        node = branch.children[0]
        print(node.name,node.type)
    print('return')
    return node.name

In [178]:
# Collect the names of every node under `label`, in level order.
# NOTE(review): `label` must be a tree node here (LevelOrderIter walks it) —
# presumably the node returned by the debug get_label() when no branch
# matched; confirm before relying on this cell after a kernel restart.
labels = []
for node in LevelOrderIter(label):
    print(node.name,node.type)
    labels.append(node.name)

Temp. class
Mild value
Cool value
No label
Yes label


In [172]:
# Debug cell: load the golf test set and trace the prediction for one row
# using the module-level get_label() defined in an earlier cell.
label_name,number_of_entries,dataset,features = model.read_testset('golf_test.txt')
predictions = []
# print(f'Number of entries: {number_of_entries}')
# print(f'Label: {label_name}')
# print(f'Features: {features}')
# Row with index label 2 of the test frame.
data = dataset.loc[2]
# print(data)
label = get_label(data)
print(label)
# for index in dataset.index:
#     data = dataset.loc[index]
#     label = model.get_label(data)
#     print(label)

Wind root
Data:Weak type(<class 'str'>) compare child Strong type(<class 'str'>)
Data:Weak type(<class 'str'>) compare child Weak type(<class 'str'>)
Outlook class
Data:Sunny type(<class 'str'>) compare child Overcast type(<class 'str'>)
Data:Sunny type(<class 'str'>) compare child Rain type(<class 'str'>)
Data:Sunny type(<class 'str'>) compare child Sunny type(<class 'str'>)
Temp. class
Data:Hot type(<class 'str'>) compare child Mild type(<class 'str'>)
Data:Hot type(<class 'str'>) compare child Cool type(<class 'str'>)
Node('/Wind/Weak/Outlook/Sunny/Temp.', type='class')


In [76]:
# Show a copy of the first column of the most recently loaded `dataset`.
print(dataset[dataset.columns[0]].copy())

0    11
1    12
2    13
3    14
Name: RID, dtype: int64


In [82]:
# Predict every row of the golf test file and print actual vs. predicted labels.
predictions,dataset = model.predict_file('golf_test.txt')
model.display_predictions(predictions,dataset)

Number of entries: 3
Label: Decision
Features: ['Outlook', 'Temp.', 'Humidity', 'Wind']
    Outlook Decision Predictions
0      Rain      Yes         Yes
1  Overcast      Yes         Yes
2     Sunny       No    No label


In [None]:
# Build the model: load the training file, exclude the row-id column,
# print a summary, then grow and display the tree.
model = C45()
model.read_csv('golf_train.txt')
model.remove_feature('RID')
# model.identify_feature()
model.info()
print('============================================')
model.create_tree()
model.display_tree()

In [28]:
# Predict the student test file and print actual vs. predicted labels.
predictions,dataset = model.predict_file('student_test.txt')
model.display_predictions(predictions,dataset)

Number of entries: 4
Label: buys_computer
Features: ['RID', 'age', 'income', 'student', 'credit_rating']
RID buys_computer Predictions
11 yes yes
12 yes yes
13 yes yes
14 no no


In [29]:
# Same predictions, printed row by row: first column, actual label, prediction.
# `predictions` is a list indexed with the frame's index labels.
predictions,dataset = model.predict_file('student_test.txt')
for index in dataset.index:
    print(dataset[dataset.columns[0]][index],dataset[model.label_name][index],predictions[index])

Number of entries: 4
Label: buys_computer
Features: ['RID', 'age', 'income', 'student', 'credit_rating']
11 yes yes
12 yes yes
13 yes yes
14 no no


In [None]:
# Print the tree currently stored on the model.
model.display_tree()

In [None]:
def read_testset(file):
    """Read a ';'-separated test file.

    Returns a 4-tuple ``(label_name, number_of_entries, df, features)``: the
    name of the last column, the row count, the loaded frame, and the list of
    all column names except the last one.
    """
    frame = pd.read_table(file, sep=';', engine='python')
    target = frame.columns[-1]
    predictors = [column for column in frame.columns if column != target]
    return target,len(frame),frame,predictors

In [569]:
def check_value(currentNode,data):
    """Return the child of ``currentNode`` whose name equals the row's value.

    A LABEL node is returned unchanged. Names and values are compared as
    strings; ``None`` is returned when no child matches. Relies on the
    notebook-level ``model`` for the LABEL tag.
    """
    if currentNode.type == model.LABEL:
        return currentNode
    matches = (
        child for child in currentNode.children
        if str(data[currentNode.name]) == str(child.name)
    )
    # First match wins; no match yields None.
    return next(matches, None)

In [570]:
def get_label(data):
    """Walk ``model.tree`` for one row and return the name of the LABEL leaf reached."""
    node = model.tree
    # Descend until a leaf: pick the matching value node, then its first child.
    while node.type != model.LABEL:
        branch = check_value(node,data)
        node = branch.children[0]
    return node.name

In [573]:
def predict_file(file):
    """Predict every row of a test file with the module-level helpers.

    Prints a header plus the actual and predicted label of each row, and
    returns ``(predictions, dataset)``. A row whose prediction fails gets the
    string 'No label'.
    """
    label_name,number_of_entries,dataset,features = read_testset(file)
    predictions = []
    print(f'Number of entries: {number_of_entries}')
    print(f'Label: {label_name}')
    print(f'Features: {features}')
    for index in dataset.index:
        data = dataset.loc[index]
        try:
            label = get_label(data)
        except Exception:
            # Best-effort per row, but let KeyboardInterrupt/SystemExit through
            # (a bare ``except:`` would swallow those too).
            label = 'No label'
        predictions.append(label)
        print(f'Index: {index}')
        print(f'Actual: {data[model.label_name]}')
        print(f'Predicted: {label}')
    return predictions,dataset

In [575]:
# Run the module-level predict_file() on the breast-cancer test file.
predictions,dataset=predict_file('breast_test.txt')

Number of entries: 39
Label: Class
Features: ['age', 'menopause', 'tumor_size', 'inv_nodes', 'node_caps', 'deg_malig', 'breast', 'breast_quad', 'irradiat']
Index: 0
Actual: no_recurrence_events
Predicted: no_recurrence_events
Index: 1
Actual: no_recurrence_events
Predicted: recurrence_events
Index: 2
Actual: no_recurrence_events
Predicted: no_recurrence_events
Index: 3
Actual: no_recurrence_events
Predicted: no_recurrence_events
Index: 4
Actual: no_recurrence_events
Predicted: no_recurrence_events
Index: 5
Actual: no_recurrence_events
Predicted: No label
Index: 6
Actual: no_recurrence_events
Predicted: no_recurrence_events
Index: 7
Actual: no_recurrence_events
Predicted: No label
Index: 8
Actual: no_recurrence_events
Predicted: No label
Index: 9
Actual: no_recurrence_events
Predicted: no_recurrence_events
Index: 10
Actual: no_recurrence_events
Predicted: no_recurrence_events
Index: 11
Actual: no_recurrence_events
Predicted: No label
Index: 12
Actual: no_recurrence_events
Predicted: no_

In [None]:
# predictions = []
# currentNode = root
# root = model.tree
# found = False;
# for index in dataset.index:
#         data = dataset[currentNode.name][index]
#         found = False
#         for child in currentNode.children:
#             if (data == child.name):
#                 currentNode = child
#                 found = True
#                 break
#         if found:
#             currentNode = child
#         print(found)

In [None]:
def best_feature(feature_list):
    """Return the feature with the largest positive ``gain_ratio``.

    ``feature_list`` maps names to feature objects. The first feature wins a
    tie; the sentinel -99 is returned when no feature has a ratio above zero.
    """
    winner = None
    top_ratio = 0.0
    for candidate in feature_list.values():
        if not candidate.gain_ratio > top_ratio:
            continue
        winner, top_ratio = candidate, candidate.gain_ratio
    return -99 if winner is None else winner

In [None]:
def find_best_features(feature_list,df):
    """Fill info, split_info, gain and gain_ratio on every feature, in place.

    ``feature_list`` must contain the label column under ``model.label_name``.
    Prints one line per non-label feature and returns the same dict.
    """
    labelObj = feature_list[model.label_name]
    for featureObj in feature_list.values():
        featureObj.info = model.calc_info(featureObj,labelObj,df)
        featureObj.split_info = model.calc_info(featureObj,featureObj,df)
    for featureObj in feature_list.values():
        if featureObj is labelObj:
            continue
        featureObj.gain = labelObj.info - featureObj.info
        # A single-valued column has split_info 0; dividing would raise
        # ZeroDivisionError, so such features get a ratio of 0 (as in C45).
        if featureObj.gain != 0.0 and featureObj.split_info != 0.0:
            featureObj.gain_ratio = featureObj.gain / featureObj.split_info
        else:
            featureObj.gain_ratio = 0.0
        print(featureObj.name,featureObj.info,featureObj.gain,featureObj.split_info,featureObj.gain_ratio)
    return feature_list

In [None]:
# Tags stored in each tree node's `type` attribute by the module-level
# tree-building cells (same values as the attributes set in C45.__init__).
ROOT = 'root'
LABEL = 'label'
DECISION = 'class'
VALUE = 'value'

In [None]:
def find_feature(dataset):
    """Map each column of ``dataset`` not excluded on ``model`` to a new Feature."""
    return {
        column: Feature(name=column,unique=dataset[column].unique())
        for column in dataset
        if column not in model.except_features
    }

In [None]:
def find_best_label(labelObj,df):
    """Return the most frequent value of the label column in ``df``.

    ``labelObj`` provides the column ``name`` and its ``unique`` values. The
    first value wins a tie; if no value occurs, ``unique[0]`` is returned.
    """
    bestLabel = labelObj.unique[0]
    bestCount = 0
    for value in labelObj.unique:
        newCount = int((df[labelObj.name]==value).sum())
        if newCount > bestCount:
            # Keep the running maximum; without this update the last label
            # present would always win regardless of frequency.
            bestLabel = value
            bestCount = newCount
    return bestLabel

In [None]:
def split_dataset(name,value,dataset):
    """Return the rows of ``dataset`` where column ``name`` equals ``value``,
    with that column removed. The input frame is left untouched."""
    matching_rows = dataset[dataset[name] == value]
    return matching_rows.drop(columns=name)

In [None]:
def create_value_node(feature,df,currentNode):
    """Attach one VALUE node per value of ``feature`` under ``currentNode``.

    Each new node is named after the value and carries the rows of ``df``
    that have this value (with the feature column dropped).
    """
    for branch_value in feature.unique:
        Node(
            branch_value,
            parent=currentNode,
            dataset=split_dataset(feature.name,branch_value,df),
            type=VALUE,
        )

In [None]:
# Print the tree rooted at the notebook-level `root`, one node name per line.
for pre,_,node in RenderTree(root,DoubleStyle):
        print("%s%s" % (pre, node.name))

In [None]:
# Script version of the tree construction, using the module-level helpers.
# Step 1: pick the best feature on the full training frame and create the
# root with one VALUE child per value of that feature.
# feature_list = model.feature_list
feature_list = find_feature(model.df)
feature_list = find_best_features(feature_list,model.df)
bestFeature = best_feature(feature_list)
root = Node(bestFeature.name,type=ROOT)
for value in bestFeature.unique:
    dataset = split_dataset(bestFeature.name,value,model.df)
#     dataset = model.df.loc[(model.df[bestFeature.name]==value)]
#     dataset = dataset.drop(bestFeature.name,axis=1)
    newNode = Node(value,parent=root,dataset=dataset,type=VALUE)
# Step 2: walk the tree in level order while adding children to it. Only
# VALUE nodes are expanded; each gets either a DECISION node (with its own
# VALUE children) or a LABEL leaf when best_feature() returns -99.
for node in LevelOrderIter(root):
    print(f'Node: {node.name} Type:{node.type}')
    if node != root and node.type != LABEL and node.type != DECISION:
#         print(f'Node: {node.name}')
        print(node.dataset,node.type)
        feature_list = find_feature(node.dataset)
        print(f'Length: {len(feature_list)}')
        feature_list = find_best_features(feature_list,node.dataset)
        bestFeature = best_feature(feature_list)
        if(bestFeature != -99):
            print(f"Best feature: {bestFeature.name}")
            newNode = Node(bestFeature.name,parent=node,type=DECISION)
            create_value_node(bestFeature,node.dataset,newNode)
        else:
            # No feature with positive gain ratio: close the branch with a leaf.
            labelObj = feature_list[model.label_name]
            best_label = find_best_label(labelObj,node.dataset)
            print(f'Selected label: {best_label}')
            newNode = Node(best_label,parent=node,type=LABEL)
    print('=================================')

In [None]:
## developed for find_best_featrues method
# feature_list = model.feature_list
# labelObj = feature_list[model.label_name]
# selected_features = []
# for key in feature_list:
#     featureObj = feature_list[key]
#     featureObj.info = model.calc_info(featureObj,labelObj,model.df)
#     featureObj.split_info = model.calc_info(featureObj,featureObj,model.df)
# labelObj = feature_list[model.label_name]
# for key in feature_list:
#     featureObj = feature_list[key]
#     if featureObj == labelObj:
#         continue
#     featureObj.gain = labelObj.info - featureObj.info
#     featureObj.gain_ratio = featureObj.gain / featureObj.split_info
#     print(featureObj.name,featureObj.info,featureObj.gain,featureObj.split_info,featureObj.gain_ratio)
# bestFeature = best_feature(feature_list)
# selected_features.append(bestFeature)
# print('Best feature:',bestFeature.name)

In [None]:
## old find best features method
# def find_best_features(feature_list,labelObj,df):
#     for key in feature_list:
#         featureObj = feature_list[key]
#         featureObj.info = model.calc_info(featureObj,labelObj,df)
#         featureObj.split_info = model.calc_info(featureObj,featureObj,df)
#     for key in feature_list:
#         featureObj = feature_list[key]
#         if featureObj == labelObj:
#             continue
#         featureObj.gain = labelObj.info - featureObj.info
#         featureObj.gain_ratio = featureObj.gain / featureObj.split_info
#         print(featureObj.name,featureObj.info,featureObj.gain,featureObj.split_info,featureObj.gain_ratio)
#     bestFeature = best_feature(feature_list)
#     return bestFeature

In [None]:
# feature_list = model.feature_list
# feature_list = find_best_features(feature_list,model.df)
# bestFeature = best_feature(feature_list)
# print('Best feature:',bestFeature.name)

In [None]:
# Print the tree; for nodes that carry a `dataset` attribute, print it as well.
for pre,_,node in RenderTree(root,DoubleStyle):
    try:
        print("%s%s\n%s" % (pre, node.name,node.dataset))
    except AttributeError:
        # Only nodes created with dataset=... have the attribute; anything
        # else (e.g. KeyboardInterrupt) should not be swallowed here.
        print("%s%s" % (pre, node.name))

In [None]:
# For each value of the current best feature, show the matching training rows.
for value in bestFeature.unique:
    print(value)
    dataset = model.df.loc[(model.df[bestFeature.name]==value)]
    print(dataset)

In [None]:
# Display the name of the feature currently bound to `bestFeature`.
bestFeature.name

In [None]:
# Dump the notebook-level feature dictionary.
print(feature_list)

In [None]:
# Print the name of every feature in the notebook-level dictionary.
for key in feature_list:
    print(feature_list[key].name)

In [None]:
# Scratch computation of the label column's entropy over the training frame.
# NOTE(review): depends on `labelObj` and the module-level log2() being
# defined by other cells (log2 is defined further down in this notebook).
df = model.df
number_of_entries = len(df)
info = 0.0
for label_value in labelObj.unique:
    idxs = df[(df[labelObj.name]==label_value)].index
    occur = len(idxs)
    print('occur:',occur)
    # Relative frequency of this label value.
    valueP = float(occur)/number_of_entries
    info = info - (valueP * ( log2(valueP) ) )
print(info)

In [None]:
def log2(x):
    """Base-2 logarithm of ``x``, returning 0 for ``x == 0`` (entropy convention)."""
    return 0 if x == 0 else math.log(x,2)

In [None]:
def calc_info(featureObj,labelObj):
    """Weighted entropy of the label after partitioning ``model.df`` by a feature.

    For every value of ``featureObj`` the entropy of ``labelObj`` inside that
    partition is weighted by the partition's share of
    ``model.number_of_entries``; the sum over all values is returned.
    Prints the intermediate counts.
    """
    print(f'Processing {featureObj.name}')
    df = model.df
    sum_info = 0.0
    for feature_value in featureObj.unique:
        in_partition = df[featureObj.name]==feature_value
        Dj = int(in_partition.sum())
        # Compute the weight before printing so the line shows this value's share.
        classP = float(Dj)/model.number_of_entries
        print(f'Occurance: {Dj}, ClassP: {classP}')
        if Dj == 0:
            # Empty partition: weight 0, and dividing by Dj would fail.
            continue
        info = 0.0
        for label_value in labelObj.unique:
            occur = int((in_partition & (df[labelObj.name]==label_value)).sum())
            print(f'{feature_value} {label_value} {occur}/{Dj}')
            valueP = float(occur)/Dj
            info = info - (valueP * ( log2(valueP) ) )
        sum_info = sum_info + classP * info
    print(f'Info {featureObj.name}(D) = {sum_info}')
    print('========================================================')
    # Return the weighted total, not the entropy of the last partition only.
    return sum_info

In [None]:
# Store the info value of every feature, computed on the full training frame
# against the label column.
labelObj = feature_list[model.label_name]
for key in feature_list:
    featureObj = feature_list[key]
    featureObj.info = model.calc_info(featureObj,labelObj,model.df)

In [None]:
# Scratch computation of Info_age(D) for the student data set: the entropy of
# 'buys_computer' inside each 'age' partition, weighted by partition size.
# Requires model.feature_list to be populated and the module-level log2().
featureObj = model.feature_list['age']
labelObj = model.feature_list['buys_computer']
df = model.df
print(f'Processing {featureObj.name}')
sum_info = 0.0
for feature_value in featureObj.unique:
    info = 0.0
    idxs = df[(df[featureObj.name]==feature_value)].index
    Dj = len(idxs)
    # Weight of this partition; computed before it is printed.
    classP = float(Dj)/model.number_of_entries
    print(f'Occurance: {Dj}, ClassP: {classP}')
#     print(f'{Dj}/{model.number_of_entries}')
    for label_value in labelObj.unique:
        idxs = df[(df[featureObj.name]==feature_value) & (df[labelObj.name]==label_value)].index
        occur = len(idxs)
        valueP = float(occur)/Dj
        info = info - (valueP * ( log2(valueP) ) )
        print(feature_value,label_value,occur)
    split_info = classP * info
    sum_info = sum_info + split_info
print(f'Info {featureObj.name}(D) = {sum_info}')
print('========================================================')

In [None]:
# Attach to each feature a two-column frame: the feature column and the label.
# NOTE(review): `label_name` is a notebook-level variable set in another cell.
df = model.df
feature_list = model.feature_list
for key in feature_list:
    print(key)
    subdf = df[[key,label_name]]
    feature_list[key].dataset = subdf

In [None]:
# Scratch cell: for every non-label feature, print each of its values followed
# by the (feature, label) two-column frame. `feaVec` is created but not filled.
feaVec=[]
feature_list = model.feature_list
label_name = model.label_name
df = model.df
for key in feature_list:
    if key != label_name:
        print(f'---{key}---')
        feature_obj = feature_list[key]
        for feature_value in feature_obj.unique:
                print(feature_value)
#                 idxs = df[(df[feature_obj.name]==feature_value) & (df[label_name]==label_value)].index
#                 subdf = df.loc[(df[feature_obj.name]==feature_value) & (df[label_name]==label_value)]
                # The same frame is printed for every value; it is not filtered.
                subdf = df[[feature_obj.name,label_name]]
                print(subdf)
#                 count = len(dataset)
#                 probability = float(count/model.number_of_entries)
#                 print(feature_value,label_value,probability)

Development scratch: reading the training file

In [None]:
# Prototype of C45.read_csv / remove_feature: load the student training file,
# take the last column as the label and mark 'RID' as excluded.
except_features = []
df = pd.read_table('student_train.txt', sep=';', engine='python')
label_name = df.columns[-1]
except_features.append('RID')
number_of_entries = len(df)

In [None]:
# Print what the previous cell loaded.
# NOTE(review): `number_of_features` is not assigned in any cell visible in
# this notebook; this line raises NameError on a fresh kernel — confirm the
# intended value (possibly a count of df.columns).
print(except_features)
print(number_of_features,label_name)
print(df.columns)
print(number_of_entries)

Identify Feature

In [None]:
# Prototype of C45.identify_feature: one Feature per non-excluded column,
# holding the column's distinct values.
feature_list={}
for col in df:
    if col not in except_features:
        feature = Feature(name=col,unique=df[col].unique())
        feature_list[col] = feature
#     feature_list[col] = df[col].unique()

In [None]:
# Print the distinct values recorded for every feature.
for key in feature_list:
    print(feature_list[key].unique)

Develop feature vector to count the gain

In [None]:
# Enumerate every (feature value, label value) pair for the non-label
# features. `feaVec` is created but nothing is appended to it here.
feaVec=[]
for key in feature_list:
    if key != label_name:
        feature_obj = feature_list[key]
        for feature_value in feature_obj.unique:
            for label_value in feature_list[label_name].unique:
                print(feature_value,label_value)

https://www.listendata.com/2019/07/how-to-filter-pandas-dataframe.html

solution
https://discuss.analyticsvidhya.com/t/how-to-resolve-python-error-cannot-compare-a-dtyped-int64-array-with-a-scalar-of-type-bool/73065

In [None]:
# Example of combining two boolean masks with `&` (each comparison must be
# parenthesised): index of the rows where age is "youth" and the label is 'no'.
feature_obj = feature_list['age']
df[(df[feature_obj.name]=="youth") & (df[label_name]=='no')].index
## use to split data
# df.loc[(df[feature_obj.name]=="youth") & (df[label_name]=='no')]