In [1]:
import numpy as np
import pandas as pd
from numpy import log2 as log
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
eps = np.finfo(float).eps

In [2]:
# Load the UCI mushroom dataset straight from the archive URL. The file has no
# header row, so pandas assigns integer column labels; readable names are
# applied in a later cell.
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
df = pd.read_csv(data_url, header=None)

In [3]:
# Preview the raw frame (columns are still the integer labels from read_csv).
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [4]:
# Print the dataset description file.
# NOTE(review): this reads a local file; nothing visible in this notebook
# downloads it, so it presumably has to sit next to the notebook — confirm.
with open("agaricus-lepiota.names") as f:
    print(f.read())

1. Title: Mushroom Database

2. Sources: 
    (a) Mushroom records drawn from The Audubon Society Field Guide to North
        American Mushrooms (1981). G. H. Lincoff (Pres.), New York: Alfred
        A. Knopf
    (b) Donor: Jeff Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu)
    (c) Date: 27 April 1987

3. Past Usage:
    1. Schlimmer,J.S. (1987). Concept Acquisition Through Representational
       Adjustment (Technical Report 87-19).  Doctoral disseration, Department
       of Information and Computer Science, University of California, Irvine.
       --- STAGGER: asymptoted to 95% classification accuracy after reviewing
           1000 instances.
    2. Iba,W., Wogulis,J., & Langley,P. (1988).  Trading off Simplicity
       and Coverage in Incremental Concept Learning. In Proceedings of 
       the 5th International Conference on Machine Learning, 73-79.
       Ann Arbor, Michigan: Morgan Kaufmann.  
       -- approximately the same results with their HILLARY algorithm    
    3. In 

In [5]:
# Column names for the raw file, in file order: the class column first,
# followed by the 22 feature columns.
cols = [
    'target',
    'cap-shape',
    'cap-surface',
    'cap-color',
    'bruises',
    'odor',
    'gill-attachment',
    'gill-spacing',
    'gill-size',
    'gill-color',
    'stalk-shape',
    'stalk-root',
    'stalk-surface-above-ring',
    'stalk-surface-below-ring',
    'stalk-color-above-ring',
    'stalk-color-below-ring',
    'veil-type',
    'veil-color',
    'ring-number',
    'ring-type',
    'spore-print-color',
    'population',
    'habitat',
]

In [6]:
# Name the columns, move the class column to the end under the name 'label'
# (the tree code below treats the LAST column as the class), and drop rows
# containing NaN.
#
# The guard makes the cell safe to re-run: once 'label' exists, assigning
# `cols` again would silently attach the wrong name to every column.
# NOTE(review): dropna() only removes real NaN values; if the file marks
# missing entries with a placeholder character they are kept — confirm
# against the .names description.
if 'label' not in df.columns:
    named = df.copy()
    named.columns = cols
    df = (
        named[cols[1:] + ['target']]
        .rename(columns={'target': 'label'})
        .dropna()
    )
df

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,label
0,x,s,n,t,p,f,c,n,k,e,...,w,w,p,w,o,p,k,s,u,p
1,x,s,y,t,a,f,c,b,k,e,...,w,w,p,w,o,p,n,n,g,e
2,b,s,w,t,l,f,c,b,n,e,...,w,w,p,w,o,p,n,n,m,e
3,x,y,w,t,p,f,c,n,n,e,...,w,w,p,w,o,p,k,s,u,p
4,x,s,g,f,n,f,w,b,k,t,...,w,w,p,w,o,e,n,a,g,e
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,k,s,n,f,n,a,c,b,y,e,...,o,o,p,o,o,p,b,c,l,e
8120,x,s,n,f,n,a,c,b,y,e,...,o,o,p,n,o,p,b,v,l,e
8121,f,s,n,f,n,a,c,b,n,e,...,o,o,p,o,o,p,b,c,l,e
8122,k,y,n,f,y,f,c,n,b,t,...,w,w,p,w,o,e,w,v,l,p


In [7]:
df.label.value_counts(normalize=True) # accuracy is a usable metric here: the classes are not imbalanced

e    0.517971
p    0.482029
Name: label, dtype: float64

In [8]:
# Hold out 20% of the rows for evaluation. The seed is fixed so that the split
# (and every number derived from it) is reproducible across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [9]:
# Preview the training split (row labels keep their original index values).
train_df

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,label
5117,k,y,c,f,n,f,w,n,w,e,...,w,n,p,w,o,e,w,v,l,e
68,x,y,y,t,a,f,c,b,w,e,...,w,w,p,w,o,p,k,n,g,e
4886,x,y,y,f,f,f,c,b,g,e,...,n,n,p,w,o,l,h,v,g,p
3323,x,f,g,f,f,f,c,b,p,e,...,n,n,p,w,o,l,h,v,d,p
1612,x,f,n,f,n,f,c,n,p,e,...,w,w,p,w,o,p,k,y,u,e
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2355,x,f,e,t,n,f,c,b,n,t,...,p,g,p,w,o,p,k,v,d,e
2730,x,s,g,f,n,f,w,b,h,t,...,w,w,p,w,o,e,k,s,g,e
6752,b,f,g,f,n,f,w,b,w,e,...,w,w,p,w,t,p,w,s,g,e
5810,x,y,n,f,f,f,c,n,b,t,...,p,p,p,w,o,e,w,v,d,p


In [10]:
# Preview the held-out test split.
test_df

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,label
3166,x,f,g,f,f,f,c,b,p,e,...,p,p,p,w,o,l,h,v,p,p
4981,f,f,g,f,f,f,c,b,g,e,...,b,p,p,w,o,l,h,y,d,p
5930,x,y,n,t,n,f,c,b,w,e,...,w,w,p,w,t,e,w,c,w,e
5279,f,f,y,f,f,f,c,b,g,e,...,n,b,p,w,o,l,h,y,p,p
5574,x,s,b,t,f,f,c,b,h,t,...,w,w,p,w,o,p,h,s,g,p
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6234,f,s,e,f,f,f,c,n,b,t,...,w,w,p,w,o,e,w,v,l,p
8014,k,s,g,f,n,f,w,b,w,e,...,w,w,p,w,t,p,w,s,g,e
6771,f,y,e,f,f,f,c,n,b,t,...,p,w,p,w,o,e,w,v,l,p
7993,k,f,g,f,n,f,w,b,g,e,...,w,w,p,w,t,p,w,s,g,e


In [11]:
## ID3 decision tree, implemented from scratch (entropy / information gain)

In [12]:
def find_entropy(df):
    """Return the Shannon entropy, in bits, of the class column of ``df``.

    The class column is taken to be the last column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column holds the class labels.

    Returns
    -------
    float or int
        ``-sum(p * log2(p))`` over the distinct labels, where ``p`` is the
        share of rows carrying that label. ``0`` when ``df`` has no rows.
    """
    labels = df[df.keys()[-1]]
    # Count every label once up front instead of re-running value_counts()
    # for each distinct label inside the loop.
    counts = labels.value_counts()
    total = len(labels)

    entropy = 0
    for value in labels.unique():
        fraction = counts[value] / total
        entropy += -fraction * np.log2(fraction)
    return entropy

In [13]:
def find_entropy_attribute(df, attribute):
    """Return the class entropy of ``df`` conditioned on ``attribute``.

    For every distinct value of ``attribute`` the entropy of the class column
    (the last column of ``df``) is computed over the rows having that value;
    the per-value entropies are combined as a sum weighted by the share of
    rows in each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column holds the class labels.
    attribute : hashable
        Name of the column to condition on.

    Returns
    -------
    float or int
        Absolute value of the weighted entropy sum; ``0`` when ``df`` has no
        rows. Machine epsilon is added to the denominator and inside the
        logarithm, so results are approximate to roughly that precision.
    """
    class_col = df.keys()[-1]
    tiny = np.finfo(float).eps  # smoothing term: avoids 0/0 and log2(0)
    total_rows = len(df)

    attribute_column = df[attribute]
    class_column = df[class_col]

    # Positional NumPy masks: combining them does not depend on pandas index
    # alignment, unlike applying a full-length boolean Series to an already
    # filtered Series.
    class_masks = [
        (class_column == label).to_numpy() for label in class_column.unique()
    ]

    weighted_entropy = 0
    for variable in attribute_column.unique():
        in_group = (attribute_column == variable).to_numpy()
        den = np.count_nonzero(in_group)  # group size, computed once per value

        entropy = 0
        for is_class in class_masks:
            num = np.count_nonzero(in_group & is_class)
            fraction = num / (den + tiny)
            entropy += -fraction * np.log2(fraction + tiny)

        weighted_entropy += (den / total_rows) * entropy

    # The smoothing can push the sum marginally below zero; report magnitude.
    return abs(weighted_entropy)

In [14]:
def find_winner(df):
    """Return the name of the attribute with the highest information gain.

    Information gain of an attribute is ``find_entropy(df)`` minus
    ``find_entropy_attribute(df, attribute)``. Every column except the last
    (the class column) is a candidate; on ties the first candidate in column
    order wins, because ``np.argmax`` returns the first maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column holds the class labels.

    Returns
    -------
    hashable
        Column label of the winning attribute.
    """
    candidates = df.keys()[:-1]
    # The class entropy does not depend on the attribute: compute it once.
    base_entropy = find_entropy(df)
    gains = [base_entropy - find_entropy_attribute(df, key) for key in candidates]
    return candidates[np.argmax(gains)]

In [15]:
def get_subtable(df, node, value):
    """Return the rows of ``df`` whose ``node`` column equals ``value``.

    The result gets a fresh 0..n-1 row index; the old index is discarded.
    """
    matches = df[node] == value
    selected = df.loc[matches]
    return selected.reset_index(drop=True)

In [16]:
def buildTree(df,tree=None):
    """Recursively build an ID3 decision tree as nested dictionaries.

    The tree has the shape ``{attribute: {value: subtree_or_label, ...}}``:
    the attribute chosen by ``find_winner`` is the single key of a node, and
    each of its observed values maps either to a class label (leaf) or to
    another node of the same shape.

    Parameters
    ----------
    df : pandas.DataFrame
        Training table whose last column holds the class labels.
    tree : dict, optional
        Existing dictionary to add the node to. A new one is created when
        omitted.

    Returns
    -------
    dict
        The (possibly newly created) tree dictionary.
    """
    class_col = df.keys()[-1]
    node = find_winner(df)
    att_values = np.unique(df[node])

    if tree is None:
        tree = {}
    # Create the node entry even when the caller supplied a dictionary;
    # previously this only happened for tree=None and raised KeyError otherwise.
    tree.setdefault(node, {})

    for value in att_values:
        subtable = get_subtable(df, node, value)
        cl_values, counts = np.unique(subtable[class_col], return_counts=True)

        if len(counts) == 1:
            # Pure subset: every row has the same class, so this is a leaf.
            tree[node][value] = cl_values[0]
        elif len(subtable) == len(df):
            # The best attribute has a single value here, so the split removed
            # no rows and recursing would repeat this exact call forever.
            # Fall back to a majority-class leaf.
            tree[node][value] = cl_values[np.argmax(counts)]
        else:
            tree[node][value] = buildTree(subtable)

    return tree

In [17]:
def predict(inst,tree):
    """Classify one instance by walking a nested-dict decision tree.

    Parameters
    ----------
    inst : mapping
        Anything indexable by attribute name (e.g. a ``pandas.Series`` row or
        a ``dict``) that yields the instance's value for that attribute.
    tree : dict or label
        Tree of the shape ``{attribute: {value: subtree_or_label}}``. A value
        that is not a dict is treated as a leaf and returned as-is.

    Returns
    -------
    object
        The class label stored at the leaf that was reached.

    Raises
    ------
    KeyError
        If ``inst`` lacks an attribute used by the tree, or its value for that
        attribute has no branch (a value never seen while building the tree).
    ValueError
        If the tree contains an empty dict node.
    """
    node = tree
    # Walk down iteratively: each dict node has one attribute key whose
    # branches are keyed by attribute value.
    while isinstance(node, dict):
        if not node:
            raise ValueError("decision tree contains an empty node")
        attribute = next(iter(node))
        node = node[attribute][inst[attribute]]
    return node

In [18]:
# Fit the ID3 tree on the training split (the last column, 'label', is the class).
tree = buildTree(train_df)

In [19]:
# `preds` will collect the tree's predictions; `trues` captures the ground-truth
# labels now, because the label column is dropped from test_df in the next cell.
preds = []
trues = test_df['label'].values

In [20]:
# Remove the class column so only features are passed to predict().
# errors='ignore' keeps the cell re-runnable: a second run no longer raises
# KeyError because 'label' is already gone.
test_df = test_df.drop('label', axis=1, errors='ignore')

In [21]:
# Predict every test row. `preds` is rebuilt from scratch so that re-running
# this cell cannot leave stale or duplicated predictions in the list.
# test_df.iloc[i] is already a Series, so no extra wrapping is needed.
preds = [predict(test_df.iloc[i], tree) for i in range(len(test_df))]

In [22]:
# Share of test rows for which the ID3 tree's prediction matches the true label.
accuracy_score(trues, preds)

1.0

In [1]:
# Show the learned tree as a raw nested dict.
# NOTE(review): the saved output of this cell is a NameError and its execution
# count is 1, i.e. it was last run on a fresh kernel before `tree` existed.
tree

NameError: name 'tree' is not defined

In [23]:
# Pretty-print the nested-dict tree so the branch structure is readable.
import pprint
pprint.pprint(tree)

{'odor': {'a': 'e',
          'c': 'p',
          'f': 'p',
          'l': 'e',
          'm': 'p',
          'n': {'spore-print-color': {'b': 'e',
                                      'h': 'e',
                                      'k': 'e',
                                      'n': 'e',
                                      'o': 'e',
                                      'r': 'p',
                                      'w': {'habitat': {'d': {'gill-size': {'b': 'e',
                                                                            'n': 'p'}},
                                                        'g': 'e',
                                                        'l': {'cap-color': {'c': 'e',
                                                                            'n': 'e',
                                                                            'w': 'p',
                                                                            'y': 'p'}},
                          

In [24]:
# %pip installs into the environment of the running kernel, whereas !pip runs
# whichever `pip` is first on the shell PATH, which may be a different one.
# NOTE(review): writing PNGs presumably also needs the system Graphviz `dot`
# executable, which pip does not provide — confirm on the target machine.
%pip install graphviz
%pip install pydot
%pip install pydotplus



In [25]:
import pydot
import graphviz

def draw(parent_name, child_name):
    """Add an edge between ``parent_name`` and ``child_name`` to the module-level ``graph``."""
    graph.add_edge(pydot.Edge(parent_name, child_name))

def visit(node, parent=None):
    """Add the nested-dict tree ``node`` to the module-level ``graph``.

    Every key becomes a graph node connected to its parent, and every non-dict
    value becomes a leaf node connected to its key.

    Graph node ids are built from the full path to the node (for example
    ``odor/n/spore-print-color``), so equal attribute values that occur in
    different branches stay separate nodes instead of collapsing into one.
    The text shown in the picture is set through the ``label`` attribute and
    stays short: the key itself, or ``key_value`` for a leaf.

    Parameters
    ----------
    node : dict
        (Sub)tree to draw.
    parent : str, optional
        Graph id of the node the entries of ``node`` hang from; ``None`` for
        the root call, in which case no incoming edge is drawn.
    """
    for k, v in node.items():
        node_id = str(k) if parent is None else f"{parent}/{k}"
        graph.add_node(pydot.Node(node_id, label=str(k)))

        # The root has no parent, so there is no edge to draw into it.
        if parent is not None:
            draw(parent, node_id)

        if isinstance(v, dict):
            visit(v, node_id)
        else:
            # Leaf: give the class label its own uniquely named node.
            leaf_id = f"{node_id}/{v}"
            graph.add_node(pydot.Node(leaf_id, label=f"{k}_{v}"))
            draw(node_id, leaf_id)

# Create an undirected pydot graph, fill it from the learned tree (visit() and
# draw() write into this module-level `graph`), and save it as a PNG file.
graph = pydot.Dot(graph_type='graph')
visit(tree)
graph.write_png('example1_graph.png')

In [34]:
# Baseline for comparison: gradient-boosted trees (CatBoost) on the same data.
y = df['label']
X = df.drop('label',axis=1)
# Fixed seed so the split, and therefore the reported accuracy, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

#!pip install catboost
from catboost import CatBoostClassifier

clf = CatBoostClassifier(
    iterations=10, 
    learning_rate=0.15, 
    random_seed=42,  # makes training deterministic across runs
)

# Treat every feature column as categorical by passing all column positions.
cat_features = list(range(X_train.shape[1]))

clf.fit(X_train, y_train, 
        cat_features=cat_features)

y_pred = clf.predict(X_test) 

accuracy_score(y_test, y_pred)

0:	learn: 0.3972409	total: 11.1ms	remaining: 99.9ms
1:	learn: 0.2353830	total: 34.8ms	remaining: 139ms
2:	learn: 0.1577236	total: 55.2ms	remaining: 129ms
3:	learn: 0.0992181	total: 84.2ms	remaining: 126ms
4:	learn: 0.0553405	total: 117ms	remaining: 117ms
5:	learn: 0.0343249	total: 151ms	remaining: 101ms
6:	learn: 0.0204421	total: 185ms	remaining: 79.4ms
7:	learn: 0.0129304	total: 223ms	remaining: 55.7ms
8:	learn: 0.0087015	total: 260ms	remaining: 28.9ms
9:	learn: 0.0061262	total: 297ms	remaining: 0us


1.0