In [1]:
import pandas as pd
import sys
import json

In [25]:
def read_files(f1, f2):

    df = pd.read_csv(f1, skiprows=[1, 2], header=0)

    with open(f2) as f:
        tree = json.load(f)

    return df, tree

In [84]:
df, tree = read_files("nursery.csv", "nursery_tree.json")
df, tree

(          parents   has_nurs      form children     housing     finance  \
 0           usual     proper  complete        1  convenient  convenient   
 1           usual     proper  complete        1  convenient  convenient   
 2           usual     proper  complete        1  convenient  convenient   
 3           usual     proper  complete        1  convenient  convenient   
 4           usual     proper  complete        1  convenient  convenient   
 ...           ...        ...       ...      ...         ...         ...   
 12955  great_pret  very_crit    foster     more    critical      inconv   
 12956  great_pret  very_crit    foster     more    critical      inconv   
 12957  great_pret  very_crit    foster     more    critical      inconv   
 12958  great_pret  very_crit    foster     more    critical      inconv   
 12959  great_pret  very_crit    foster     more    critical      inconv   
 
               social       health       class  
 0            nonprob  recommended   

In [27]:
tree

{'dataset': 'nursery.csv',
 'node': {'var': 'finance',
  'edges': [{'edge': {'value': 'convenient',
     'node': {'var': 'parents',
      'edges': [{'edge': {'value': 'usual',
         'leaf': {'decision': 'not_recom', 'p': 0.74}}},
       {'edge': {'value': 'pretentious',
         'leaf': {'decision': 'priority', 'p': 0.78}}},
       {'edge': {'value': 'great_pret',
         'leaf': {'decision': 'spec_prior', 'p': 0.9}}}]}}},
   {'edge': {'value': 'inconv',
     'leaf': {'decision': 'very_recom', 'p': 0.8}}}]}}

In [None]:
def report(df, tree: dict):


In [30]:
row1 = df.iloc[0]
row1

parents           usual
has_nurs         proper
form           complete
children              1
housing      convenient
finance      convenient
social          nonprob
health      recommended
class         recommend
Name: 0, dtype: object

In [32]:
node = tree["node"]
node

{'var': 'finance',
 'edges': [{'edge': {'value': 'convenient',
    'node': {'var': 'parents',
     'edges': [{'edge': {'value': 'usual',
        'leaf': {'decision': 'not_recom', 'p': 0.74}}},
      {'edge': {'value': 'pretentious',
        'leaf': {'decision': 'priority', 'p': 0.78}}},
      {'edge': {'value': 'great_pret',
        'leaf': {'decision': 'spec_prior', 'p': 0.9}}}]}}},
  {'edge': {'value': 'inconv', 'leaf': {'decision': 'very_recom', 'p': 0.8}}}]}

In [35]:
con = row1[node["var"]]
node.values()

dict_values(['finance', [{'edge': {'value': 'convenient', 'node': {'var': 'parents', 'edges': [{'edge': {'value': 'usual', 'leaf': {'decision': 'not_recom', 'p': 0.74}}}, {'edge': {'value': 'pretentious', 'leaf': {'decision': 'priority', 'p': 0.78}}}, {'edge': {'value': 'great_pret', 'leaf': {'decision': 'spec_prior', 'p': 0.9}}}]}}}, {'edge': {'value': 'inconv', 'leaf': {'decision': 'very_recom', 'p': 0.8}}}]])

In [7]:
# tree1 = tree
# while "leaf" not in tree1.keys():
#     node = tree1["node"]
#     print(node)
#     label = row1[node['var']]
#     print()
#     print(label)
#     for edge in node["edges"]:
#         if edge['edge']['value'] == label:
#             tree1 = edge['edge']
# print(tree1)
# print(tree1['leaf']['decision'])

def search_tree(row: pd.Series, tree: dict) -> str:
    subtree = tree
    while "leaf" not in subtree.keys():
        node = subtree["node"]
        label = row[node['var']]
        for edge in node["edges"]:
            if edge['edge']['value'] == label:
                subtree = edge['edge']
                print(node['var'], label)
    return subtree['leaf']['decision']

In [4]:
def predict(df: pd.DataFrame, tree: dict) -> pd.Series:
    return df.apply(lambda row: search_tree(row, tree), axis=1)

In [82]:
search_tree(df, tree)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [90]:
predictions = predict(df, tree)
predictions

0          priority
1          priority
2         not_recom
3          priority
4          priority
            ...    
12955    spec_prior
12956     not_recom
12957      priority
12958    spec_prior
12959     not_recom
Length: 12960, dtype: object

In [86]:
df['class'].value_counts()

class
not_recom     4320
priority      4266
spec_prior    4044
very_recom     328
recommend        2
Name: count, dtype: int64

In [99]:
rep = pd.DataFrame({"obs": df['class'], "pred": predictions})
rep['correct'] = (rep["obs"] == rep["pred"])
rep

Unnamed: 0,obs,pred,correct
0,recommend,priority,False
1,priority,priority,True
2,not_recom,not_recom,True
3,recommend,priority,False
4,priority,priority,True
...,...,...,...
12955,spec_prior,spec_prior,True
12956,not_recom,not_recom,True
12957,spec_prior,priority,False
12958,spec_prior,spec_prior,True


In [103]:
num_records = len(rep)
num_correct = rep["correct"].value_counts()[True]
num_incorrect = num_records - num_correct
accuracy = (num_correct / num_records) * 100
error_rate = 100 - accuracy

2322
82.08333333333333
17.91666666666667


In [123]:
def report(obs: pd.Series, pred: pd.Series) -> str:
    rep = pd.DataFrame({"obs": obs, "pred": pred})
    rep['correct'] = (rep["obs"] == rep["pred"])

    num_records = len(rep)
    num_correct = rep["correct"].value_counts()[True]
    num_incorrect = num_records - num_correct
    accuracy = (num_correct / num_records) * 100
    error_rate = 100 - accuracy

    return(f"1. Total number of records classified: {num_records} \n" +
           f"2. Total number of records correctly classified: {num_correct} \n" +
           f"3. Total number of records incorrectly classified: {num_incorrect} \n" +
           f"4. Overall accuracy: {accuracy}. Overall Error Rate: {error_rate}")

In [124]:
print(report(df['class'], predictions))

1. Total number of records classified: 12960 
2. Total number of records correctly classified: 10638 
3. Total number of records incorrectly classified: 2322 
4. Overall accuracy: 82.08333333333333. Overall Error Rate: 17.91666666666667


In [121]:
conf_matrix = pd.crosstab(rep['obs'], rep['pred'])
conf_matrix.divide(len(rep['obs']))

pred,not_recom,priority,spec_prior
obs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
not_recom,0.333333,0.0,0.0
priority,0.0,0.297222,0.031944
recommend,0.0,0.000154,0.0
spec_prior,0.0,0.121759,0.190278
very_recom,0.0,0.025309,0.0


In [130]:
df['parents'].value_counts()


parents
usual          4320
pretentious    4320
great_pret     4320
Name: count, dtype: int64

In [2]:
tree = {"dataset": "", "node": {"var": "odor", "edges": [{"edge": {"value": "n", "leaf": {"decision": "e", "p": 1}}}, {"edge": {"value": "f", "leaf": {"decision": "p", "p": 1}}}, {"edge": {"value": "a", "leaf": {"decision": "e", "p": 1}}}, {"edge": {"value": "y", "leaf": {"decision": "p", "p": 1}}}, {"edge": {"value": "l", "leaf": {"decision": "e", "p": 1}}}, {"edge": {"value": "c", "leaf": {"decision": "p", "p": 1}}}, {"edge": {"value": "s", "leaf": {"decision": "p", "p": 1}}}, {"edge": {"value": "m", "leaf": {"decision": "p", "p": 1}}}]}}

In [10]:
tree

{'dataset': '',
 'node': {'var': 'odor',
  'edges': [{'edge': {'value': 'n', 'leaf': {'decision': 'e', 'p': 1}}},
   {'edge': {'value': 'f', 'leaf': {'decision': 'p', 'p': 1}}},
   {'edge': {'value': 'a', 'leaf': {'decision': 'e', 'p': 1}}},
   {'edge': {'value': 'y', 'leaf': {'decision': 'p', 'p': 1}}},
   {'edge': {'value': 'l', 'leaf': {'decision': 'e', 'p': 1}}},
   {'edge': {'value': 'c', 'leaf': {'decision': 'p', 'p': 1}}},
   {'edge': {'value': 's', 'leaf': {'decision': 'p', 'p': 1}}},
   {'edge': {'value': 'm', 'leaf': {'decision': 'p', 'p': 1}}}]}}

In [11]:
data['odor'].value_counts()

odor
n    3528
f    2160
y     576
s     576
a     400
l     400
p     256
c     192
m      36
Name: count, dtype: int64

In [5]:
data = pd.read_csv('../data/agaricus-lepiota.csv', skiprows=[1, 2], header=0)
data

Unnamed: 0,Class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [8]:
predict(data, tree)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\hsu_m\anaconda3\envs\csc466\lib\site-packages\IPython\core\interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\hsu_m\AppData\Local\Temp\ipykernel_9164\3632223347.py", line 1, in <module>
    predict(data, tree)
  File "C:\Users\hsu_m\AppData\Local\Temp\ipykernel_9164\2324259525.py", line 2, in predict
    return df.apply(lambda row: search_tree(row, tree), axis=1)
  File "C:\Users\hsu_m\anaconda3\envs\csc466\lib\site-packages\pandas\core\frame.py", line 10037, in apply
    return op.apply().__finalize__(self, method="apply")
  File "C:\Users\hsu_m\anaconda3\envs\csc466\lib\site-packages\pandas\core\apply.py", line 837, in apply
    return self.apply_standard()
  File "C:\Users\hsu_m\anaconda3\envs\csc466\lib\site-packages\pandas\core\apply.py", line 963, in apply_standard
    results, res_index = self.apply_series_generator()
  File "C:\Users\hsu_m\anaconda3\envs\csc466\l