<a href="https://colab.research.google.com/github/lmcanavals/ml/blob/main/decision_tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Decision Trees

In [10]:
import pandas as pd
import math

In [3]:
# Load the golf dataset CSV straight from the course repository on GitHub.
url="https://raw.githubusercontent.com/lmcanavals/ml/main/data/golf-dataset.csv"
df = pd.read_csv(url)
# Trailing expression: the notebook renders the loaded frame as the cell output.
df

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play Golf
0,Rainy,Hot,High,False,No
1,Rainy,Hot,High,True,No
2,Overcast,Hot,High,False,Yes
3,Sunny,Mild,High,False,Yes
4,Sunny,Cool,Normal,False,Yes
5,Sunny,Cool,Normal,True,No
6,Overcast,Cool,Normal,True,Yes
7,Rainy,Mild,High,False,No
8,Rainy,Cool,Normal,False,Yes
9,Sunny,Mild,Normal,False,Yes


## Entropy

$$
E(s) = \sum_{i=1}^{c} \left(-p_i \times \log_2(p_i)\right)
$$
$$
E(s, X) = \sum_{c \in X} (P(c)E(c))
$$

In [20]:
def E(df, s):
    """Return the Shannon entropy, in bits, of column ``s`` of ``df``.

    Computes ``sum(-p_i * log2(p_i))`` over the distinct values of the
    column, where ``p_i`` is the fraction of rows holding value ``i``.

    Missing labels (NaN/None) are not counted as a class, and proportions are
    taken over the non-missing rows only. Treating them as a class is not
    possible with an equality mask (a missing value never compares equal), and
    doing so produced a zero proportion and a ``math.log2(0)`` domain error.

    Args:
        df: pandas DataFrame that contains the column.
        s: label of the column whose entropy is measured.

    Returns:
        The entropy as a float; ``0.0`` when the column has no non-missing
        values (including an empty frame).
    """
    # value_counts() tallies every class in a single pass and drops missing
    # values by default, instead of re-filtering the frame once per class.
    counts = df[s].value_counts()
    total = counts.sum()
    if total == 0:
        return 0.0

    e = 0.0
    for count in counts:
        # Categorical columns report unused categories with a count of 0;
        # skip them because log2(0) is undefined and they contribute nothing.
        if count > 0:
            p = count / total
            e -= p * math.log2(p)

    # Normalise numpy scalars to a plain Python float.
    return float(e)
# Entropy of the "Play Golf" column over the whole dataset (shown as the cell output).
E(df, "Play Golf")

0.9402859586706309

In [21]:
# Toy coin-flip frame: three "cara" (heads) rows followed by three "cruz"
# (tails) rows, i.e. a perfectly balanced two-class column.
dftemp = pd.DataFrame({"outcome": ["cara"] * 3 + ["cruz"] * 3})
dftemp

Unnamed: 0,outcome
0,cara
1,cara
2,cara
3,cruz
4,cruz
5,cruz


In [22]:
# Sanity check: an even split between two outcomes should yield exactly 1 bit.
E(dftemp, "outcome")

1.0

In [25]:
def Ex(df, s, X, debug=False):
    """Entropy of column ``s`` after splitting ``df`` on column ``X``.

    Every distinct value of ``X`` selects a subset of rows. The result is the
    sum of each subset's entropy of ``s`` (as computed by ``E``) weighted by
    the fraction of rows of ``df`` that fall in that subset.

    Args:
        df: pandas DataFrame holding both columns.
        s: label of the column whose entropy is measured.
        X: label of the column used to partition the rows.
        debug: when true, print the distinct values, the row and value
            counts, and each subset with its weight and entropy.

    Returns:
        The weighted sum of the subset entropies (``0`` when ``X`` has no
        values).
    """
    values = df[X].unique()
    if debug:
        print(values)
    total = len(df)
    if debug:
        print(total, len(values))

    weighted = 0
    for value in values:
        # Select the rows for this value once and reuse the selection.
        subset = df[df[X] == value]
        weight = len(subset) / total
        if debug:
            print(subset)
        subset_entropy = E(subset, s)
        if debug:
            print(f"{value} -> {weight} -> {subset_entropy}")
        weighted += weight * subset_entropy

    return weighted

In [26]:
# Entropy of "Play Golf" after splitting the rows on "Windy".
Ex(df, "Play Golf", "Windy")

0.8921589282623617

In [27]:
# debug=True prints each "Temp" subset with its weight and entropy before
# the weighted sum is returned as the cell output.
Ex(df, "Play Golf", "Temp", debug=True)

['Hot' 'Mild' 'Cool']
14 3
     Outlook Temp Humidity  Windy Play Golf
0      Rainy  Hot     High  False        No
1      Rainy  Hot     High   True        No
2   Overcast  Hot     High  False       Yes
12  Overcast  Hot   Normal  False       Yes
Hot -> 0.2857142857142857 -> 1.0
     Outlook  Temp Humidity  Windy Play Golf
3      Sunny  Mild     High  False       Yes
7      Rainy  Mild     High  False        No
9      Sunny  Mild   Normal  False       Yes
10     Rainy  Mild   Normal   True       Yes
11  Overcast  Mild     High   True       Yes
13     Sunny  Mild     High   True        No
Mild -> 0.42857142857142855 -> 0.9182958340544896
    Outlook  Temp Humidity  Windy Play Golf
4     Sunny  Cool   Normal  False       Yes
5     Sunny  Cool   Normal   True        No
6  Overcast  Cool   Normal   True       Yes
8     Rainy  Cool   Normal  False       Yes
Cool -> 0.2857142857142857 -> 0.8112781244591328


0.9110633930116763

## Information Gain

$$
IG (s, X) = E(s) - E(s, X)
$$

## Gini index

$$
Gini = 1 - \sum_{i=1}^{c} p_i^2
$$

In [30]:
# Entropy of "Play Golf" after splitting on each candidate attribute.
# The target column itself is skipped: splitting on it yields pure subsets,
# so its entropy is always 0 and it would wrongly look like the best split.
for attr in df.columns:
    if attr == "Play Golf":
        continue
    print(f"E({attr}) = {Ex(df, 'Play Golf', attr)}")

E(Outlook) = 0.6935361388961918
E(Temp) = 0.9110633930116763
E(Humidity) = 0.7884504573082896
E(Windy) = 0.8921589282623617
E(Play Golf) = 0.0


In [32]:
# Inspect the "Outlook" split in detail: debug=True prints every subset with
# its weight and entropy before the weighted sum is returned.
Ex(df, "Play Golf", "Outlook", debug=True)

['Rainy' 'Overcast' 'Sunny']
14 3
   Outlook  Temp Humidity  Windy Play Golf
0    Rainy   Hot     High  False        No
1    Rainy   Hot     High   True        No
7    Rainy  Mild     High  False        No
8    Rainy  Cool   Normal  False       Yes
10   Rainy  Mild   Normal   True       Yes
Rainy -> 0.35714285714285715 -> 0.9709505944546686
     Outlook  Temp Humidity  Windy Play Golf
2   Overcast   Hot     High  False       Yes
6   Overcast  Cool   Normal   True       Yes
11  Overcast  Mild     High   True       Yes
12  Overcast   Hot   Normal  False       Yes
Overcast -> 0.2857142857142857 -> 0.0
   Outlook  Temp Humidity  Windy Play Golf
3    Sunny  Mild     High  False       Yes
4    Sunny  Cool   Normal  False       Yes
5    Sunny  Cool   Normal   True        No
9    Sunny  Mild   Normal  False       Yes
13   Sunny  Mild     High   True        No
Sunny -> 0.35714285714285715 -> 0.9709505944546686


0.6935361388961918