In [83]:
import numpy as np
import pandas as pd

def disorder(p, n):
    total = p + n
    logP = 0 if p == 0 else -(p / total) * np.log2(p / total)
    logN = 0 if n == 0 else -(n / total) * np.log2(n / total)
    return logP + logN

def totalDisorder(p1, n1, p2, n2):
    total = p1 + n1 + p2 + n2
    return (disorder(p1, n1) * ((p1 + n1) / total)) + (disorder(p2, n2) * ((p2 + n2) / total))

def getBestChar(df, target):
    bestChar = "nothing"
    bestDis = 2
    for characteristic in df.drop(target, axis=1):
        charP = df[df[characteristic] == 1]
        charN = df[df[characteristic] == 0]
        p1 = len(charP[charP[target] == 1])
        n1 = len(charP[charP[target] == 0])
        p2 = len(charN[charN[target] == 1])
        n2 = len(charN[charN[target] == 0])
        charD = totalDisorder(p1, n1, p2, n2)
        print(f'Disorder for {characteristic}: {charD}')
        if charD < bestDis:
            bestChar = characteristic
            bestDis = charD
    return bestChar

def splitDf(df, splitter, target):
    splits = []
    for col in df.columns:
        if not (col == splitter or col == target):
            splits.append(df[df[splitter] == 0].drop(splitter, axis=1))
            splits.append(df[df[splitter] == 1].drop(splitter, axis=1))
    return splits

def createTree(df, target):
    data = df.copy()
    dfs = [data]
    while(len(data.drop(target, axis=1).columns) > 0):
        # TODO: need to do this for all of the dfs in dfs, not just data
        bestChar = getBestChar(data, target)
        print(f'Best {len(df.columns) -len(data.columns) + 1} characteristic: {bestChar}')
        newDfs = []
        while(len(dfs) > 0):
            toSplit = dfs.pop()
            newDfs.extend(splitDf(toSplit, bestChar, target))
        dfs = newDfs
        data.drop(bestChar, axis=1, inplace=True)

In [88]:
totalDisorder(1, 0,0, 1)

0.0

In [89]:
fever = [0, 0, 0, 1, 1, 0, 1]
cough = [0, 0, 1, 1, 1, 1, 0]
lossTaste = [0, 0, 1, 1, 0, 1, 0]
vaccinated = [1, 1, 0, 0, 0, 1, 1]
covid = [0, 0, 1, 0, 1, 1, 0]

df = pd.DataFrame({"fever": fever, "cough": cough, "lossTaste": lossTaste, "vaccinated": vaccinated, "covid": covid})
df.head(10)

Unnamed: 0,fever,cough,lossTaste,vaccinated,covid
0,0,0,0,1,0
1,0,0,0,1,0
2,0,1,1,0,1
3,1,1,1,0,0
4,1,1,0,0,1
5,0,1,1,1,1
6,1,0,0,1,0


In [90]:
# Fever
totalDisorder(1, 2, 2, 2)

0.9649839288804954

In [91]:
# Cough
totalDisorder(3, 1, 0, 3)

0.46358749969093305

In [52]:
# Loss Taste
totalDisorder(2, 1, 3, 1)

0.8571428571428571

In [53]:
# Loss Taste
totalDisorder(1, 3, 2, 1)

0.8571428571428571

In [55]:
len(df)

7

In [56]:
# no splits
getBestChar(df, "covid")

Disorder for fever: 0.9649839288804954
Disorder for cough: 0.46358749969093305
Disorder for lossTaste: 0.8571428571428571
Disorder for vaccinated: 0.8571428571428571


'cough'

In [54]:
# split on cough
coughN = df[df["cough"] == 0].drop("cough", axis=1)
coughP = df[df["cough"] == 1].drop("cough", axis=1)

In [60]:
getBestChar(coughN, "covid")

Disorder for fever: 0.0
Disorder for lossTaste: 0.0
Disorder for vaccinated: 0.0


'fever'

In [61]:
getBestChar(coughP, "covid")

Disorder for fever: 0.5
Disorder for lossTaste: 0.6887218755408672
Disorder for vaccinated: 0.6887218755408672


'fever'

In [63]:
# split on fever
coughNfeverN = coughN[coughN["fever"] == 0].drop("fever", axis=1)
coughNfeverP = coughN[coughN["fever"] == 1].drop("fever", axis=1)

coughPfeverN = coughP[coughP["fever"] == 0].drop("fever", axis=1)
coughPfeverP = coughP[coughP["fever"] == 1].drop("fever", axis=1)

In [65]:
getBestChar(coughNfeverN, "covid")

Disorder for lossTaste: 0.0
Disorder for vaccinated: 0.0


'lossTaste'

In [66]:
getBestChar(coughNfeverP, "covid")

Disorder for lossTaste: 0.0
Disorder for vaccinated: 0.0


'lossTaste'

In [67]:
getBestChar(coughPfeverN, "covid")

Disorder for lossTaste: 0.0
Disorder for vaccinated: 0.0


'lossTaste'

In [68]:
getBestChar(coughPfeverP, "covid")

Disorder for lossTaste: 0.0
Disorder for vaccinated: 1.0


'lossTaste'

In [71]:
len(df.columns)

5

In [None]:
# split loss taste
coughNfeverN = coughN[coughN["fever"] == 0].drop("fever", axis=1)
coughNfeverP = coughN[coughN["fever"] == 1].drop("fever", axis=1)

coughNfeverN = coughP[coughP["fever"] == 0].drop("fever", axis=1)
coughNfeverP = coughP[coughP["fever"] == 1].drop("fever", axis=1)


coughNfeverN = coughN[coughN["fever"] == 0].drop("fever", axis=1)
coughNfeverP = coughN[coughN["fever"] == 1].drop("fever", axis=1)

coughPfeverN = coughP[coughP["fever"] == 0].drop("fever", axis=1)
coughPfeverP = coughP[coughP["fever"] == 1].drop("fever", axis=1)

In [82]:
createTree(df, "covid")

Disorder for fever: 0.9649839288804954
Disorder for cough: 0.46358749969093305
Disorder for lossTaste: 0.8571428571428571
Disorder for vaccinated: 0.8571428571428571
Best 1 characteristic: cough
Disorder for fever: 0.9649839288804954
Disorder for lossTaste: 0.8571428571428571
Disorder for vaccinated: 0.8571428571428571
Best 2 characteristic: lossTaste
Disorder for fever: 0.9649839288804954
Disorder for vaccinated: 0.8571428571428571
Best 3 characteristic: vaccinated
Disorder for fever: 0.9649839288804954
Best 4 characteristic: fever
