In [10]:
import numpy as np
import pandas as pd
import typing
import math
from collections import Counter

# Strategy
* Evaluating all features in the dataset to find the one that provides the best split according to a certain criterion (entropy).
* For the beginning, we have the initial entropy. We want to make a decision to make the entropy decrease.
* Choose the attribute that decreases the entropy the most

In [30]:
def calculate_entropy(probs) -> float:
    """Calculate entropy given a list of probabilities."""
    return -sum([p * math.log2(p) for p in probs if p > 0])

In [23]:
def calculate_avg_entropy_for_column(df: pd.DataFrame, column: str, dep_var: str) -> float:
  """Calculate the average entropy for a given column."""
  total_rows = len(df)
  total_entropy = 0

  for value in df[column].unique(): # each unique value in this column
      subset = df[df[column] == value] # split into a subset based on this value
      subset_probs = [freq / len(subset) for freq in Counter(subset[dep_var]).values()]
      entropy = calculate_entropy(subset_probs)
      total_entropy += (len(subset) / total_rows) * entropy

  return total_entropy

In [36]:
def calculate_each_avg_entropy(df: pd.DataFrame, dep_var: str) -> dict:
  """Calculate average entropy for each column."""
  entropies = {}
  for column in df.columns:
      if column != dep_var:  # don't calculate for the dependent variable itself
          entropies[column] = calculate_avg_entropy_for_column(df, column, dep_var)
  return entropies

In [41]:
def decreased_entropy(init_ent: float, each_ent: dict) -> dict:
  """
  The decreased entropy after choosing a certain column to split the data.
  It can be also interpreted as the information gained after split the data based on a certain column
  """
  dec_ent = {}
  for k in each_ent:
    dec_ent[k+"_decreased"] = each_ent[k] - init_ent

  return dec_ent

# Table1.1

In [2]:
import pandas as pd

table_11 = {
    'Age': ['Young', 'Young', 'Young', 'Young', 'Young', 'Young', 'Young', 'Young',
            'Presbyopic', 'Presbyopic', 'Presbyopic', 'Presbyopic', 'Presbyopic', 'Presbyopic',
            'Presbyopic', 'Presbyopic', 'Presbyopic', 'Presbyopic', 'Presbyopic', 'Presbyopic', 'Presbyopic'],
    'Spectacle Prescription': ['Myope', 'Myope', 'Myope', 'Myope', 'Hypermetrope', 'Hypermetrope', 'Hypermetrope', 'Hypermetrope',
                               'Myope', 'Myope', 'Myope', 'Myope', 'Hypermetrope', 'Hypermetrope', 'Hypermetrope', 'Hypermetrope',
                               'Myope', 'Myope', 'Myope', 'Hypermetrope', 'Hypermetrope'],
    'Astigmatism': ['No', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes',
                    'No', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes',
                    'No', 'No', 'Yes', 'No', 'Yes'],
    'Tear Production Rate': ['Reduced', 'Normal', 'Reduced', 'Normal', 'Reduced', 'Normal', 'Reduced', 'Normal',
                             'Reduced', 'Normal', 'Reduced', 'Normal', 'Reduced', 'Normal', 'Reduced', 'Normal',
                             'Reduced', 'Normal', 'Reduced', 'Reduced', 'Normal'],
    'Recommended Lenses': ['None', 'Soft', 'None', 'Hard', 'None', 'Soft', 'None', 'Hard',
                           'None', 'Soft', 'None', 'Hard', 'None', 'Soft', 'None', 'None',
                           'None', 'None', 'None', 'None', 'None']
}

table_11_df = pd.DataFrame(table_11)

In [3]:
table_11_df

Unnamed: 0,Age,Spectacle Prescription,Astigmatism,Tear Production Rate,Recommended Lenses
0,Young,Myope,No,Reduced,
1,Young,Myope,No,Normal,Soft
2,Young,Myope,Yes,Reduced,
3,Young,Myope,Yes,Normal,Hard
4,Young,Hypermetrope,No,Reduced,
5,Young,Hypermetrope,No,Normal,Soft
6,Young,Hypermetrope,Yes,Reduced,
7,Young,Hypermetrope,Yes,Normal,Hard
8,Presbyopic,Myope,No,Reduced,
9,Presbyopic,Myope,No,Normal,Soft


In [18]:
lens = table_11_df.iloc[:, -1].values
freq = Counter(lens)
freq

Counter({'None': 14, 'Soft': 4, 'Hard': 3})

In [33]:
probs = [f/len(lens) for f in freq.values()]

init_entropy = calculate_entropy(probs)
init_entropy

1.2467052127325733

In [37]:
each_entropy = calculate_each_avg_entropy(table_11_df, "Recommended Lenses")
each_entropy

{'Age': 1.1850683517600273,
 'Spectacle Prescription': 1.2366695164322588,
 'Astigmatism': 0.915008206805427,
 'Tear Production Rate': 0.7480717116450801}

In [39]:
decreased_entropy_dict = decreased_entropy(init_entropy, each_entropy)
decreased_entropy_dict

{'Age_decreased': -0.06163686097254595,
 'Spectacle Prescription_decreased': -0.010035696300314534,
 'Astigmatism_decreased': -0.33169700592714624,
 'Tear Production Rate_decreased': -0.49863350108749316}

In [40]:
print("""
  We can find that the attribute which decreased entropy the most is "Spectacle Prescription_decreased"
""")


  We can find that the attribute which decreased entropy the most is "Spectacle Prescription_decreased"



# Table1.3

In [4]:
table_13 = {
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rainy', 'Rainy', 'Rainy', 'Overcast', 'Sunny', 'Sunny', 'Rainy', 'Sunny', 'Overcast', 'Overcast', 'Rainy'],
    'Temperature': [85, 80, 83, 70, 68, 65, 64, 72, 69, 75, 75, 72, 81, 71],
    'Humidity': [85, 90, 86, 96, 80, 70, 65, 95, 70, 80, 70, 90, 75, 91],
    'Windy': [False, True, False, False, False, True, True, False, False, False, True, True, False, True],
    'Play': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
}

table_13_df = pd.DataFrame(table_13)

In [5]:
table_13_df

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,Sunny,85,85,False,No
1,Sunny,80,90,True,No
2,Overcast,83,86,False,Yes
3,Rainy,70,96,False,Yes
4,Rainy,68,80,False,Yes
5,Rainy,65,70,True,No
6,Overcast,64,65,True,Yes
7,Sunny,72,95,False,No
8,Sunny,69,70,False,Yes
9,Rainy,75,80,False,Yes


In [43]:
print("""
  The current approach may not be suitable for this data because of the continuous attributes like "Temperature" and "Humidity".
  "Temperature" and "Humidity" are continuous numeric attributes.
  Using them directly in their raw form in the decision tree algorithm can be inefficient and potentially less accurate.

  Here are some potential problems from my perspective:
  1. Infinite Split Points:
  Continuous attributes can have potentially infinite split points.
  For example, for "Temperature", you could split at 68.5, 68.6, 68.7, etc.
  Determining the optimal split point among these infinite possibilities can be computationally expensive.

  2. Overfitting:
  If we make decisions based on exact temperature values (e.g., Play = "No" if Temperature = 72),
  we might overfit to the training data.
  Overfitting makes the model memorize the training data rather than generalize from it,
  which can reduce its performance on unseen data.

  And Here are some potential solution that may solve the continuous variables:
  1. Discretization:
  One common approach is to discretize the continuous attributes.
  This involves converting a continuous attribute into categorical bins.
  For example, temperatures between 60 and 70 could be labeled as "Moderate",
  temperatures between 71 and 80 as "Warm", and so on.

  2. Dynamic Binning:
  Instead of manually choosing bin thresholds,
  the decision tree algorithm can determine the best split points based on information gain or another criterion.
  For instance, if the algorithm determines that a temperature split at 73 provides the maximum information gain,
  then it can choose that as a binary decision point.
""")


  The current approach may not be suitable for this data because of the continuous attributes like "Temperature" and "Humidity".
  "Temperature" and "Humidity" are continuous numeric attributes.
  Using them directly in their raw form in the decision tree algorithm can be inefficient and potentially less accurate. 

  Here are some potential problems from my perspective:
  1. Infinite Split Points: 
  Continuous attributes can have potentially infinite split points. 
  For example, for "Temperature", you could split at 68.5, 68.6, 68.7, etc. 
  Determining the optimal split point among these infinite possibilities can be computationally expensive.

  2. Overfitting: 
  If we make decisions based on exact temperature values (e.g., Play = "No" if Temperature = 72), 
  we might overfit to the training data. 
  Overfitting makes the model memorize the training data rather than generalize from it, 
  which can reduce its performance on unseen data.

  And Here are some potential solution t