In [25]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


def entropy(y):
    """Return the Shannon entropy (in bits) of the class labels in ``y``.

    Args:
        y: 1-D sequence or array of class labels. Labels may be of any type
            ``np.unique`` can sort (integers, strings, ...); they do not have
            to be non-negative integers.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct label. An empty ``y`` yields ``0.0``.
    """
    if len(y) == 0:
        return 0.0
    # np.unique reports only labels that actually occur, so every count is
    # positive and log2 never receives a zero probability.
    _, counts = np.unique(y, return_counts=True)
    probs = counts / len(y)
    return -np.sum(probs * np.log2(probs))


def info_gain(X, y, feature):
    """Return the information gain of splitting ``(X, y)`` on one column.

    Args:
        X: 2-D array; ``X[:, feature]`` is the column being evaluated.
        y: 1-D array of labels, indexed by boolean masks built from ``X``.
        feature: Column index into ``X``.

    Returns:
        ``entropy(y)`` minus the size-weighted entropy of the label subsets
        obtained by grouping rows on each distinct value of the column.
    """
    column = X[:, feature]
    total = len(y)
    # Each distinct value contributes its subset entropy weighted by the
    # fraction of rows that fall into that subset.
    weighted_entropy = sum(
        (len(part) / total) * entropy(part)
        for part in (y[column == level] for level in np.unique(column))
    )
    return entropy(y) - weighted_entropy


def id3(X, y, feature_names, depth=0):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Args:
        X: 2-D array of feature values, one column per entry of
            ``feature_names``.
        y: 1-D array of labels aligned with the rows of ``X``.
        feature_names: Names of the columns of ``X``, in column order.
        depth: Recursion depth; only passed on (incremented) to recursive
            calls and not otherwise used.

    Returns:
        A leaf label when all labels agree (``y[0]``) or when no features
        remain (the most common label). Otherwise a dict of the form
        ``{feature_name: {value: subtree_or_label, ...}}`` for the feature
        with the highest information gain.
    """
    # Leaf: every remaining sample carries the same label.
    if len(np.unique(y)) == 1:
        return y[0]

    # Leaf: nothing left to split on, fall back to the majority label.
    if len(feature_names) == 0:
        ranked = Counter(y).most_common()
        return ranked[0][0]

    # Choose the column whose split yields the largest information gain.
    gains = []
    for col in range(X.shape[1]):
        gains.append(info_gain(X, y, col))
    best = np.argmax(gains)
    split_name = feature_names[best]
    split_column = X[:, best]

    # The chosen feature is removed from both the names and the data that
    # are handed down to every child.
    remaining_names = [
        name for pos, name in enumerate(feature_names) if pos != best
    ]

    branches = {}
    for level in np.unique(split_column):
        rows = split_column == level
        child_X = np.delete(X[rows], best, axis=1)
        branches[level] = id3(child_X, y[rows], remaining_names, depth + 1)

    return {split_name: branches}


def predict(tree, sample, default=None):
    """Classify ``sample`` by walking a nested-dict decision tree.

    Args:
        tree: Either a leaf label, or a dict of the form
            ``{feature_name: {value: subtree_or_label, ...}}``.
        sample: Mapping from feature name to the sample's value for it.
        default: Value returned when the walk reaches a feature value that
            has no branch in the tree (or an empty dict node).

    Returns:
        The label stored at the leaf that is reached, or ``default`` if the
        sample's value has no matching branch.

    Raises:
        KeyError: If ``sample`` is a dict lacking a feature the tree tests.
    """
    node = tree
    # A non-dict node is a leaf. Checking this up front also covers a tree
    # that is a bare label rather than a dict.
    while isinstance(node, dict):
        if not node:
            return default
        # Each internal node holds exactly one feature; use its first entry.
        feature, branches = next(iter(node.items()))
        node = branches.get(sample[feature], default)
    return node


# Load the data set; the path is relative to the current working directory.
data = pd.read_csv("./tennis_data.csv")

# 'Day' is removed before training so it is never offered as a split feature
# (presumably a row identifier -- confirm against the CSV).
data = data.drop(columns=['Day'])

# Every column except the last is a feature; the last column is the target.
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
feature_names = list(data.columns[:-1])

# Use one encoder per feature column and a separate one for the target.
# Refitting a single shared encoder would discard each column's mapping and
# only decode the target correctly because the target happened to be fit last.
feature_encoders = {}
encoded_columns = []
for name, column in zip(feature_names, X.T):
    encoder = LabelEncoder()
    encoded_columns.append(encoder.fit_transform(column))
    feature_encoders[name] = encoder
X_encoded = np.array(encoded_columns).T

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

tree = id3(X_encoded, y_encoded, feature_names)
print("Decision Tree using ID3:\n", tree)

# Sample values are the integer codes produced by the encoders above;
# feature_encoders[name].classes_ lists the raw categories in code order.
sample = {'Outlook': 2, 'Temperature': 1, 'Humidity': 0, 'Wind': 1}
prediction = predict(tree, sample)

print("Prediction using ID3:", label_encoder.inverse_transform([prediction])[0])

# NOTE: the integer codes are fed to scikit-learn as ordinary numeric
# features. random_state is pinned so tie-breaking between equally good
# splits is reproducible from run to run.
clf = DecisionTreeClassifier(criterion="entropy", random_state=0)
clf.fit(X_encoded, y_encoded)

# Build the row in training-column order instead of relying on the sample
# dict's insertion order matching the CSV column order.
sample_row = [sample[name] for name in feature_names]
sklearn_pred = clf.predict([sample_row])
print("Prediction using Scikit-learn DecisionTreeClassifier:", label_encoder.inverse_transform(sklearn_pred)[0])

# Both accuracies below are measured on the training rows themselves, so
# they say nothing about performance on unseen data.
y_pred_id3 = [predict(tree, dict(zip(feature_names, row))) for row in X_encoded]
y_pred_sklearn = clf.predict(X_encoded)

accuracy_id3 = accuracy_score(y_encoded, y_pred_id3)
accuracy_sklearn = accuracy_score(y_encoded, y_pred_sklearn)

print(f"Accuracy of ID3: {accuracy_id3 * 100:.2f}%")
print(f"Accuracy of Scikit-learn Decision Tree: {accuracy_sklearn * 100:.2f}%")


Decision Tree using ID3:
 {'Outlook': {0: 1, 1: {'Wind': {0: 0, 1: 1}}, 2: {'Humidity': {0: 0, 1: 1}}}}
Prediction using ID3: No
Prediction using Scikit-learn DecisionTreeClassifier: No
Accuracy of ID3: 100.00%
Accuracy of Scikit-learn Decision Tree: 100.00%
