In [None]:
name= "Mohammad Ghafourian"
student_number = "99106493"

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

In [2]:
# (10 Points)
def entropy(y: pd.Series):
    """
    return the entropy of input
    """
    unique, counts = np.unique(y, return_counts=True)
    probs = counts / len(y)
    entropy = -np.sum(probs * np.log2(probs))
    return entropy


def information_gain(x: pd.Series, y: pd.Series):
    """
    return the information gain of x
    """
    unique = x.unique()
    entropy_parent = entropy(y)
    weighted_sum = 0
    for value in unique:
        subset_y = y[x == value]
        entropy_child = entropy(subset_y)
        weighted_sum += (len(subset_y) / len(y)) * entropy_child
    info_gain = entropy_parent - weighted_sum
    return info_gain


def information_gains(X: pd.DataFrame, y: pd.Series):
    """
    return the information gain of all features
    """
    info_gains = {}
    for feature in X.columns:
        info_gains[feature] = information_gain(X[feature], y)
    return info_gains

In [3]:
# you can add any variable or function to class if you need.
class Node:
    def __init__(self, depth):
      # Each node in the tree is an instance of class `Node` which is capable of predicting and fitting.
        self.depth = depth
        self.best_feature = ''
        self.children = []
        self.threashold = None
        self.choice = None

    def _is_leaf(self):
        return len(self.children) == 0

    def fit(self, X_train, y_train):
        """
        learn the best_feature and create the children of this node
        - In the `fit` function this node gets features and labels from its father and using information gain decides which feature to use.
         Also based on the decided class it will create its children and call their fit function passing relevant features and labels.
        """
        ########## [Your Code] ##########
        info_gains = information_gains(X_train, y_train)
        if max(info_gains.values()) == 0:
            self.choice = y_train.value_counts().idxmax()
            return
        self.best_feature = max(info_gains, key=info_gains.get)
        unique_values = X_train[self.best_feature].unique()
        for value in unique_values:
            child = Node(self.depth + 1)
            child.best_feature = None
            child.threshold = None
            child.choice = None
            child_data = X_train[X_train[self.best_feature] == value]
            child_labels = y_train[X_train[self.best_feature] == value]
            if len(child_data) == 0:
                child.choice = y_train.value_counts().idxmax()
                self.children.append(child)
                continue
            if len(child_labels.unique()) == 1:
                child.choice = child_labels.unique()[0]
                self.children.append(child)
                continue
            self.children.append(child)
            child.fit(child_data, child_labels)

    def predict(self, X):
        """
        predicte the class of X based on this node best_feature
        - In the `predict` function this node gets features as input and based on its best_feature decides on this input. If this node is a leaf,
         it will return the decision imediatly and if it's not a leaf, it will return the prediction of its decided child.
        """
        ########## [Your Code] ##########
        if self.choice is not None:
            return self.choice
        feature_value = X[self.best_feature]
        for child in self.children:
            if feature_value == child.threshold:
                return child.predict(X)
        return self.children[0].predict(X)
class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.root = None

    def fit(self, X_train, y_train):
        self.root = Node(0)
        self.root.fit(X_train, y_train)

    def predict(self, X_test):
        y_pred = []
        for _, row in X_test.iterrows():
            y_pred.append(self.root.predict(row))
        return pd.Series(y_pred)


In [4]:
df = pd.read_csv("diabetes.csv")
X = df.drop(columns=['Outcome'])
y = df['Outcome']
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:

#################################
#   train dt on you train set   #
#          Your Code            #
#################################
tree = DecisionTree(max_depth=15)

# Fit the model
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)

In [6]:
#############################
#   report model accuracy   #
#        Your Code          #
#############################
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.6428571428571429
