In [4]:
import numpy as np
import matplotlib.pyplot as plt

# Step 1 - Loading in the Data

In [5]:
#Import data using np.loadtxt
clean_data = np.loadtxt('wifi_db/clean_dataset.txt')
noisy_data = np.loadtxt('wifi_db/noisy_dataset.txt')

In [6]:
#Explore clean data
print(clean_data)
print('length:', len(clean_data))
print('size:', clean_data.size)
print('shape:', clean_data.shape)

[[-64. -56. -61. ... -82. -81.   1.]
 [-68. -57. -61. ... -85. -85.   1.]
 [-63. -60. -60. ... -85. -84.   1.]
 ...
 [-62. -59. -46. ... -87. -88.   4.]
 [-62. -58. -52. ... -90. -85.   4.]
 [-59. -50. -45. ... -88. -87.   4.]]
length: 2000
size: 16000
shape: (2000, 8)


In [7]:
#Explore noisy data
print(noisy_data)
print('length:', len(noisy_data))
print('size:', noisy_data.size)
print('shape:', noisy_data.shape)

[[-59. -53. -51. ... -79. -87.   4.]
 [-66. -53. -59. ... -81. -79.   1.]
 [-41. -57. -63. ... -66. -65.   2.]
 ...
 [-57. -54. -56. ... -79. -82.   1.]
 [-56. -52. -50. ... -85. -88.   3.]
 [-46. -54. -47. ... -80. -73.   3.]]
length: 2000
size: 16000
shape: (2000, 8)


# Step 2 - Creating Decision trees

In [8]:
#Function to calculate information entropy - H(x)
def entropy(dataset):
    classes, counts = np.unique(dataset[:, -1], return_counts=True) #sorts y, and count number of each item in y
    probabilities = counts / len(classes) 
    return -np.sum(probabilities * np.log2(probabilities))

#Function to calculate information gain - IG
def information_gain(dataset, split_attribute, split_value):
    left_split = dataset[dataset[:, split_attribute] <= split_value]
    right_split = dataset[dataset[:, split_attribute] > split_value]
    p_left = len(left_split)/len(dataset)
    p_right = len(right_split)/len(dataset)
    gain = entropy(dataset) - (p_left * entropy(left_split) + p_right * entropy(right_split))
    return gain

In [26]:
def find_split(dataset):
    best_gain = 0
    best_attribute = None
    best_value = None
    
    n_features = dataset.shape[1] - 1  # Last column is the label
    for attribute in range(n_features):
        values = np.unique(dataset[:, attribute])
        for value in values:
            gain = information_gain(dataset, attribute, value)
            if gain > best_gain:
                best_gain = gain
                best_attribute = attribute
                best_value = value
    
    return best_attribute, best_value

In [27]:
def decision_tree_learning(dataset, depth=5):
    labels = dataset[:, -1]  # Last column is the label
    
    # Base case: if all labels are the same, return a leaf node
    if len(np.unique(labels)) == 1:
        return {'label': labels[0], 'depth': depth}
    
    # Otherwise, find the best split
    attribute, value = find_split(dataset)
    if attribute is None:
        # If no split is possible, return a leaf with the majority label
        unique, counts = np.unique(labels, return_counts=True)
        majority_label = unique[np.argmax(counts)]
        return {'label': majority_label, 'depth': depth}
    
    # Split the dataset
    left_split = dataset[dataset[:, attribute] <= value]
    right_split = dataset[dataset[:, attribute] > value]
    
    # Create a new decision node
    tree = {
        'attribute': attribute,
        'value': value,
        'depth': depth,
        'left': decision_tree_learning(left_split, depth + 1),
        'right': decision_tree_learning(right_split, depth + 1)
    }
    
    return tree

# Example: Load dataset and train a decision tree


# Function to print the tree (for understanding)
def print_tree(node, depth=0):
    if 'label' in node:
        print(f"{'|   ' * depth}Leaf: {node['label']}")
    else:
        print(f"{'|   ' * depth}[X{node['attribute']} <= {node['value']}]")
        print_tree(node['left'], depth + 1)
        print_tree(node['right'], depth + 1)


In [28]:
tree = decision_tree_learning(noisy_data,5)
print_tree(tree,8)

|   |   |   |   |   |   |   |   Leaf: 3.0
