In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw training table and clean up its punctuation-laden header and cells.
# NOTE(review): `path` is an absolute, machine-specific directory; point it at your own copy
# of dt-data.txt before running.
path = '/home/matlongo/Downloads/homework1/'
dataset = pd.read_csv(path+'dt-data.txt')
# The last header cell is read as ' Enjoy)'. Its values carry one trailing character
# (presumably a ';' row terminator -- TODO confirm against the file) that is cut off here.
dataset['Enjoy'] = dataset[' Enjoy)'].apply(lambda enjoy: enjoy[:-1].strip())
# `axis` is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
dataset = dataset.drop(' Enjoy)', axis=1)
# The first header cell is read as '(Occupied'. The first four characters of every value
# (presumably a row-number prefix such as '01: ' -- TODO confirm) are cut off here.
# Re-adding the column this way moves 'Occupied' to the end of the frame.
dataset['Occupied'] = dataset['(Occupied'].apply(lambda occup: occup[4:].strip())
dataset = dataset.drop('(Occupied', axis=1)
# Trim surrounding whitespace from every header name and every cell.
dataset.columns = [c.strip() for c in dataset.columns]
dataset = dataset.applymap(lambda s: s.strip())
dataset

In [None]:
def get_entropy(dataset, target):
    """
    Compute the entropy of one column of a dataset from the relative frequency of each of its classes.
    - dataset: Pandas DataFrame holding all the samples.
    - target: string, name of the dataset's column whose entropy is wanted.

    Returns the entropy of the target column in the given dataset, measured with the natural logarithm.
    Raises Exception when target is not a column of dataset.
    """
    # Guard clause: the column has to exist in the dataset.
    if target not in dataset:
        raise Exception("The specified target is not present in the given dataset.")
    n_rows = dataset.shape[0]
    # Number of rows observed for each distinct class of the target column.
    class_counts = dataset[target].value_counts().values
    # Accumulate p * ln(p) class by class; the entropy is the negated total.
    weighted_logs = 0
    for count in class_counts:
        prob = float(count) / n_rows
        weighted_logs += prob * np.log(prob)
    return -weighted_logs

In [None]:
def get_info_gain_for_attr(dataset, prev_entropy, attr, target):
    """
    This function returns the Information Gain obtained if attribute attr were selected to split the
    dataset into one branch per distinct value of attr.
    - dataset: Pandas DataFrame containing all the dataset.
    - prev_entropy: Entropy from the previous level, necessary to calculate the Information gain. Type float.
    - attr: Attribute's name to be used for calculating the Information gain.
    - target: String representing the dataset's column name for which we want to calculate the entropy and
    make the predictions.

    Returns a tuple (gain, parameters):
    - gain: prev_entropy minus the size-weighted sum of the entropies of the branches.
    - parameters: dictionary keyed by each distinct value of attr; every entry is a dictionary with
    'dataset' (the rows having that value, with attr moved from the columns to the index) and
    'entropy' (the entropy of target within those rows).

    Raises Exception when attr or target is not a column of dataset.
    """
    # Sanity check
    if attr not in dataset:
        raise Exception("The specified attr is not present in the given dataset.")
    if target not in dataset:
        raise Exception("The specified target is not present in the given dataset.")

    # First of all we get all the possible values. For example, Yes and No, or High, Moderate and Low.
    possible_values = dataset[attr].drop_duplicates().values

    # Index the dataset by attr a single time. This is loop-invariant, so doing it here avoids
    # rebuilding the indexed frame for every possible value. It also takes attr out of the columns
    # of every sub-dataset selected below.
    indexed = dataset.set_index(attr)
    n_rows = dataset.shape[0]

    # Now we calculate the entropy for each possible value and accumulate it, weighted by the share
    # of rows that fall in that branch, in total_entropy.
    parameters = dict()
    total_entropy = 0
    for value in possible_values:
        # Portion of the dataset that only has this value in attr (list selector keeps a DataFrame).
        dataset_i = indexed.loc[[value]]
        # Entropy of the target inside this sub-dataset.
        entropy_i = get_entropy(dataset_i, target)
        # Contribution of this branch to the attribute's total entropy.
        total_entropy += float(dataset_i.shape[0]) / n_rows * entropy_i
        parameters[value] = {'dataset': dataset_i, 'entropy': entropy_i}
    return prev_entropy - total_entropy, parameters

In [None]:
def get_attr_to_split(dataset, target, prev_entropy):
    """
    Recursively build a decision tree for target by repeatedly splitting on the attribute with the
    highest Information Gain.
    - dataset: Pandas DataFrame with the samples that reach this node.
    - target: string, name of the column to predict.
    - prev_entropy: entropy of target in dataset, used as the baseline for the Information Gain.

    Returns a TreeNode. When no attribute is left, or every row has the same target value, the node
    is named "Leaf" and its class is the most frequent target value. Otherwise the node is named
    after the chosen attribute and has one child per distinct value of that attribute.

    NOTE(review): the attribute value that leads to each child is not stored on the nodes, so the
    returned tree records which attribute is tested but not which branch is which.
    """
    # Column order (rather than a set) so that ties between equal gains are resolved the same way
    # on every run.
    possible_attributes = [c for c in dataset.columns if c != target]
    target_counts = dataset[target].value_counts()
    # Trace of the class distribution at this node (kept from the original implementation).
    print(target_counts)
    # Cut condition
    if len(possible_attributes) == 0 or target_counts.shape[0] == 1:
        node = TreeNode("Leaf")
        # idxmax returns the label of the most frequent class; argmax returns its position on
        # current pandas, which is not the class.
        node.set_class(target_counts.idxmax())
        return node

    # None sentinel instead of -1: the first attribute examined is always accepted, so an attribute
    # is selected even if every gain is unusually low.
    max_gain = None
    max_params = None
    max_attr = None
    for attr in possible_attributes:
        gain, params = get_info_gain_for_attr(dataset, prev_entropy, attr, target)
        if max_attr is None or gain > max_gain:
            max_gain = gain
            max_params = params
            max_attr = attr

    node = TreeNode(max_attr)
    # One subtree per value of the chosen attribute; each branch's own entropy becomes the baseline
    # for the next level.
    children = [get_attr_to_split(branch['dataset'], target, branch['entropy'])
                for branch in max_params.values()]
    node.set_children(children)
    return node

In [None]:
# Build the tree for the 'Enjoy' label; the entropy of the whole dataset is the baseline for the
# first split.
# NOTE(review): get_attr_to_split instantiates TreeNode, whose defining cell appears further down.
# On a fresh kernel that cell has to be run first, otherwise this call raises NameError.
prev_entropy = get_entropy(dataset, 'Enjoy')
get_attr_to_split(dataset, 'Enjoy', prev_entropy)

In [None]:
class TreeNode(object):
    """
    Node of the hand-built decision tree.

    Attributes:
    - name: the label given at construction time (an attribute name, or "Leaf" for leaves).
    - children: whatever was passed to set_children; None until then.
    - class_: whatever was passed to set_class; None until then.
    """
    def __init__(self, attr_name):
        """Create a node labelled attr_name with no children and no class assigned."""
        self.children = None
        self.name = attr_name
        self.class_ = None

    def set_children(self, children):
        """Store the given collection of child nodes on this node."""
        self.children = children

    def set_class(self, class_):
        """Store the predicted class for this node."""
        self.class_ = class_

    def __repr__(self):
        """Short, non-recursive description so a displayed node is readable in notebook output."""
        n_children = 0 if self.children is None else len(self.children)
        return "TreeNode(name=%r, class_=%r, children=%d)" % (self.name, self.class_, n_children)

In [None]:
# Fit the hand-written tree on the dataset. TreeNode requires a name argument and has no fit()
# method, so the tree is built through get_attr_to_split, which returns the root TreeNode.
# NOTE(review): the name `tree` is rebound to the sklearn module by the import cell that follows.
target = 'Enjoy'
tree = get_attr_to_split(dataset, target, get_entropy(dataset, target))

In [None]:
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Reference model: an entropy-criterion sklearn decision tree fed by a one-hot encoding step.
cls = tree.DecisionTreeClassifier(criterion='entropy')
one_hot = OneHotEncoder()
pipeline = Pipeline([('encoder', one_hot), ('classifier', cls)])
encoder = LabelEncoder()

# Integer-encode every feature column on its own (the shared encoder is refitted per column).
# `.values.T` iterates over columns and the final `.T` restores one row per sample.
# `axis` is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
X = np.array([encoder.fit_transform(column) for column in dataset.drop('Enjoy', axis=1).values.T]).T

In [None]:
# Train the one-hot + decision-tree pipeline on the integer-encoded features, with the 'Enjoy'
# column as labels.
pipeline.fit(X, dataset['Enjoy'].values)

In [None]:
# Write the pipeline's 'classifier' step as Graphviz source to tree.dot.
# No feature names are passed here; the last cell of the notebook prints a mapping from
# x_<index> to column/value pairs.
tree.export_graphviz(pipeline.named_steps.classifier, out_file='tree.dot')  

In [None]:
# Render tree.dot to tree.png through the shell; requires the `dot` command to be available.
!dot -Tpng tree.dot -o tree.png

In [None]:
# Display the rendered tree image inside the notebook.
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
# Tall 12x18 figure with the axes hidden, so only the picture is shown.
plt.figure(figsize=(12,18))
plt.axis('off')
# Read the PNG written by the previous cell and draw it on the current figure.
img=mpimg.imread('tree.png')
imgplot = plt.imshow(img)
plt.show()

In [None]:
# Print one line per (column, value) pair, numbered consecutively as x_0, x_1, ...: columns are
# visited in dataset order and, within a column, values follow the encoder's classes_ order.
# `axis` is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
columns = dataset.drop('Enjoy', axis=1).columns
ax = 0  # running feature index across all columns
for column in columns:
    # Refit the shared encoder on this column only to obtain its classes_.
    encoder.fit_transform(dataset[column])
    for class_label in encoder.classes_:
        # Single-argument print() call: valid under both Python 2 and Python 3.
        print("x_"+str(ax)+": "+column+"_"+class_label)
        ax += 1