# Decision Tree Classification

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

plt.rcParams['figure.figsize'] = (12,8)

## Importing the dataset

In [2]:
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [8]:
dataset.head()

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0


In [11]:
from math import log

def ShannonEntropy(data):
    labelCounts = {}
    
    for featVec in data:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
            labelCounts[currentLabel] += 1
    print(labelCounts)
    shannonEntropy = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / len(data)
        print(len(data))
        shannonEntropy -= prob * log(prob, 2) # logarithm base 2
    return shannonEntropy

In [12]:
ShannonEntropy(dataset)

{'e': 1, 'y': 1, 'd': 1}
400
400
400


0.06482892142331044

In [13]:
def splitDataset(data, axis, value):
    retData = []
    
    for featVec in data:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retData.append(reducedFeatVec)
    
    return retData

In [16]:
def chooseBestFeatureToSplit(data):
    numFeatures = len(data[0]) - 1
    baseEntropy = ShannonEntropy(data)
    bestInfoGain = 0.0
    bestFeature = -1
    
    for i in range(numFeatures):
    # Create unique list of class labels
        featList = [example[i] for example in data]
        uniqueVals = set(featList)
        newEntropy = 0.0
    
    # Calculate entropy for each split
    for value in uniqueVals:
        subData = splitDataset(data, i, value)
        prob = len(subData) / float(len(data))
        newEntropy += prob * ShannonEntropy(subData)
    infoGain = baseEntropy - newEntropy
    if (infoGain > bestInfoGain):
      # Find the best information gain
        bestInfoGain = infoGain
        bestFeature = 1
 
    return bestFeature

In [17]:
def createTree(data, labels):
    classList = [example[-1] for example in data]
    
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    
    if len(data[0]) == 1:
    # A function that returns the class that occurs with the greatest frequency
    # You can write your own
        return majorityCount(classList)
  
    bestFeat = chooseBestFeatureToSplit(data)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel:{}}
  
  # Get list of unique values
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in data]
    uniqueVals = set(featValues)
  
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataset(data, bestFeat, value), subLabels)
    
    return myTree

In [23]:
dataset.columnsumns

Index(['Age', 'EstimatedSalary', 'Purchased'], dtype='object')

In [24]:
fn = ['Age', 'EstimatedSalary']

In [27]:
createTree(dataset, fn)

KeyError: 0

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Feature Scaling

In [None]:
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)

## Training the Decision Tree Classification model on the Training set

In [None]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'gini', random_state = 10)
classifier.fit(X_train, y_train)

Visualizing a Decision Tree
--------

In [None]:
dataset.columns

In [None]:
import graphviz
from sklearn import tree

In [None]:
fn = ['Age', 'EstimatedSalary']
cn = ['Not Purchased', 'Purchased']

In [None]:
dot_data = tree.export_graphviz(classifier, out_file=None, 
                                feature_names=fn,  
                                class_names=cn,
                                filled=True)

# Draw graph
graph = graphviz.Source(dot_data, format="png") 
graph

In [None]:
graph.render("Decision_Tree_gini")

## Making the Confusion Matrix

In [None]:
y_pred = classifier.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [None]:
rf_preds = rf.predict(X_test)

In [None]:
print(confusion_matrix(y_test, rf_preds))
print(classification_report(y_test, rf_preds))