In [124]:
import numpy as np
import pandas as pd


In [125]:
"""
  Extend the functionality of a Decision Tree Classifier with numpy integration:
  Basic Idea comes from: https://github.com/random-forests/tutorials/blob/master/decision_tree.ipynb
  Numpy and Pandas Extrension from: https://github.com/Nir-J/Decision_tree_ID3/blob/master/dtree.ipynb
  Added Depth, min_sample_leafs and min_sample_Split from: https://medium.datadriveninvestor.com/easy-implementation-of-decision-tree-with-python-numpy-9ec64f05f8ae
  
  Decision Tree Regression Idea from: https://austindavidbrown.github.io/post/2019/01/regression-decision-trees-in-python/
"""

'\n  Extend the functionality of a Decision Tree Classifier with numpy integration:\n  Basic Idea comes from: https://github.com/random-forests/tutorials/blob/master/decision_tree.ipynb\n  Numpy and Pandas Extrension from: https://github.com/Nir-J/Decision_tree_ID3/blob/master/dtree.ipynb\n  Added Depth, min_sample_leafs and min_sample_Split from: https://medium.datadriveninvestor.com/easy-implementation-of-decision-tree-with-python-numpy-9ec64f05f8ae\n  \n  Decision Tree Regression Idea from: https://austindavidbrown.github.io/post/2019/01/regression-decision-trees-in-python/\n'

In [126]:
"""
  Decision Tree:
  @Input: 
    - DataFrame with size: (rows,cols)
    - the last column is the target label 
    - Label is either numeric (class number) or string type ("Apple")
    - features are of either numeric or string type
"""

'\n  Decision Tree:\n  @Input: \n    - DataFrame with size: (rows,cols)\n    - the last column is the target label \n    - Label is either numeric (class number) or string type ("Apple")\n    - features are of either numeric or string type\n'

In [127]:
class Utils:
  def __init__(self):
    pass

  def unique_vals(df,col):
    """ find unique values for a column in a dataset """
    return np.unique(df.loc[:,col])
  def class_counts(df):
    """ counts the number of each class of target label in a dataset """
    return np.unique(df.iloc[:,-1],return_counts=True)

  def is_numeric(val):
    """ test if a value is numeric """
    return isinstance(val,np.int64) or isinstance(val,np.float) or isinstance(val,np.int) or isinstance(val,int)

In [128]:
class Question:
  def __init__(self,col_num,val):
    self.col_num = col_num
    self.eval_val = val
  
  def match(self,df_row):
    """
      @input: 1-d DataFrame row
    """
    test_val = df_row[self.col_num] 
    if Utils.is_numeric(test_val):
      # we choose ">=" as split criterion for numeric values
      return test_val >= self.eval_val
    else:
      # we choose "==" as split criterion for string values
      return test_val == self.eval_val

  def partition(self,rows):
    """partition the dataset with given question
    """
    true_rows,false_rows = [],[]
    for row in rows:
      if self.match(df_row):
        true_rows.append(row)
      else:
        false_rows.append(row)
    return true_rows,false_rows

  def df_partition(self,rows):
    """@input: pd.DataFrame
    @return: pd.DataFrame
    """
    if Utils.is_numeric(self.eval_val):
      true_rows,false_rows = rows[rows.iloc[:,self.col_num]>=self.eval_val],rows[rows.iloc[:,self.col_num]<self.eval_val]
    else:
      #print(rows[self.col_num]==self.eval_val)
      true_rows,false_rows = rows[rows.iloc[:,self.col_num]==self.eval_val],rows[~(rows.iloc[:,self.col_num]==self.eval_val)]
    
    return true_rows,false_rows

  def classify(self,row):
    """@Input:pandas.Series"""
    if Utils.is_numeric(self.eval_val):
      return row.iloc[self.col_num]>=self.eval_val
    else:
      #print(rows[self.col_num]==self.eval_val)
      return row.iloc[self.col_num]==self.eval_val


  def __repr__(self):
    criterion = "=="
    if Utils.is_numeric(self.eval_val):
      criterion = ">="
    return f"Is {df.columns[self.col_num]} {criterion} {self.eval_val} "

In [129]:
class Gain:
  def gini(rows):
    """@input: pd.DataFrame
    """
    names,counts = Utils.class_counts(rows)
    impurity = 1.0
    prob_lbl = np.sum((counts/float(len(rows)))**2)
    impurity -= prob_lbl

    return impurity

  def entropy(rows):
    names,counts = Utils.class_counts(rows)
    entropy = 0
    prob_of_lbl = np.sum(-counts/float(len(rows))*np.log2(counts/float(len(rows))))
    entropy -= prob_of_lbl
    return entropy

  def info_gain_gini(left,right,current_uncertainty):
    """
      @input:
        left: true rows
        right: false rows
        current_uncertainty: gini of current rows
    """
    p = float(len(left)) / (len(left)+len(right))
    return current_uncertainty - p*Gain.gini(left) -(1-p)*Gain.gini(right)

  def info_gain_entropy(left,right,current_uncertainty):
    """
      @input:
        left: true rows
        right: false rows
        current_uncertainty: entropy of current rows
    """
    p = float(len(left))/(len(left)+len(right)) # percent of left
    return current_uncertainty - p*Gain.entropy(left) -(1-p)*Gain.entropy(right)

In [130]:
class Leaf:
    """A Leaf node classifies data.

    This holds a dictionary of class (e.g., "Apple") -> number of times
    it appears in the rows from the training data that reach this leaf.
    """

    def __init__(self, rows): 
      counts = np.column_stack(Utils.class_counts(rows)) # == list(zip(v1,v2))
      self.predictions = {row[0]:row[1] for row in counts} # create dict
class Node:
    """A Decision Node tests a condition.

    This holds a reference to the condition, and to the two child nodes.
    """

    def __init__(self,condition,
              true_branch,
              false_branch):
        self.condition = condition
        self.true_branch = true_branch
        self.false_branch = false_branch

In [131]:
class DecisionTreeClassifierMy:
  def __init__(self,max_depth=32,min_sample_split=2):
    self.max_depth = max_depth
    self.min_sample_split = min_sample_split

  def train(self,rows):
    """ build a tree with given data"""
    self.root = self._build_tree(rows,0)
    return self.root

  def find_best_split(self,rows):
    """
    Find the best question to ask by iterating over every feature / value
    and calculating the information gain.
    
    @input:
      rows: pd.DataFrame
    """
    best_gain = 0  # keep track of the best information gain
    best_question = None  # keep train of the feature / value that produced it
    current_uncertainty = Gain.gini(rows)
    n_features = len(rows.columns)-1  # number of features

    for col in range(n_features):  # for each feature

        values = np.unique(rows.iloc[:,col]) # unique values in the column
        for val in values:  
            question = Question(col, val)
            # try splitting the dataset
            true_rows, false_rows = question.df_partition(rows)

            # Skip this split if it doesn't divide the dataset.
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue

            # Calculate the information gain from this split
            gain = Gain.info_gain_gini(true_rows, false_rows, current_uncertainty)

            if gain >= best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question

  def _build_tree(self,rows,depth):
    """
      Recurvisely split trees and test condition

      @input: pd.DataFrame
    """
    # if number of samples is less than required, we cannot split
    if df.shape[0] < self.min_sample_split:
      return Leaf(rows)

    gain, question = self.find_best_split(rows)

    # at the recursion end: no gain obtained
    # or depth violated
    # --> return leaf
    if gain == 0 or depth == self.max_depth:
        return Leaf(rows)

    # partition recursively with best condition and pass "depth" control
    true_rows, false_rows = question.df_partition(rows)
    true_branch = self._build_tree(true_rows,depth+1) # node true
    false_branch = self._build_tree(false_rows,depth+1) # node false

    print("splitting currend node depth %d"%(depth))

    return Node(question, true_branch, false_branch)



  def predict_one(self,row):
    """ return prediction for single row """
    prediction = self._classify(row,self.root)
    return prediction["label"]

  def predict(self,rows):
    predictions = []
    for idx in rows.index:
      print(idx)
      predictions.append(self.predict_one(rows.loc[idx,:]))
    return predictions


  def _classify(self,row,node):
    # base case: at a leaf node
    if isinstance(node,Leaf):
      maxcount = 0
      maxlabel = None
      sum = 0
      
      for k,v in node.predictions.items():
        sum+=v
        
        if v>=maxcount:
          maxcount=v
          maxlabel=k
      return {"label":maxlabel,"with probability":float(maxcount/sum)}

    # climb down the tree with condition
    if isinstance(node,Node):
      condition = node.condition.classify(row)
      
      if condition:
        return self._classify(row,node.true_branch) # need add return at each branch if we want a return at the end
      else:
        return self._classify(row,node.false_branch)
    

  def print_tree(self,node, spacing=""):
    """World's most elegant tree printing function."""

    # Base case: we've reached a leaf
    if isinstance(node, Leaf):
        print (spacing + "Predict", node.predictions)
        return

    # Print the question at this node
    print (spacing + str(node.condition))

    # Call this function recursively on the true branch
    print (spacing + '--> True:')
    self.print_tree(node.true_branch, spacing + "  ")

    # Call this function recursively on the false branch
    print (spacing + '--> False:')
    self.print_tree(node.false_branch, spacing + "  ")

In [132]:
class Metrics:
  def accuracy(preds,labels):
    """Calculate acc given two numpy arrays in percent"""
    return np.mean(preds == labels)*100

  def confusion_matrix(preds,labels):
    """Calculate the confusion matrix
      @properties:
        Maximal 10 classes
        if class=2: TP,FP matrix
    """
    matrix = np.zeros((10,10))
    for i in range(len(preds)):
      matrix[preds[i],labels[i]]+=1
    return matrix

  def precision_and_recall(preds,labels):
    """Returns individual Precion and Recall values of each class
    
      @Properties:
        Precision: TP/TP+FP
        Recall: TP/TP+FN
        f1: 2*P*R/P+R

      @Confusion Matrix:class=2
      TP|FP
      FN|FP
        
    """
    matrix = Metrics.confusion_matrix(preds,labels)
    r,p =[],[]
    for i in range(10):
      TP = float(matrix[i,i])
      TP_FP = np.sum(matrix[i,:])
      TP_FN = np.sum(matrix[:,i])

      recall = np.nan if TP_FN == 0 else TP/TP_FN
      precision = np.nan if TP_FP == 0 else TP/TP_FP
      r.append(recall)
      p.append(precision)
    
    return (p, r)

  def precision_and_recall_score(preds,labels):
    """Calculate Macro Version of Precision and Recall"""
    p,r = Metrics.precision_and_recall(preds,labels)
    p_score,r_score = 0,0
    last_class_index = 0
    for i,val in enumerate(p):
      if ~np.isnan(val): 
        p_score += p[i]
        if i>last_class_index:
          last_class_index = i
    for i,val in enumerate(r):
      if ~np.isnan(val):
        r_score += r[i]
        if i>last_class_index:
          last_class_index = i
    class_count = last_class_index +1  
        
    return float(p_score/class_count),float(r_score/class_count)

  def f1score(preds, labels):
    """Calculates macro f1 score given two numpy arrays"""
    
    precision,recall = Metrics.precision_and_recall_score(preds,labels)
    return 2*precision*recall/(precision+recall)

In [133]:
# test_metrics
from sklearn.metrics import precision_score,recall_score,f1_score
a = [0,1,0,1,1,2]
b = [0,1,1,1,1,3]
print(Metrics.confusion_matrix(a,b))
print(Metrics.precision_and_recall(a,b))
print(Metrics.precision_and_recall_score(a,b))
print(Metrics.f1score(a,b))
print(precision_score(a,b,average='macro'))
print(recall_score(a,b,average='macro'))
print(f1_score(a,b,average='macro'))


[[1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 3. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
([0.5, 1.0, 0.0, nan, nan, nan, nan, nan, nan, nan], [1.0, 0.75, nan, 0.0, nan, nan, nan, nan, nan, nan])
(0.375, 0.4375)
0.40384615384615385
0.4375
0.375
0.38095238095238093


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# test main

In [134]:
# test_main

_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 3, 'Apple'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
] 

df = pd.DataFrame(_data,columns=["color", "diameter", "label"])
print(df.shape)
test_df = pd.DataFrame([['Green', 3, 'Apple']],columns=["color", "diameter", "label"])


print(df.loc[:,"color"])
print(["red","green"]!="green")

q1 = Question(1,2)
"""
true,false =q1.df_partition(df)
print(true)
print(false)

no_mixing = [['Apple'],['Orange']]
no = pd.DataFrame(no_mixing)
gg = Gain()
print(gg.gini(no))

print(Gain.gini(df))

dt = DecisionTreeClassifier()
mytree = dt.train(df)
print(mytree)
dt.print_tree(mytree)

print(dt.predict(test_df))
"""

(5, 3)
0     Green
1    Yellow
2       Red
3       Red
4    Yellow
Name: color, dtype: object
True


"\ntrue,false =q1.df_partition(df)\nprint(true)\nprint(false)\n\nno_mixing = [['Apple'],['Orange']]\nno = pd.DataFrame(no_mixing)\ngg = Gain()\nprint(gg.gini(no))\n\nprint(Gain.gini(df))\n\ndt = DecisionTreeClassifier()\nmytree = dt.train(df)\nprint(mytree)\ndt.print_tree(mytree)\n\nprint(dt.predict(test_df))\n"

In [135]:
# generate random samples
def generate_random_regression(n_samples=100,n_features=10):
  """ implement: y=kx+b+noise """
  intercept = 5*np.ones(n_features)
  B = 3*np.ones((n_features,1))
  X = np.zeros((n_samples,n_features))
  for i in range(0,n_samples):
    X[i,:] = np.random.multivariate_normal(np.zeros(n_features),10*np.identity(n_features)) #mean=0,cov=identity matrix
  noise = np.random.multivariate_normal(np.zeros(n_features),np.identity(n_features)) 
  print(X.shape,B.shape)
  y = intercept + X@B + noise # @ = matmul

  #train test split
  n_train = int(.7*n_samples)
  I = np.arange(0,n_samples)
  train_idx = np.random.choice(I,n_train,replace=False)
  test_idx = np.setdiff1d(I,train_idx)

  return X[train_idx,:],y[train_idx],X[test_idx,:],y[test_idx]

def generate_random_classifier(n_samples=100,n_features=10):
  """ implement: y=kx+b+noise """
  intercept = 5*np.ones(n_features)
  B = 3*np.ones((n_features,1))
  X = np.zeros((n_samples,n_features))
  for i in range(0,n_samples):
    X[i,:] = np.random.multivariate_normal(np.zeros(n_features),10*np.identity(n_features)) #mean=0,cov=identity matrix
  y = np.random.randint(low=0,high=2,size=(n_samples,1)) # @ = matmul/dot in 2-D

  #train test split
  n_train = int(.7*n_samples)
  I = np.arange(0,n_samples)
  train_idx = np.random.choice(I,n_train,replace=False)
  test_idx = np.setdiff1d(I,train_idx)

  return X[train_idx,:],y[train_idx],X[test_idx,:],y[test_idx]

xtrain,ytrain,xtest,ytest = generate_random_classifier()


In [136]:
df = pd.read_csv("winequality-white.csv",delimiter=";")
df.head()

n_samples = df.shape[0]
n_train = int(.7*n_samples)
I = np.arange(0,n_samples)
train_idx = np.random.choice(I,n_train,replace=False)
test_idx = np.setdiff1d(I,train_idx)

xtrain,ytrain,xtest,ytest = df.iloc[train_idx,:-1],df.iloc[train_idx,-1],df.iloc[test_idx,:-1],df.iloc[test_idx,-1]

In [137]:
dt = DecisionTreeClassifierMy(max_depth=10)
mytree = dt.train(df.iloc[train_idx])
# dt.print_tree(mytree)

splitting currend node depth 6
splitting currend node depth 5
splitting currend node depth 4
splitting currend node depth 3
splitting currend node depth 9
splitting currend node depth 9
splitting currend node depth 8
splitting currend node depth 9
splitting currend node depth 8
splitting currend node depth 7
splitting currend node depth 6
splitting currend node depth 9
splitting currend node depth 8
splitting currend node depth 7
splitting currend node depth 6
splitting currend node depth 5
splitting currend node depth 9
splitting currend node depth 8
splitting currend node depth 9
splitting currend node depth 9
splitting currend node depth 8
splitting currend node depth 7
splitting currend node depth 6
splitting currend node depth 7
splitting currend node depth 9
splitting currend node depth 8
splitting currend node depth 9
splitting currend node depth 8
splitting currend node depth 7
splitting currend node depth 6
splitting currend node depth 5
splitting currend node depth 4
splittin

# myClassfier

In [138]:
row = xtrain.iloc[0]
print(row)
row.iloc[0]

fixed acidity             6.90000
volatile acidity          0.54000
citric acid               0.26000
residual sugar           12.70000
chlorides                 0.04900
free sulfur dioxide      59.00000
total sulfur dioxide    195.00000
density                   0.99596
pH                        3.26000
sulphates                 0.54000
alcohol                  10.50000
Name: 4639, dtype: float64


6.9

In [143]:
ytrain

4639    6
1412    8
1857    5
1829    7
783     8
       ..
943     7
3005    6
2331    5
1041    5
3475    6
Name: quality, Length: 3428, dtype: int64

In [139]:
y_pred = dt.predict(xtrain)

4639
1412
1857
1829
783
1361
637
3738
1798
3694
4062
3702
4413
2436
3448
203
1229
223
4471
155
1475
1837
2748
4619
1330
670
361
4188
3405
2530
216
1830
2861
4104
884
3021
3176
3393
346
2348
4780
3034
3235
1122
2864
775
976
1048
3945
1924
434
2011
3462
1682
555
2485
3059
4473
4863
1303
2429
4446
1090
4267
535
610
4552
2643
1079
3958
4491
4865
1577
4235
1120
4132
2449
605
3368
2795
4089
24
3837
890
4205
1308
3861
2495
3498
3992
1678
4177
2788
1159
158
895
1716
1296
4291
4754
4278
4125
13
4087
4401
4741
4756
2303
1770
1083
4897
1790
4253
3447
4103
4574
2020
2645
2154
3273
3314
2032
3487
3894
1823
436
947
2600
2707
2300
1874
2362
3800
3009
1128
2629
3803
4628
409
3941
3269
1075
3908
862
1194
3337
4166
3427
4882
4614
4747
2672
4638
131
1012
649
4268
2700
3111
2454
488
1038
4701
828
3508
4057
2842
1803
1334
1163
2871
1454
998
3715
4451
2903
2474
4313
2891
4644
964
3177
1143
2778
2666
1648
4822
4435
1204
2024
3580
4269
1721
169
3900
1752
4090
817
3415
3233
3012
532
246
464
806
3647
3924
2708


In [None]:
y_pred

In [141]:
from sklearn.tree import DecisionTreeClassifier


dt = DecisionTreeClassifier(max_depth=10)
dt.fit(xtrain,ytrain)


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=10, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [142]:
y_pred_sklearn = dt.predict(xtrain)
skacc = Metrics.accuracy(y_pred_sklearn,ytrain)
myacc = Metrics.accuracy(y_pred,ytrain)

print(skacc,myacc)

74.32905484247374 74.38739789964994
