# Lab 3 - Decision Trees

## Agenda

- lambda functions (anonymous one-line functions; handy for quickly creating new features or transforming data)
- pandas `map`, `apply`
- itertools and counters
- classifiers (estimators) from sklearn


In [0]:
import itertools
from collections import Counter
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

### Motivation behind using lambdas with decision trees
### Motivation for enumerating sets (refer to gini/slides)



#### Lambda Function Review

Lambdas are one-line, anonymous (unnamed) functions. Write a function to find the reciprocal of a number.

In [0]:
# A regular, named function, shown for comparison with the lambda version
def fun(a):
  """Return the reciprocal 1/a of `a`."""
  reciprocal = 1 / a
  return reciprocal

In [0]:
# The same reciprocal computation written as an anonymous (lambda) function
x = lambda a: 1 / a

In [4]:
# Call the lambda through the name `x` it was assigned to
x(2)

0.5

#### Map and Apply in Pandas

- find the reciprocal of column 'E'
- find the reciprocal of all columns
- find the mean of all columns
- find the mean of each row

In [5]:
# Seed NumPy's global random generator so the dummy data is identical on
# every Restart & Run All
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# 100 rows x 5 columns (A-E) of random integers drawn from 1..99
dummy_data = pd.DataFrame(
    np.random.randint(1, 100, size=(100, 5)), columns=list('ABCDE')
)

dummy_data.head()

Unnamed: 0,A,B,C,D,E
0,74,23,96,19,57
1,41,79,29,87,55
2,89,50,35,68,65
3,70,68,63,41,45
4,9,44,18,25,50


In [6]:
# Reciprocal of column E, stored as a new column named '1/E'.
# Other spellings that produce the same column:
#   dummy_data['1/E'] = x(dummy_data['E'])   # reuses the lambda `x` defined earlier
#   dummy_data['1/E'] = 1/dummy_data['E']     # plain vectorized division

# .apply calls the lambda once per element of the column
dummy_data['1/E'] = dummy_data['E'].apply(lambda value: 1 / value)
dummy_data.head()

Unnamed: 0,A,B,C,D,E,1/E
0,74,23,96,19,57,0.017544
1,41,79,29,87,55,0.018182
2,89,50,35,68,65,0.015385
3,70,68,63,41,45,0.022222
4,9,44,18,25,50,0.02


In [7]:
# A lambda can take several arguments. It is bound to `add_three` rather than
# `sum` so that Python's built-in sum() is not shadowed for the rest of the
# kernel session.
add_three = lambda a, b, c: a + b + c
add_three(1, 2, 3)

6

In [8]:
# Row-wise mean: with axis=1, apply passes each row to the lambda as a Series.
# Every column present in dummy_data at this point takes part in the average.
dummy_data_new = dummy_data.apply(lambda row: np.mean(row), axis=1)

dummy_data_new.head()

0    44.836257
1    48.503030
2    51.169231
3    47.837037
4    24.336667
dtype: float64

In [9]:
# Inspect the type of the object produced by the row-wise apply
type(dummy_data_new)

pandas.core.series.Series

#### Itertools and counters

- iterable
- chain
- combinations

In [10]:
# range(7) is an iterable; iter() wraps it in an iterator that hands out one
# value per next() call
i = iter(range(7))

# The first two next() calls consume 0 and 1
print(next(i), next(i), sep="\n")

# Displaying the iterator shows its repr, not the values that remain
i

0
1


<range_iterator at 0x7f9bb335cfc0>

In [11]:
# Start from a fresh iterator and print every 3-element combination of 0..6
i = iter(range(7))
triples = itertools.combinations(i, 3)
for combo in triples:
    print(combo)

(0, 1, 2)
(0, 1, 3)
(0, 1, 4)
(0, 1, 5)
(0, 1, 6)
(0, 2, 3)
(0, 2, 4)
(0, 2, 5)
(0, 2, 6)
(0, 3, 4)
(0, 3, 5)
(0, 3, 6)
(0, 4, 5)
(0, 4, 6)
(0, 5, 6)
(1, 2, 3)
(1, 2, 4)
(1, 2, 5)
(1, 2, 6)
(1, 3, 4)
(1, 3, 5)
(1, 3, 6)
(1, 4, 5)
(1, 4, 6)
(1, 5, 6)
(2, 3, 4)
(2, 3, 5)
(2, 3, 6)
(2, 4, 5)
(2, 4, 6)
(2, 5, 6)
(3, 4, 5)
(3, 4, 6)
(3, 5, 6)
(4, 5, 6)


In [12]:
# combinations() accepts any iterable, here a 3-element tuple
z = (1, 5, 6)
[pair for pair in itertools.combinations(z, 2)]

[(1, 5), (1, 6), (5, 6)]

In [13]:
# `i` is rebuilt here, so it is a fresh iterator over 0..6 (nothing has been
# consumed yet) and all 35 three-element combinations are produced
i = iter(range(7))
[triple for triple in itertools.combinations(i, 3)]

[(0, 1, 2),
 (0, 1, 3),
 (0, 1, 4),
 (0, 1, 5),
 (0, 1, 6),
 (0, 2, 3),
 (0, 2, 4),
 (0, 2, 5),
 (0, 2, 6),
 (0, 3, 4),
 (0, 3, 5),
 (0, 3, 6),
 (0, 4, 5),
 (0, 4, 6),
 (0, 5, 6),
 (1, 2, 3),
 (1, 2, 4),
 (1, 2, 5),
 (1, 2, 6),
 (1, 3, 4),
 (1, 3, 5),
 (1, 3, 6),
 (1, 4, 5),
 (1, 4, 6),
 (1, 5, 6),
 (2, 3, 4),
 (2, 3, 5),
 (2, 3, 6),
 (2, 4, 5),
 (2, 4, 6),
 (2, 5, 6),
 (3, 4, 5),
 (3, 4, 6),
 (3, 5, 6),
 (4, 5, 6)]

Write a function to get the power set of a list. You can assume that it will fit in memory.

In [14]:
# First attempt: this builds one combinations object per subset size
# r = 0..len(s). The objects are lazy, so the resulting list holds iterator
# objects rather than the subsets themselves.
s = "abcd"
[itertools.combinations(s, r) for r in range(len(s) + 1)]

[<itertools.combinations at 0x7f9bb2812ea8>,
 <itertools.combinations at 0x7f9bb2812f48>,
 <itertools.combinations at 0x7f9bb283a048>,
 <itertools.combinations at 0x7f9bb283a0e8>,
 <itertools.combinations at 0x7f9bb283a188>]

In [15]:
# chain.from_iterable flattens the per-size combination iterators into a single
# stream, which list() then materializes as the full power set
s = "abcde"
list(itertools.chain.from_iterable(
    map(lambda r: itertools.combinations(s, r), range(len(s) + 1))))

[(),
 ('a',),
 ('b',),
 ('c',),
 ('d',),
 ('e',),
 ('a', 'b'),
 ('a', 'c'),
 ('a', 'd'),
 ('a', 'e'),
 ('b', 'c'),
 ('b', 'd'),
 ('b', 'e'),
 ('c', 'd'),
 ('c', 'e'),
 ('d', 'e'),
 ('a', 'b', 'c'),
 ('a', 'b', 'd'),
 ('a', 'b', 'e'),
 ('a', 'c', 'd'),
 ('a', 'c', 'e'),
 ('a', 'd', 'e'),
 ('b', 'c', 'd'),
 ('b', 'c', 'e'),
 ('b', 'd', 'e'),
 ('c', 'd', 'e'),
 ('a', 'b', 'c', 'd'),
 ('a', 'b', 'c', 'e'),
 ('a', 'b', 'd', 'e'),
 ('a', 'c', 'd', 'e'),
 ('b', 'c', 'd', 'e'),
 ('a', 'b', 'c', 'd', 'e')]

In [0]:
def power_set(s=["blue", "brown", "hazel", "green", "other"]):
  """Return the power set of `s` as a list of tuples.

  Subsets are ordered by size (empty tuple first, full set last) and, within
  one size, in the order itertools.combinations yields them.

  Parameters
  ----------
  s : iterable
      Items to build subsets from. It is copied into a list first, so
      iterables without len() (e.g. generators) are accepted too. The default
      list is only read, never mutated.

  Returns
  -------
  list of tuple
      All 2**n subsets for n items.
  """
  items = list(s)
  # Subset sizes run from 0 (the empty subset) up to and including len(items);
  # larger sizes would only produce empty iterators.
  return list(itertools.chain.from_iterable(
      itertools.combinations(items, r) for r in range(len(items) + 1)))

val = power_set()

In [8]:
# Display the power set built from power_set()'s default list
val

[(),
 ('blue',),
 ('brown',),
 ('hazel',),
 ('green',),
 ('other',),
 ('blue', 'brown'),
 ('blue', 'hazel'),
 ('blue', 'green'),
 ('blue', 'other'),
 ('brown', 'hazel'),
 ('brown', 'green'),
 ('brown', 'other'),
 ('hazel', 'green'),
 ('hazel', 'other'),
 ('green', 'other'),
 ('blue', 'brown', 'hazel'),
 ('blue', 'brown', 'green'),
 ('blue', 'brown', 'other'),
 ('blue', 'hazel', 'green'),
 ('blue', 'hazel', 'other'),
 ('blue', 'green', 'other'),
 ('brown', 'hazel', 'green'),
 ('brown', 'hazel', 'other'),
 ('brown', 'green', 'other'),
 ('hazel', 'green', 'other'),
 ('blue', 'brown', 'hazel', 'green'),
 ('blue', 'brown', 'hazel', 'other'),
 ('blue', 'brown', 'green', 'other'),
 ('blue', 'hazel', 'green', 'other'),
 ('brown', 'hazel', 'green', 'other'),
 ('blue', 'brown', 'hazel', 'green', 'other')]

In [21]:
# Work-in-progress sketch: try every subset in `val` as a candidate split on the
# 'eyecolor' column.
# NOTE(review): `df` is not defined in any cell visible in this notebook, so the
# loop cannot run as written. Confirm where `df` is supposed to come from.
for value in val:
  # NOTE(review): `value` is a tuple of categories, so `== value` / `!= value`
  # compare the column against the tuple itself instead of testing membership.
  # Presumably `.isin(value)` and its negation are intended. TODO confirm.
  dfresult = df[df['eyecolor'] == value]
  dfresult2 = df[df['eyecolor'] != value]
  # NOTE(review): `ginid1` and `ginid2` are referenced but never assigned in the
  # visible cells; unless they are defined elsewhere these lines raise NameError.
  # They look like placeholders for the Gini terms of the two partitions. TODO
  # confirm the intended formula before filling them in.
  ginid1 
  ginid2
  gini = 1-ginid1-ginid2
  
  
  print(value)

()
('red',)
('green',)
('blue',)
('red', 'green')
('red', 'blue')
('green', 'blue')
('red', 'green', 'blue')


#### Basic classification using sklearn


In [0]:
# Attach a random binary target (0 or 1) to every row. The array is sized from
# the frame itself, so the assignment stays valid if the row count changes.
dummy_data["target"] = np.random.randint(0, 2, size=len(dummy_data))
dummy_data.head()

Unnamed: 0,A,B,C,D,E,1/E,target
0,1,50,18,96,87,0.011494,0
1,99,98,59,69,18,0.055556,1
2,39,41,86,50,33,0.030303,1
3,73,10,90,39,26,0.038462,0
4,59,83,61,17,12,0.083333,1


In [0]:
# Features are columns A-D; the label is the `target` column
X = dummy_data[["A", "B", "C", "D"]]
y = dummy_data["target"]

# max_depth=4 is the stopping criterion: the tree may stop earlier but never
# grows deeper than 4 levels
clf = DecisionTreeClassifier(max_depth=4)
clf.fit(X, y)  # fit() trains the estimator in place and returns it

# Predictions are made on the same rows the tree was fitted on, so the number
# printed below is training accuracy
y_pred = clf.predict(X)

print(accuracy_score(y, y_pred))

0.8


In [0]:
# Show the raw array of predicted class labels
print(y_pred)

[0 1 1 0 1 0 0 1 0 1 0 1 1 1 0 0 0 1 1 1 0 0 0 0 0 1 0 0 1 1 1 0 0 0 1 1 0
 0 1 0 1 0 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 1 1 0 0 0 1 1 0 0 0 0 1 1 1 1 1 0
 0 0 0 1 0 1 1 1 0 1 0 1 0 1 1 0 0 1 0 1 0 0 1 0 0 1]
