# DS-SF-36 | 14 | Trees | Codealong | Starter Code

In [59]:
import os

import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import math

## Part A | The 2008 Democratic Primaries

(dataset adapted from http://www.stat.ucla.edu/~cocteau/primaries.csv)

In [2]:
df = pd.read_csv(os.path.join('..', 'datasets', 'dataset-14-2008-democrat-primaries.csv'))

In [4]:
df.columns
#all the features

Index([u'fips', u'county_name', u'state_postal', u'region', u'election_date',
       u'racetype', u'tvotes', u'clinton', u'obama', u'edwards', u'margin',
       u'winner', u'POP05_SQMI', u'popUnder30_00', u'pop65up_00',
       u'presVote04', u'kerry04', u'Bush04', u'pres04margin', u'pres04winner',
       u'pop06', u'pop00', u'hisp06', u'white06', u'black06', u'indian06',
       u'asian06', u'hawaii06', u'mixed06', u'pct_less_30k', u'pct_more_100k',
       u'pct_hs_grad', u'pct_labor_force', u'pct_homeowner', u'unempFeb07',
       u'unempFeb08', u'unempChg', u'pctUnins00', u'subForPctHomes',
       u'poverty05', u'median_hhi05', u'Catholic', u'So.Bapt.Conv',
       u'Un.Methodist', u'E.L.C.A.', u'Construction', u'Manufacturing',
       u'FinancialActivities', u'GoodsProducing', u'ServiceProviding'],
      dtype='object')

In [13]:
df.winner.value_counts(dropna = False)

clinton    1210
obama      1031
NaN          20
Name: winner, dtype: int64

In [14]:
df['c'] = (df.winner == 'obama')

In [15]:
df.c

0        True
1       False
2        True
3       False
4       False
        ...  
2256    False
2257     True
2258     True
2259     True
2260    False
Name: c, Length: 2261, dtype: bool

In [16]:
df['c'] = (df.winner == 'obama') * 1.0

In [None]:
#c is for the class we want to predict

In [17]:
df.c

0       1.0
1       0.0
2       1.0
3       0.0
4       0.0
       ... 
2256    0.0
2257    1.0
2258    1.0
2259    1.0
2260    0.0
Name: c, Length: 2261, dtype: float64

### First cut: Is a county more than 20% black?

In [25]:
#this is feature engineering
#must use this format to create or else pandas doens't like it
df['pct_black06'] = df.black06 / df.pop06 * 100

In [26]:
df.columns

Index([u'fips', u'county_name', u'state_postal', u'region', u'election_date',
       u'racetype', u'tvotes', u'clinton', u'obama', u'edwards', u'margin',
       u'winner', u'POP05_SQMI', u'popUnder30_00', u'pop65up_00',
       u'presVote04', u'kerry04', u'Bush04', u'pres04margin', u'pres04winner',
       u'pop06', u'pop00', u'hisp06', u'white06', u'black06', u'indian06',
       u'asian06', u'hawaii06', u'mixed06', u'pct_less_30k', u'pct_more_100k',
       u'pct_hs_grad', u'pct_labor_force', u'pct_homeowner', u'unempFeb07',
       u'unempFeb08', u'unempChg', u'pctUnins00', u'subForPctHomes',
       u'poverty05', u'median_hhi05', u'Catholic', u'So.Bapt.Conv',
       u'Un.Methodist', u'E.L.C.A.', u'Construction', u'Manufacturing',
       u'FinancialActivities', u'GoodsProducing', u'ServiceProviding', u'c',
       u'pct_black06'],
      dtype='object')

In [28]:
#let's create a root dataset
parent_df = df

In [31]:
left_child_df = parent_df[parent_df.pct_black06 <= 20]

In [35]:
right_child_df = parent_df[parent_df.pct_black06 > 20]

#this is cumbersome and error-prone. Better way to do this below:

In [37]:
right_child_df = parent_df.drop(left_child_df.index)

In [41]:
right_child_df.index

Int64Index([   2,    3,    5,    6,    8,   11,   12,   17,   18,   20,
            ...
            2107, 2110, 2111, 2112, 2114, 2116, 2118, 2119, 2122, 2206],
           dtype='int64', length=451)

In [43]:
left_child_df.index

Int64Index([   0,    1,    4,    7,    9,   10,   13,   14,   15,   16,
            ...
            2251, 2252, 2253, 2254, 2255, 2256, 2257, 2258, 2259, 2260],
           dtype='int64', length=1810)

In [44]:
left_child_df

Unnamed: 0,fips,county_name,state_postal,region,election_date,...,FinancialActivities,GoodsProducing,ServiceProviding,c,pct_black06
0,1001,Autauga,AL,S,2/5/08,...,5.366229,26.776236,73.223764,1.0,17.210939
1,1003,Baldwin,AL,S,2/5/08,...,7.923872,21.282357,78.717643,0.0,9.636325
4,1009,Blount,AL,S,2/5/08,...,4.300316,34.129339,65.870661,0.0,1.545113
7,1015,Calhoun,AL,S,2/5/08,...,3.544540,24.312853,75.687147,0.0,19.643411
9,1019,Cherokee,AL,S,2/5/08,...,4.160713,37.984366,62.015634,0.0,5.461931
...,...,...,...,...,...,...,...,...,...,...,...
2256,56037,Sweetwater,WY,W,3/8/08,...,4.383050,42.670654,57.329346,0.0,0.895184
2257,56039,Teton,WY,W,3/8/08,...,5.465895,14.914934,85.085066,1.0,0.202198
2258,56041,Uinta,WY,W,3/8/08,...,5.170457,32.132498,67.867502,1.0,0.098946
2259,56043,Washakie,WY,W,3/8/08,...,6.323161,34.040552,65.959448,1.0,0.102315


In [45]:
right_child_df

Unnamed: 0,fips,county_name,state_postal,region,election_date,...,FinancialActivities,GoodsProducing,ServiceProviding,c,pct_black06
2,1005,Barbour,AL,S,2/5/08,...,3.379843,51.275520,48.724480,1.0,46.270988
3,1007,Bibb,AL,S,2/5/08,...,3.477562,42.557099,57.442901,0.0,21.902058
5,1011,Bullock,AL,S,2/5/08,...,3.523767,56.860099,43.139901,1.0,69.823950
6,1013,Butler,AL,S,2/5/08,...,3.390709,32.041909,67.958091,1.0,41.476608
8,1017,Chambers,AL,S,2/5/08,...,2.344602,42.755881,57.244119,1.0,37.841142
...,...,...,...,...,...,...,...,...,...,...,...
2116,51740,Portsmouth,VA,S,2/12/08,...,4.697560,18.229466,81.770534,1.0,52.328437
2118,51760,Richmond City,VA,S,2/12/08,...,8.382545,15.069478,84.930522,1.0,54.273170
2119,51770,Roanoke City,VA,S,2/12/08,...,6.429098,15.132792,84.867208,1.0,26.946435
2122,51800,Suffolk,VA,S,2/12/08,...,3.544776,20.760779,79.239221,1.0,41.147883


#### First cut/right node

In [None]:
# TODO

In [47]:
#pass in a df with wins and losses as coded as 1 and 0 per before
def obama_vs_clinton(df):
    obama = (df.c == 1).sum()
    clinton = (df.c == 0).sum()
    if obama > clinton:
        print 'Obama wins these counties {} to {}.'.format(obama, clinton)
    elif clinton > obama:
        print 'Clinton wins these counties {} to {}.'.format(clinton, obama)
    else:
        print 'Obama and Clinton tie in these counties {} {}.'.format(obama, clinton)

In [48]:
obama_vs_clinton(right_child_df)

Obama wins these counties 381 to 70.


### Second cut: Is high school graduation rate higher than 78%?

In [49]:
#lost track of Ivan here
parent_df = left_child_df

In [52]:
left_child_df = parent_df[parent_df.pct_hs_grad >= 78]
right_child_df = parent_df.drop(left_child_df.index)

In [53]:
obama_vs_clinton(left_child_df)

Obama and Clinton tie in these counties 0 0.


### Third cut: Is high school graduation rate higher than 87%?

In [None]:
# TODO

In [None]:
obama_vs_clinton(right_child_df)

## Part B | Building the 2008 Democratic Primaries Decision Tree by Hand

In [60]:
#back to finding entropy

class Node:

    @staticmethod
    def root(root_df):
        cs = sorted(set(root_df.c))
        return Node(cs, root_df)

    def decision(self, left_filter):
        # Collect the observations for which the decision split is true and
        # create the corresponding left node

        left_filter = left_filter(self.df)
        left_df = self.df[left_filter]
        self.left = Node(self.cs, left_df)

        # Same thing on the right side but for the observations that don't
        # satisfy the decision split (the "else")

        right_df = self.df.drop(left_df.index)
        self.right = Node(self.cs, right_df)

        # The entropy after the decision split is the weighted average of the
        # children entropy

        self.after = (self.left.samples * self.left.before
                      + self.right.samples * self.right.before) / self.samples

        # The information gain corresponds to the entropy lost between the
        # parent node (this node and the "before") and its child (the "after")

        self.information_gain = self.before - self.after

        return self

    def __init__(self, cs, df):
        self.cs = cs
        self.df = df

        # Counts of the remaining observations in the subspace per classes
        self.counts = [(self.df.c == c).sum() for c in self.cs]

        # Number of observations in the subspace
        self.samples = sum(self.counts)

        # For empty subspaces, probabilties and entropy are set to zero
        if self.samples == 0:
            self.probabilities = [.0 for count in self.counts]
            self.before = .0
        else:
            self.probabilities = [1. * count / self.samples for count in self.counts]
            self.before = - sum(map(lambda p: p * math.log(p, 2),
                                    filter(lambda p : p > .0, self.probabilities)))

    def status(self):
        print 'classes                       =', self.cs
        print 'before:'
        print "\tparent:"
        print "\t\tsamples       =", self.samples
        print "\t\tcounts        =", self.counts
        print "\t\tprobabilities =", self.probabilities
        print "\t\tentropy       =", self.before
        print 'after:'
        print "\tleft child:"
        print "\t\tsamples       =", self.left.samples
        print "\t\tcounts        =", self.left.counts
        print "\t\tprobabilities =", self.left.probabilities
        print "\t\tentropy       =", self.left.before
        print "\tright child:"
        print "\t\tsamples       =", self.right.samples
        print "\t\tcounts        =", self.right.counts
        print "\t\tprobabilities =", self.right.probabilities
        print "\t\tentropy       =", self.right.before
        print
        print 'before entropy                =', self.before
        print 'after entropy                 =', self.after
        print 'information gain              =', self.information_gain

In [61]:
df.c = df.winner

### First cut

In [62]:
#root node
node = Node.root(df)

#### Candidate #1: Is a county more than 20% black?

In [63]:
#2241 is sum of ci
#counts is counties per candidate
#pis are probablities

node.decision(lambda df: df.pct_black06 <= .2).status()

classes                       = [nan, 'clinton', 'obama']
before:
	parent:
		samples       = 2241
		counts        = [0, 1210, 1031]
		probabilities = [0.0, 0.5399375278893351, 0.46006247211066487]
		entropy       = 0.995392878882
after:
	left child:
		samples       = 177
		counts        = [0, 83, 94]
		probabilities = [0.0, 0.4689265536723164, 0.5310734463276836]
		entropy       = 0.997212189295
	right child:
		samples       = 2064
		counts        = [0, 1127, 937]
		probabilities = [0.0, 0.5460271317829457, 0.45397286821705424]
		entropy       = 0.993878647632

before entropy                = 0.995392878882
after entropy                 = 0.99414193941
information gain              = 0.00125093947178


#### Candidate #2: Is high school graduation rate higher than 78%?

In [None]:
# TODO

#### Candidate #3: Is high school graduation rate higher than 87%?

In [None]:
# TODO

### Second cut

In [None]:
# TODO

### Third cut

In [None]:
# TODO