### DT2

In [1]:
import pandas as pd

In [3]:
data = pd.read_csv('income.csv')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [4]:
data.describe()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


### Converting to numerical values

In [19]:
col = pd.Categorical(data["workclass"])
data["workclass"] = col.codes


for name in["education", "marital-status","occupation","relationship","race","gender","native-country","income"]:
    col = pd.Categorical(data[name])
    data[name] = col.codes
    
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,4,226802,1,7,4,7,3,2,1,0,0,40,39,0
1,38,4,89814,11,9,2,5,0,4,1,0,0,50,39,0
2,28,2,336951,7,12,2,11,0,4,1,0,0,40,39,1
3,44,4,160323,15,10,2,7,0,2,1,7688,0,40,39,1
4,18,0,103497,15,10,4,0,3,4,0,0,0,30,39,0


### Split

In [20]:
private_incomes = data[data["workclass"] == 4]
public_incomes = data[data["workclass"] != 4]

print(private_incomes.shape)
print(public_incomes.shape)

(33906, 15)
(14936, 15)


### Entropy

In [23]:
import math

entropy = -(0.4 * math.log(0.4,2) + 0.6 * math.log(0.6,2))
print(entropy)

prob_0 = float(data[data["income"] == 0].shape[0]) / data.shape[0]
print(prob_0)

prob_1 = float(data[data["income"] == 1].shape[0]) / data.shape[0]
print(prob_1)

income_entropy = -(prob_0 * math.log(prob_0, 2) + prob_1 * math.log(prob_1, 2))
print(income_entropy)

0.9709505944546686
0.7607182343065395
0.23928176569346055
0.7938438393644257


In [29]:
import numpy

def calc_entropy(column):
    counts = numpy.bincount(column)
    probabilities = counts / float(len(column))
    entropy = 0
    for prob in probabilities:
        if prob > 0:
            entropy = entropy + prob*math.log(prob, 2)
    return entropy

entropy =calc_entropy([1,1,0,0,1])
print(entropy)

information_gain = entropy - ((.8 * calc_entropy([1,1,0,0])) + (.2 * calc_entropy([1])))
print(information_gain)

income_entropy = calc_entropy(data["income"])

median_age = data["age"].median()

left_split = data[data["age"] <= median_age]
right_split = data[data["age"] > median_age]

age_information_gain = income_entropy - ((float(left_split.shape[0]) / data.shape[0]) *calc_entropy(left_split["income"]) + ((float(right_split.shape[0]) / data.shape[0]) * calc_entropy(right_split["income"])))
print(age_information_gain)

-0.9709505944546686
-0.17095059445466854
-0.0461824901133866


### Best Split

In [38]:
def calc_information_gain(data, split_name, target_name):
    original_entropy = calc_entropy(data[target_name])
    column = data[split_name]
    median = column.median()
    
    left_split = data[column <= median]
    right_split = data[column > median]
    
    to_subtract = 0
    for subset in [ left_split, right_split]:
        prob = float(subset.shape[0]) / data.shape[0]
        to_subtract = to_subtract + prob*calc_entropy(subset[target_name])
        return original_entropy - to_subtract

    
    
age_information_gain = calc_information_gain(data, "age", "income")
print("age_information_gain:", age_information_gain)

columns = ["age", "workclass", "educational-num", "marital-status", "occupation", "relationship", "race", "gender", "hours-per-week", "native-country"]
information_gains = []


for col in columns:
    information_gain = calc_information_gain(data, col, "income")
    information_gains.append(information_gain)


highest_gain_index = information_gains.index(max(information_gains))
highest_gain = columns[highest_gain_index]

print("highest_gain_index:", highest_gain_index)
print("highest_gain:", highest_gain)
    

age_information_gain: -0.501964210723583
highest_gain_index: 6
highest_gain: race
