In [1]:
# Turn on IPython's "greedy" tab-completion option (IPCompleter.greedy).
%config IPCompleter.greedy=True

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the raw voting records from the working directory. The previews further
# down show a "Class Name" column followed by one column per vote, so the file
# is presumably read with a header row -- confirm against the data file.
df = pd.read_csv(filepath_or_buffer='./house-votes-84.data')

In [4]:
# Randomly draw 60% of the rows for training; random_state is the seed that
# makes the draw reproducible.
train = df.sample(frac=0.6, random_state=200)
# Everything that was not sampled becomes the held-out test split.
test = df.drop(index=train.index)

print(test.shape[0], train.shape[0])

174 261


In [5]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
176,republican,n,n,y,y,n,n,y,y,y,y,n,n,n,y,y,y
141,republican,n,n,n,y,y,y,y,y,y,y,n,y,y,y,?,y
59,republican,n,y,n,y,y,y,n,n,n,y,n,y,y,y,n,?
414,democrat,y,y,y,n,n,n,y,y,y,n,n,n,n,n,n,y
31,democrat,y,y,y,n,n,n,y,y,y,n,y,n,n,n,y,?


In [6]:
# Integer code assigned to each party label.
class_codes = dict(republican=1, democrat=2)
# The class label is taken to be the first column of the frame.
class_col = train.columns[0]

In [7]:
# Overwrite each party label in the training split with its integer code.
# Only `train` is modified; `test` keeps the original strings.
for class_value, class_code in class_codes.items():
    has_label = train[class_col] == class_value
    train.loc[has_label, class_col] = class_code

In [8]:
# Integer code for each vote symbol, numbered from 1 in the order listed.
feature_codes = {symbol: code for code, symbol in enumerate(("y", "n", "?"), start=1)}

In [9]:
# Overwrite the vote symbols in every column after the class column with their
# integer codes, one symbol at a time.
for feature_col in train.columns[1:]:
    for feature_value, feature_code in feature_codes.items():
        has_vote = train[feature_col] == feature_value
        train.loc[has_vote, feature_col] = feature_code

In [10]:
# Preview the training split again now that the loops above have replaced the
# class and vote strings with integer codes.
train.head()

Unnamed: 0,Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
176,1,2,2,1,1,2,2,1,1,1,1,2,2,2,1,1,1
141,1,2,2,2,1,1,1,1,1,1,1,2,1,1,1,3,1
59,1,2,1,2,1,1,1,2,2,2,1,2,1,1,1,2,3
414,2,1,1,1,2,2,2,1,1,1,2,2,2,2,2,2,1
31,2,1,1,1,2,2,2,1,1,1,2,1,2,2,2,1,3


In [11]:
from math import log2 as log
from functools import reduce

def E(republican, democrat):
    """Return the entropy, in bits, of a two-class split given its class counts.

    Parameters
    ----------
    republican, democrat : int
        Number of rows in each of the two classes.

    Returns
    -------
    0 when either count is zero (a pure or empty split); otherwise
    -(p * log2(p) + q * log2(q)), where p and q are the class proportions.
    """
    if republican == 0 or democrat == 0:
        return 0

    total = republican + democrat
    entropy = 0
    for share in (republican / total, democrat / total):
        entropy = entropy - log(share) * share
    return entropy

In [12]:
# Entropy of the class label over the whole training split, computed from the
# number of rows carrying class code 1 and class code 2.
n_code_1 = len(train[train[class_col] == 1])
n_code_2 = len(train[train[class_col] == 2])
E_y = E(n_code_1, n_code_2)
print(E_y)

0.9575534837147482


In [13]:
def IG(E_y, dataframe, feature_col):
    """Return the information gain obtained by splitting on ``feature_col``.

    Parameters
    ----------
    E_y : float
        Entropy of the class label before the split.
    dataframe : pandas.DataFrame
        Rows to evaluate. The first column is used as the class label, and
        only rows whose label equals 1 or 2 are counted in the entropies.
    feature_col : str
        Column whose distinct values define the branches of the split.

    Returns
    -------
    ``E_y`` minus the sum, over the distinct values of ``feature_col``, of
    (share of rows with that value) * (class entropy among those rows).
    """
    label_col = dataframe.columns[0]
    n_rows = len(dataframe)

    def weighted_entropy(level):
        # Rows on this branch, then the per-class counts within the branch.
        branch = dataframe[dataframe[feature_col] == level]
        n_code_1 = len(branch[branch[label_col] == 1])
        n_code_2 = len(branch[branch[label_col] == 2])
        return len(branch) / n_rows * E(n_code_1, n_code_2)

    return E_y - sum(weighted_entropy(level) for level in dataframe[feature_col].unique())
    

In [14]:
# Information gain of every column after the class column, keyed by column name.
IGf = {feature: IG(E_y, train, feature) for feature in train.columns[1:]}

# Track the best (feature, gain) pair. The scan starts from the first feature
# with a gain of 0, so a feature only takes over when its gain is strictly
# larger than the best seen so far.
maxIG = (train.columns[1], 0)
for feature, gain in IGf.items():
    if gain > maxIG[1]:
        maxIG = (feature, gain)

print(IGf)
print(maxIG)

{'handicapped-infants': 0.1176416037724366, 'water-project-cost-sharing': 0.0012126375879761842, 'adoption-of-the-budget-resolution': 0.43955210510622766, 'physician-fee-freeze': 0.7700317560269849, 'el-salvador-aid': 0.3729265099558966, 'religious-groups-in-schools': 0.12586911151567315, 'anti-satellite-test-ban': 0.14904155399272356, 'aid-to-nicaraguan-contras': 0.29199843197150277, 'mx-missile': 0.25557738563293275, 'immigration': 0.029326212431275755, 'synfuels-corporation-cutback': 0.10537232240302397, 'education-spending': 0.37709559571033646, 'superfund-right-to-sue': 0.1706384740507162, 'crime': 0.3742291625024132, 'duty-free-exports': 0.21077471226405964, 'export-administration-act-south-africa': 0.11168822358997943}
('physician-fee-freeze', 0.7700317560269849)


In [15]:
# Build a single-split decision tree on the feature with the highest
# information gain. The tree is a pair (feature name, {vote symbol: party}):
# each distinct value of the feature in the training split becomes a leaf
# labelled with the most frequent class among the rows taking that value.
decision_tree = (maxIG[0], dict())

for value in train[maxIG[0]].unique():
    # Class-code frequencies among the training rows on this branch.
    vc = train[train[maxIG[0]]==value][class_col].value_counts()

    # (class code, row count) of the majority class. Starts at class code 1
    # with a count of 0, so a branch with no counted rows falls back to code 1.
    prior = (1, 0)
    # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
    for code, count in vc.items():
        if count > prior[1]:
            prior = (code, count)

    # Translate the integer codes back to the original party / vote strings.
    prior_class = [ key for (key, val) in class_codes.items() if val == prior[0]][0]
    prior_value = [ key for (key, val) in feature_codes.items() if val == value][0]
    print("{}={} --> {}".format(maxIG[0], prior_value, prior_class))

    decision_tree[1][prior_value] = prior_class

print(decision_tree)

physician-fee-freeze=y --> republican
physician-fee-freeze=n --> democrat
physician-fee-freeze=? --> democrat
('physician-fee-freeze', {'y': 'republican', 'n': 'democrat', '?': 'democrat'})


In [16]:
# Classify every test row with the single-split tree: the row's vote on the
# split feature is looked up in the tree's {vote symbol: party} table.
# Series.map performs that lookup for the whole column at once instead of one
# .loc assignment per row.
predicted_col = "predicted_class"
split_feature, leaf_by_vote = decision_tree
predictions = test[split_feature].map(leaf_by_vote)

# A vote value with no leaf maps to NaN; raise as a plain dict lookup would
# rather than letting missing predictions drop out of the evaluation silently.
unmapped = predictions.isna()
if unmapped.any():
    raise KeyError(
        "no leaf in decision_tree for {} value(s): {}".format(
            split_feature, list(test.loc[unmapped, split_feature].unique())
        )
    )

test[predicted_col] = predictions

test

Unnamed: 0,Class Name,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa,predicted_class
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y,republican
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?,republican
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n,democrat
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y,democrat
6,democrat,n,y,n,y,y,y,n,n,n,n,n,n,?,y,y,y,republican
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429,democrat,y,n,y,n,?,n,y,y,y,y,n,y,n,?,y,y,democrat
430,republican,n,n,y,y,y,y,n,n,y,y,n,y,y,y,n,y,republican
431,democrat,n,n,y,n,n,n,y,y,y,y,n,n,n,n,n,y,democrat
432,republican,n,?,n,y,y,y,n,n,n,n,y,y,y,y,n,y,republican


In [17]:
# Confusion-matrix counts on the test split, with "republican" treated as the
# positive class and "democrat" as the negative class.
predicted_rep = test[predicted_col] == "republican"
predicted_dem = test[predicted_col] == "democrat"
actual_rep = test[class_col] == "republican"
actual_dem = test[class_col] == "democrat"

TP = len(test[predicted_rep & actual_rep])
TN = len(test[predicted_dem & actual_dem])
FP = len(test[predicted_rep & actual_dem])
FN = len(test[predicted_dem & actual_rep])

In [18]:
print(f"TP={TP}, TN={TN}, FP={FP}, FN={FN}")

# Denominators of the three scores, named for readability.
actual_positives = TP + FN
predicted_positives = TP + FP
all_scored = TP + TN + FP + FN

recall = TP / actual_positives        # share of actual positives that were found
precision = TP / predicted_positives  # share of positive predictions that were right
accuracy = (TP + TN) / all_scored     # share of all scored rows predicted correctly

print(f"recall={recall}, precision={precision}, accuracy={accuracy}")

TP=66, TN=98, FP=7, FN=3
recall=0.9565217391304348, precision=0.9041095890410958, accuracy=0.9425287356321839
