# Classification Tree

In [49]:
import pandas as pd
import numpy as np
import glob as glob
import json
import sklearn
from random import shuffle


In [3]:
root_path = 'features_by_valence/'
label_set = ['low', 'high']
low_files = glob.glob(root_path + label_set[0] + '/*')
high_files = glob.glob(root_path + label_set[1] + '/*')

features = {'low':[], 'high':[]}

all_features = []

for filename in low_files:
    word_histogram = json.loads(open(filename).read())
    features['low'].append(word_histogram)
    all_features.append({'dictionary': word_histogram, 'label': 'low'})
for filename in high_files:
    word_histogram = json.loads(open(filename).read())
    features['high'].append(word_histogram)
    all_features.append({'dictionary': word_histogram, 'label': 'high'})

In [4]:
n_total = int(len(all_features))
n_train = int(n_total * 0.8)
n_test = int(n_total * 0.2)

print('total: ' + str(n_total))
print('train: ' + str(n_train))
print('test: ' + str(n_test))

total: 252
train: 201
test: 50


In [40]:
# one hot encoding

key_set = {}

for feature_d in all_features:
    keys = feature_d['dictionary'].keys()
    for key in keys:
        key_set[key] = key_set.get(key, 0) + 1
key_set
# Return words that occur in over 10% of documents
key_set = dict([[key, key_set[key]] for key in key_set if key_set[key] > n_total/10.0])

In [41]:
def get_index(reference, key):
    try:
        return reference.index(key)
    except:
        return len(reference)

In [67]:
ohe = {}
i_2_label = {}

words_sorted_alphabetically = list(key_set.keys())
words_sorted_alphabetically.sort()

def one_hot_encode(reference, dictionary):
    ohe = np.zeros(len(reference) + 1)
    for key in dictionary.keys():
        pos = get_index(reference, key)
        ohe[pos] = dictionary[key]
    norm_lookup = float(np.linalg.norm(ohe))
    normalized_ohe = [float(item)/float(norm_lookup) for item in ohe]
    return normalized_ohe

In [68]:
one_hot_encode(words_sorted_alphabetically, features['high'][0])

[0.21958733622942003,
 6.403830161254595e-05,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.004674796017715854,
 0.001088651127413281,
 0.0018571107467638323,
 0.0007044213177380054,
 0.0,
 0.0,
 0.00038422980967527565,
 0.0005123064129003676,
 0.0,
 0.0,
 0.0,
 0.0003201915080627297,
 0.0005763447145129135,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0007044213177380054,
 0.0019211490483763782,
 6.403830161254595e-05,
 6.403830161254595e-05,
 0.0,
 6.403830161254595e-05,
 6.403830161254595e-05,
 6.403830161254595e-05,
 6.403830161254595e-05,
 0.0,
 0.0017290341435387405,
 0.0,
 0.0,
 0.0007684596193505513,
 0.0,
 0.0,
 0.0,
 0.06711214008994815,
 6.403830161254595e-05,
 6.403830161254595e-05,
 0.0004482681112878216,
 0.0001280766032250919,
 0.0,
 6.403830161254595e-05,
 6.403830161254595e-05,
 0.00019211490483763782,
 0.0004482681112878216,
 0.0024334554612767457,
 0.0,
 0.0,
 0.0006403830161254594,
 0.004866910922553491,
 0.0,
 0.004162489604815486,
 0.003329991683852389,
 0.0026896086677269298,
 0.0,
 0.00032019

In [83]:
def encode(y):
    if y == 'high':
        return 1
    else:
        return 0
    
def decode(y):
    if y == 1:
        return 'high'
    else:
        return 'low'

In [102]:
from sklearn.ensemble import RandomForestClassifier

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [103]:
clf.feature_importances_

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [104]:
clf.predict([one_hot_encode(ohe_words, features['low'][0])])[0]

0

In [105]:
def validate(af):
    classifications = {'low':0, 'high':0}
    correct_preds = 0
    all_preds = 0
    for d_item in af:
        my_dict = d_item['dictionary']
        my_label = d_item['label']
        classification = clf.predict([one_hot_encode(ohe_words, my_dict)])[0]
        classification = decode(classification)
        classifications[classification] = classifications[classification] + 1
        if classification == my_label:
            correct_preds = correct_preds + 1
        all_preds = all_preds + 1
    print(classifications)
    return float(correct_preds)/float(all_preds)

In [106]:
validate(test_features)

{'low': 34, 'high': 17}


0.49019607843137253

In [111]:
def preds_and_true(af):
    classifications = {'low':0, 'high':0}
    correct_preds = 0
    all_preds = 0
    preds = []
    true = []
    for d_item in af:
        my_dict = d_item['dictionary']
        my_label = d_item['label']
        
        classification = clf.predict([one_hot_encode(ohe_words, my_dict)])[0]
        
        preds.append(classification)
        true.append( encode(my_label) )
    return [preds, true]

In [108]:
preds, true = preds_and_true(test_features)

In [109]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score


array([[20, 12],
       [14,  5]])

In [114]:
def model():
    for i in range(10):
        shuffle(all_features)

        train_features = all_features[0:n_train]
        test_features = all_features[n_train:n_total]

        X = [one_hot_encode(ohe_words, f['dictionary']) for f in train_features]
        y = [encode(f['label']) for f in train_features]

        clf = RandomForestClassifier(max_depth=2, random_state=0)
        clf.fit(X, y)

        preds, true = preds_and_true(test_features)
        print(confusion_matrix(true, preds))
        print(accuracy_score(true, preds))

In [None]:
model()