In [237]:
import pandas as pd
import numpy as np
import glob as glob
import json

In [238]:
root_path = 'features_by_valence/'
label_set = ['low', 'high']
low_files = glob.glob(root_path + label_set[0] + '/*')
high_files = glob.glob(root_path + label_set[1] + '/*')

features = {'low':[], 'high':[]}

all_features = []

for filename in low_files:
    word_histogram = json.loads(open(filename).read())
    features['low'].append(word_histogram)
    all_features.append({'dictionary': word_histogram, 'label': 'low'})
for filename in high_files:
    word_histogram = json.loads(open(filename).read())
    features['high'].append(word_histogram)
    all_features.append({'dictionary': word_histogram, 'label': 'high'})

In [239]:
n_total = int(len(all_features))
n_train = int(n_total * 0.8)
n_test = int(n_total * 0.2)

print('total: ' + str(n_total))
print('train: ' + str(n_train))
print('test: ' + str(n_test))

total: 252
train: 201
test: 50


In [240]:
label_counts = {}
label_counts['high'] = len(features['high'])
label_counts['low'] = len(features['low'])
label_counts

{'high': 109, 'low': 143}

In [241]:
# one hot encoding
ohe = {}
i_2_label = {}
i_2_label[0] = 'low'
i_2_label[1] = 'high'
ohe['high'] = 1
ohe['low'] = 0

ohe

{'high': 1, 'low': 0}

In [242]:
from random import shuffle
def train_word_embeddings(af):
    wes = {}
    for d_a_l_o in af[0:n_train]:
        dictionary = d_a_l_o['dictionary']
        label = d_a_l_o['label']
        for word in dictionary:
            word_count_in_document = dictionary[word]
            wes[word] = wes.get(word, np.zeros(len(ohe)))
            pos = ohe[label]
            wes[word][pos] = wes[word][pos] + (float(word_count_in_document)/float(label_counts[label]))
    return wes

wes = train_word_embeddings(all_features) 

In [243]:
wes['certain']

array([ 229.18181818,  117.1559633 ])

In [244]:
def classify(dictionary, wes):
    outcome = np.zeros(len(ohe))
    for word in dictionary:
        lookup = wes.get( word, np.zeros(len(ohe)) )
        norm_lookup = float(np.linalg.norm(lookup))
        for i, n in enumerate(lookup):
            label_count = label_counts[i_2_label[int(i)]]
            outcome[i] = outcome[i] + n/(norm_lookup*label_count)
    pi = np.argmax(outcome)
    oi = np.argmin(outcome)
    return [i_2_label[pi], (outcome[pi]/outcome[oi])]

In [245]:
classification, score = classify(features['high'][11], wes)
print(classification, score)

low 1.44699573209


In [251]:
accurate_surety = []
inaccurate_surety = []
def validate(af, wes):
    correct_preds = 0
    all_preds = 0
    for d_a_l_o in af[n_train:n_total]:
        dictionary = d_a_l_o['dictionary']
        label = d_a_l_o['label']
        classification, score = classify(dictionary, wes)
        if classification == label:
            accurate_surety.append(score)
            correct_preds = correct_preds + 1
        else:
            inaccurate_surety.append(score)
            
        all_preds = all_preds + 1
    return [float(correct_preds)/float(all_preds), np.average(accurate_surety), np.average(inaccurate_surety)]

In [252]:
def find_acc(data):
    for i in range(2):
        shuffle(all_features)
        wes = train_word_embeddings(all_features)
        print(validate(all_features, wes))
        
find_acc(all_features)

validating:  51


  


[0.6274509803921569, nan, nan]
validating:  51
[0.5882352941176471, nan, nan]
