In [254]:
import pandas as pd
import numpy as np
import glob as glob
import json
from random import shuffle

In [255]:
root_path = 'features_by_valence/'
label_set = ['low', 'high']
low_files = glob.glob(root_path + label_set[0] + '/*')
high_files = glob.glob(root_path + label_set[1] + '/*')

features = {'low':[], 'high':[]}

all_features = []

for filename in low_files:
    word_histogram = json.loads(open(filename).read())
    features['low'].append(word_histogram)
    all_features.append({'dictionary': word_histogram, 'label': 'low'})
for filename in high_files:
    word_histogram = json.loads(open(filename).read())
    features['high'].append(word_histogram)
    all_features.append({'dictionary': word_histogram, 'label': 'high'})

In [256]:
n_total = int(len(all_features))
n_train = int(n_total * 0.8)
n_test = int(n_total * 0.2)

print('total: ' + str(n_total))
print('train: ' + str(n_train))
print('test: ' + str(n_test))

total: 252
train: 201
test: 50


In [257]:
label_counts = {}
label_counts['high'] = len(features['high'])
label_counts['low'] = len(features['low'])
label_counts

{'high': 109, 'low': 143}

In [258]:
# one hot encoding
ohe = {}
i_2_label = {}
i_2_label[0] = 'low'
i_2_label[1] = 'high'
ohe['high'] = 1
ohe['low'] = 0

ohe

{'high': 1, 'low': 0}

In [369]:
def train_word_embeddings(af):
    wes = {}
    for d_a_l_o in af:
        dictionary = d_a_l_o['dictionary']
        label = d_a_l_o['label']
        for word in dictionary:
            word_count_in_document = dictionary[word]
            wes[word] = wes.get(word, np.zeros(len(ohe)))
            pos = ohe[label]
            wes[word][pos] = wes[word][pos] + (float(word_count_in_document)/float(label_counts[label]))
    return wes

In [409]:
def classify(dictionary, wes, label_counts):
    outcome = np.zeros(len(ohe))
    for word in dictionary:
        word_count_in_document = dictionary[word]
        lookup = wes.get( word, np.zeros(len(ohe)) )
        norm_lookup = float(np.linalg.norm(lookup))
        for i, n in enumerate(lookup):
            label_count = label_counts[i_2_label[int(i)]]
            outcome[i] = outcome[i] + (n*word_count_in_document)/(norm_lookup*norm_lookup)
    pi = np.argmax(outcome)
    oi = np.argmin(outcome)
    label = i_2_label[pi]
    score = (float(outcome[pi])/float(outcome[oi]))
    if score < 2.1:
        label = 'high'
    else:
        label = 'low'
    return [label, score]

In [410]:
scores = []
for fd in features['high']:
    c, score = classify(fd, wes, label_counts)
#     print(c)
    scores.append(score)
print(np.average(scores))

scores = []
for fd in features['low']:
    c, score = classify(fd, wes, label_counts)
#     print(c)
    scores.append(score)
print(np.average(scores))

1.66914953962
2.26330473542


In [411]:
accurate_surety = []
inaccurate_surety = []
def validate(af, wes,label_counts):
    classifications = {'low':0, 'high':0}
    correct_preds = 0
    all_preds = 0
    for d_item in af:
        my_dict = d_item['dictionary']
        my_label = d_item['label']
        [classification, score] = classify(my_dict, wes,label_counts)
        classifications[classification] = classifications[classification] + 1
        if classification == my_label:
            correct_preds = correct_preds + 1
        all_preds = all_preds + 1
    print(classifications)
    return float(correct_preds)/float(all_preds)

In [412]:
dictionary = all_features[4]['dictionary']

validate([{'dictionary': dictionary, 'label': 'high'}], wes,label_counts)


{'low': 0, 'high': 1}


1.0

In [413]:
def find_acc(data,wes,label_counts):
    for i in range(10):
        shuffle(all_features)
        wes = train_word_embeddings(all_features[0:n_train])
        print(validate(all_features[n_train:n_total], wes, label_counts))
        
find_acc(all_features, wes, label_counts)

  if __name__ == '__main__':


{'low': 51, 'high': 0}
0.49019607843137253
{'low': 51, 'high': 0}
0.5686274509803921
{'low': 50, 'high': 1}
0.5294117647058824
{'low': 51, 'high': 0}
0.5882352941176471
{'low': 51, 'high': 0}
0.5686274509803921
{'low': 51, 'high': 0}
0.5490196078431373
{'low': 51, 'high': 0}
0.5294117647058824
{'low': 51, 'high': 0}
0.6078431372549019
{'low': 51, 'high': 0}
0.5882352941176471
{'low': 51, 'high': 0}
0.6862745098039216
