# Classification Tree

In [1]:
import pandas as pd
import numpy as np
import glob as glob
import json
import sklearn
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from random import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score, explained_variance_score


import keras
from keras.layers import Dense, Dropout, RepeatVector, BatchNormalization, Convolution1D, Flatten, Lambda, Permute, MaxPooling1D, AlphaDropout
from keras.models import Sequential
from keras.utils import to_categorical
import keras.backend as K
from sklearn.model_selection import train_test_split
from keras.models import load_model

Using TensorFlow backend.


In [2]:
root_path = 'features_by_valence/'
label_set = ['low', 'high']
low_files = glob.glob(root_path + label_set[0] + '/*')
high_files = glob.glob(root_path + label_set[1] + '/*')

features = {'low':[], 'high':[]}

all_features = []

for filename in low_files:
    word_histogram = json.loads(open(filename).read())
    features['low'].append(word_histogram)
    all_features.append({'dictionary': word_histogram, 'label': 'low'})
for filename in high_files:
    word_histogram = json.loads(open(filename).read())
    features['high'].append(word_histogram)
    all_features.append({'dictionary': word_histogram, 'label': 'high'})

In [3]:
n_total = int(len(all_features))
n_train = int(n_total * 0.8)
n_test = int(n_total * 0.2)

print('total: ' + str(n_total))
print('train: ' + str(n_train))
print('test: ' + str(n_test))

total: 252
train: 201
test: 50


In [4]:
# one hot encoding

def word_list(percent_occurance):
    key_set = {}

    for feature_d in all_features:
        keys = feature_d['dictionary'].keys()
        for key in keys:
            key_set[key] = key_set.get(key, 0) + 1
    key_set
    # Return words that occur in the top percent_occurance% of documents
    key_set = dict([[key, key_set[key]] for key in key_set if (key_set[key]/float(n_total) > float(percent_occurance))])

    words_sorted_alphabetically = list(key_set.keys())
    words_sorted_alphabetically.sort()
    return words_sorted_alphabetically

In [5]:
def get_index(reference, key):
    try:
        return reference.index(key)
    except:
        return len(reference)

In [70]:
ohe = {}
i_2_label = {}

def one_hot_encode(reference, dictionary):
    ohe = np.zeros(len(reference) + 1)
    for key in dictionary.keys():
        pos = get_index(reference, key)
        ohe[pos] = dictionary[key]
    norm_lookup = float(np.linalg.norm(ohe))
    normalized_ohe = [float(item)/float(norm_lookup) for item in ohe]
    return normalized_ohe

In [7]:
def encode(y):
    if y == 'high':
        return 1
    else:
        return 0
    
def decode(y):
    if y == 1:
        return 'high'
    else:
        return 'low'

In [8]:
def validate(af):
    classifications = {'low':0, 'high':0}
    correct_preds = 0
    all_preds = 0
    for d_item in af:
        feature = d_item['ohe_feature']
        label = d_item['ohe_label']
        classification = clf.predict([feature])[0]
        classification = decode(classification)
        classifications[classification] = classifications[classification] + 1
        if classification == label:
            correct_preds = correct_preds + 1
        all_preds = all_preds + 1
    print(classifications)
    return float(correct_preds)/float(all_preds)

In [9]:
def preds_and_true(clf, af, ohe_words):
    correct_preds = 0
    all_preds = 0
    preds = []
    true = []
    for d_item in af:
        feature = d_item['ohe_feature']
        label = d_item['ohe_label']
        classification = clf.predict([feature])[0]
        preds.append(classification)
        true.append( label )
    return [preds, true]

In [10]:
def encode_features_and_words(all_features, word_threshold):
    ohe_words = word_list(word_threshold)
    
    for i, f in enumerate(all_features):
        ohe_feature = one_hot_encode(ohe_words, f['dictionary'])
        ohe_label = encode(f['label'])
        all_features[i]['ohe_feature'] = ohe_feature
        all_features[i]['ohe_label'] = ohe_label
    return [all_features, ohe_words]


In [71]:
def model(all_features, ohe_words):
    rounds = []
    for i in range(20):
        shuffle(all_features)
        
        train_features = all_features[0:n_train]
        test_features = all_features[n_train:n_total]
    
        X = [f['ohe_feature'] for f in train_features]
        y = [f['ohe_label'] for f in train_features]
        
        clf = GradientBoostingClassifier(random_state=0, learning_rate=0.005)
        clf.fit(X, y)
        
        preds, true = preds_and_true(clf, train_features, ohe_words)
        train_precision = precision_score(true, preds)
        train_acc = accuracy_score(true, preds)
        tr_ev = explained_variance_score(true,preds)
        
        preds, true = preds_and_true(clf, test_features, ohe_words)
        test_precision = precision_score(true, preds)
        test_acc = accuracy_score(true, preds)
        te_ev = explained_variance_score(true, preds)
        
        rounds.append({
            'train_precision': train_precision, 
            'train_acc': train_acc, 
            'tr_ev': tr_ev,
            'test_precision': test_precision,
            'test_acc': test_acc, 
            'te_ev': te_ev
            
        })
    totals = {}
    output = {}
    for ro in rounds: 
        for k in ro:
            ro_total_key = 'total-' + k
            totals[ro_total_key] = totals.get(ro_total_key, 0) + ro[k]
    for k in totals:
        ou_av_key = 'av-' + k
        output[ou_av_key] = (totals[k]/float(len(rounds)))
    return output
        

In [72]:
n = 0.25
all_features, ohe_words = encode_features_and_words(all_features, n)
model(all_features, ohe_words)

{'av-total-te_ev': -0.64557735201626365,
 'av-total-test_acc': 0.53921568627450966,
 'av-total-test_precision': 0.38383470695970695,
 'av-total-tr_ev': 0.60164987608211518,
 'av-total-train_acc': 0.88781094527363202,
 'av-total-train_precision': 0.99683335480454038}

In [13]:
# clf = GradientBoostingClassifier(random_state=0, learning_rate=0.005)
# clf.fit(X, y)

preds, true = preds_and_true(clf, train_features, ohe_words)
train_precision = precision_score(true, preds)
train_acc = accuracy_score(true, preds)
tr_ev = explained_variance_score(true,preds)

preds, true = preds_and_true(clf, test_features, ohe_words)
test_precision = precision_score(true, preds)
test_acc = accuracy_score(true, preds)
te_ev = explained_variance_score(true, preds)
print(str(n) + '.)' + "\n")
print({'train_precision': train_precision, 
    'train_acc': train_acc, 
    'tr_ev': tr_ev,
    'test_precision': test_precision,
    'test_acc': test_acc, 
    'te_ev': te_ev})

NameError: name 'X' is not defined

In [73]:
# n = 0.001
len(word_list(0.25))

2822

# Let's see if we can use deep learning to do the same thing

In [74]:
def get_train_test(all_features, n=0.25):
    all_features, ohe_words = encode_features_and_words(all_features, n)

    shuffle(all_features)

    train_features = all_features[0:n_train]
    test_features = all_features[n_train:n_total]

    X = [f['ohe_feature'] for f in train_features]
    y = [f['ohe_label'] for f in train_features]
    
    x_val = [f['ohe_feature'] for f in test_features]
    y_val = [f['ohe_label'] for f in test_features]
    return [X, y, x_val, y_val]

In [50]:
# 0.25 is a solid value for n. Lower values mean letting in larger amounts of words (features)
# Once validation accuracy is 66.67%, save the weights. That's the max. 
n = 0.25
X, y, x_val, y_val = get_train_test(all_features, n)

In [75]:
shape = len(word_list(n)) + 1
shape

2823

In [106]:
shape = len(word_list(n)) + 1

m1 = Sequential([
    BatchNormalization(input_shape=(shape,)),
    Dense(40, activation='relu'),
    Dropout(0.5),
    BatchNormalization(),
    Dense(10, activation='relu'),
    Dropout(0.5),
    BatchNormalization(center=True),
    Dense(1, activation='sigmoid')   
])

m1.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

sched = [[0.0001, 2], [0.001, 20], [0.01, 2], [0.1, 2], [0.5, 1], [0.1, 5], [0.01, 20], [0.001, 40], [0.0001, 80], [0.00005, 120]]


In [107]:
for info in sched:
    lr, epochs = info
    m1.optimizer.lr = lr
    m1.fit(np.array(X), np.array(y), epochs=epochs,  batch_size=64, validation_data=(np.array(x_val), np.array(y_val)))

Train on 201 samples, validate on 51 samples
Epoch 1/2
Epoch 2/2
Train on 201 samples, validate on 51 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 201 samples, validate on 51 samples
Epoch 1/2
Epoch 2/2
Train on 201 samples, validate on 51 samples
Epoch 1/2
Epoch 2/2
Train on 201 samples, validate on 51 samples
Epoch 1/1
Train on 201 samples, validate on 51 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 201 samples, validate on 51 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 201 samples, validate on 51 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epo

Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Train on 201 samples, validate on 51 samples
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80


Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80
Train on 201 samples, validate on 51 samples
Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120


Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120


Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78/120
Epoch 79/120
Epoch 80/120
Epoch 81/120
Epoch 82/120
Epoch 83/120
Epoch 84/120
Epoch 85/120
Epoch 86/120
Epoch 87/120
Epoch 88/120
Epoch 89/120
Epoch 90/120
Epoch 91/120
Epoch 92/120
Epoch 93/120
Epoch 94/120
Epoch 95/120
Epoch 96/120
Epoch 97/120
Epoch 98/120
Epoch 99/120
Epoch 100/120
Epoch 101/120
Epoch 102/120
Epoch 103/120
Epoch 104/120
Epoch 105/120
Epoch 106/120
Epoch 107/120
Epoch 108/120
Epoch 109/120
Epoch 110/120
Epoch 111/120
Epoch 112/120
Epoch 113/120
Epoch 114/120
Epoch 115/120
Epoch 116/120
Epoch 117/120
Epoch 118/120
Epoch 119/120
Epoch 120/120
