In [1]:
import numpy as np
import random
from sklearn.metrics import accuracy_score

In [2]:
from athnlp.readers.brown_pos_corpus import BrownPosTag

In [3]:
corpus = BrownPosTag()

In [6]:
sentence.y

[6, 1, 0, 3, 8, 0, 3, 7, 8, 0, 3, 8, 5]

In [7]:
sentence.x

[795, 612, 356, 17, 1006, 12268, 10, 78, 164, 5029, 78, 1568, 15]

In [10]:
def sentence_to_vectors(sentence):
    word_vector_size = len(sentence.dictionary.x_dict)
    label_vector_size = len(sentence.dictionary.y_dict)
    word_vectors = []
    label_vectors = []
    for label_idx, word_idx in zip(sentence.y, sentence.x):
        word_vector = np.zeros(word_vector_size)
        word_vector[word_idx] = 1
        word_vectors.append(word_vector)
        label_vector = np.zeros(label_vector_size)
        label_vector[label_idx] = 1
        label_vectors.append(label_vector)
    return word_vectors, label_vectors

In [36]:
def extract_word_features(word):
    word_features = []
    
    feature_functions = [
       lambda w: w.endswith('ed'),
       lambda w: w.isdigit(),
       lambda w: w[0].isupper(),
       # get creative!
    ]
    
    for feature_fn in feature_functions:
        if feature_fn(word):
            word_features.append(1)
        else:
            word_features.append(0)
    
    return word_features

In [37]:
for word in ['hello', 'baked', '123', 'Abc']:
    print(extract_word_features(word))

[0, 0, 0]
[1, 0, 0]
[0, 1, 0]
[0, 0, 1]


In [32]:
def super_ouzo_sentence_to_vectors(sentence):
    word_vector_size = len(sentence.dictionary.x_dict)
    label_vector_size = len(sentence.dictionary.y_dict)
    word_vectors = []
    label_vectors = []
    for i in range(len(sentence.x)):
        word_idx = sentence.x[i]
        word_vector = np.zeros(word_vector_size)
        word_vector[word_idx] = 1
        word = sentence.dictionary.x_dict.get_label_name(word_idx)
        word_features = extract_word_features(word)
        word_vector = np.hstack([word_vector, word_features])
        word_vectors.append(word_vector)
        
        label_vector = np.zeros(label_vector_size)
        label_idx = sentence.y[i]
        label_vector[label_idx] = 1
        label_vectors.append(label_vector)
    return word_vectors, label_vectors

In [14]:
def sentence_to_vectors_multihot(sentence):
    word_vector_size = len(sentence.dictionary.x_dict)
    label_vector_size = len(sentence.dictionary.y_dict)
    word_vectors = []
    label_vectors = []
    for i in range(len(sentence.x)):
        word_idx = sentence.x[i]
        word_vector = np.zeros(word_vector_size)
        word_vector[word_idx] = 1
        
        try:
            word_vector[sentence.x[i+1]] = 0.05
        except IndexError:
            pass
        
        try:
            word_vector[sentence.x[i-1]] = 0.05
        except IndexError:
            pass
        
        try:
            word_vector[sentence.x[i+2]] = 0.01
        except IndexError:
            pass
            
        try:
            word_vector[sentence.x[i-2]] = 0.01
        except IndexError:
            pass
        
        word_vectors.append(word_vector)
        
        label_vector = np.zeros(label_vector_size)
        label_idx = sentence.y[i]
        label_vector[label_idx] = 1
        label_vectors.append(label_vector)
    return word_vectors, label_vectors

In [15]:
def prepare_data(dataset, sentence_to_vector_mapping_fn=sentence_to_vectors):
    word_vectors = []
    label_vectors = []
    for sentence in dataset:
        tmp_word_vectors, tmp_label_vectors = sentence_to_vector_mapping_fn(sentence)
        word_vectors.extend(tmp_word_vectors)
        label_vectors.extend(tmp_label_vectors)
    return word_vectors, label_vectors

In [16]:
def predict(word_vector, weight_vectors):
    dot_products = calculate_dotproducts(word_vector, weight_vectors)
    return np.argmax(dot_products)

In [17]:
def calculate_dotproducts(word_vector, weight_vectors):
    dot_products = []
    for weight_vector in weight_vectors:
        result = np.dot(weight_vector, word_vector)
        dot_products.append(result)
    return dot_products

In [18]:
def train(word_vectors, label_vectors, weight_vectors):
    for word_vec, label_vec in zip(word_vectors, label_vectors):

        predicted_label_idx = predict(word_vec, weight_vectors)
        true_label_idx = np.argmax(label_vec)
        
        if predicted_label_idx is not true_label_idx:
            weight_vectors[predicted_label_idx] -= word_vec
            weight_vectors[true_label_idx] += word_vec        
        
    return weight_vectors

In [19]:
def calculate_accuracy(word_vectors, label_vectors, weight_vectors):
    all_predictions = [predict(wv, weight_vectors) for wv in word_vectors]
    all_true_labels = [np.argmax(lv) for lv in label_vectors]
    return accuracy_score(all_predictions, all_true_labels)

In [20]:
def run_evaluation(train_word_vecs, train_label_vecs, test_word_vecs, test_label_vecs):
    num_features = len(train_word_vecs[0])
    num_labels = len(train_label_vecs[0])
    
    print('train on train, test on test')
    train_weights = [np.zeros(num_features) for _ in range(num_labels)]
    for epoch in range(4):
        train_weights = train(train_word_vecs, train_label_vecs, train_weights)
        accuracy = calculate_accuracy(test_word_vecs, test_label_vecs, train_weights)
        print(epoch, accuracy)

    print('train on train, test on test, randomize order')
    train_weights = [np.zeros(num_features) for _ in range(num_labels)]

    zipped_samples = zip(train_word_vecs, train_label_vecs)
    random_zipped_samples = sorted(zipped_samples, key=lambda k: random.random())
    train_word_vecs_random, train_label_vecs_random = zip(*random_zipped_samples)

    for epoch in range(4):
        train_weights = train(train_word_vecs_random, train_label_vecs_random, train_weights)
        accuracy = calculate_accuracy(test_word_vecs, test_label_vecs, train_weights)
        print(epoch, accuracy)
        
    print('train on train, test on test, randomize order, each epoch')
    train_weights = [np.zeros(num_features) for _ in range(num_labels)]

    for epoch in range(4):

        zipped_samples = zip(train_word_vecs, train_label_vecs)
        random_zipped_samples = sorted(zipped_samples, key=lambda k: random.random())
        train_word_vecs_random, train_label_vecs_random = zip(*random_zipped_samples)

        train_weights = train(train_word_vecs_random, train_label_vecs_random, train_weights)
        accuracy = calculate_accuracy(test_word_vecs, test_label_vecs, train_weights)
        print(epoch, accuracy)

In [21]:
test_word_vecs, test_label_vecs = prepare_data(corpus.test)
train_word_vecs, train_label_vecs = prepare_data(corpus.train)
run_evaluation(train_word_vecs, train_label_vecs, test_word_vecs, test_label_vecs)

train on train, test on test
0 0.8983333333333333
1 0.8983333333333333
2 0.8983333333333333
3 0.8983333333333333
train on train, test on test, randomize order
0 0.9001960784313725
1 0.9001960784313725
2 0.9001960784313725
3 0.9001960784313725
train on train, test on test, randomize order, each epoch
0 0.9003921568627451
1 0.9016666666666666
2 0.9055882352941177
3 0.9077450980392157


In [22]:
test_word_vecs, test_label_vecs = prepare_data(corpus.test, sentence_to_vector_mapping_fn=sentence_to_vectors_multihot)
train_word_vecs, train_label_vecs = prepare_data(corpus.train, sentence_to_vector_mapping_fn=sentence_to_vectors_multihot)
run_evaluation(train_word_vecs, train_label_vecs, test_word_vecs, test_label_vecs)

train on train, test on test
0 0.8427450980392157
1 0.8394117647058823
2 0.8420588235294117
3 0.8357843137254902
train on train, test on test, randomize order
0 0.8356862745098039
1 0.835
2 0.8281372549019608
3 0.8308823529411765
train on train, test on test, randomize order, each epoch
0 0.8291176470588235
1 0.8442156862745098
2 0.8441176470588235
3 0.8420588235294117


In [33]:
test_word_vecs, test_label_vecs = prepare_data(corpus.test, sentence_to_vector_mapping_fn=super_ouzo_sentence_to_vectors)
train_word_vecs, train_label_vecs = prepare_data(corpus.train, sentence_to_vector_mapping_fn=super_ouzo_sentence_to_vectors)
run_evaluation(train_word_vecs, train_label_vecs, test_word_vecs, test_label_vecs)

train on train, test on test
0 0.9047058823529411
1 0.9047058823529411
2 0.9051960784313725
3 0.9072549019607843
train on train, test on test, randomize order
0 0.9060784313725491
1 0.9033333333333333
2 0.903235294117647
3 0.9023529411764706
train on train, test on test, randomize order, each epoch
0 0.9110784313725491
1 0.9104901960784314
2 0.8787254901960785
3 0.9067647058823529
