In [1]:
import numpy as np
from numba import jit, njit
from nltk import pos_tag
from nltk.cluster import KMeansClusterer, euclidean_distance, cosine_distance
from nltk.corpus import brown, product_reviews_2
import multiprocessing

from DocumentBasedVectorizer import DocumentBasedVectorizer
from ContextBasedVectorizer import ContextBasedVectorizer

In [2]:
target_words = ['abstraction', 'actually', 'add', 'address', 'answer',
                'argument', 'arguments', 'back', 'call', 'car', 'case',
                'cdr', 'computer', 'course', 'dictionary', 'different',
                'evaluator', 'function', 'general', 'got', 'idea', 'kind',
                'lambda', 'machine', 'mean', 'object', 'operator', 'order',
                'pair', 'part', 'particular', 'pattern', 'place', 'problem',
                'process', 'product', 'program', 'reason', 'register',
                'result', 'set', 'simple', 'structure', 'system', 'they',
                'together', 'using', 'variable', 'why', 'zero']

tagged_target_words = pos_tag(target_words)

In [3]:
def evaluate(vectorizer_type, target_words, corpus, tagged=False, weighting='binary', window_size=10, stem=False, lemmatize=False, remove_stopwords=False):
    sum = 0
    for i in range(0, 5):
        if (vectorizer_type == 'document'):
            vectorizer = DocumentBasedVectorizer(target_words=target_words,
                                                 corpus=corpus,
                                                 tagged=tagged,
                                                 weighting=weighting,
                                                 stem=stem,
                                                 lemmatize=lemmatize,
                                                 remove_stopwords=remove_stopwords,
                                                 for_evaluation=True)
        else:
            vectorizer = ContextBasedVectorizer(target_words=target_words,
                                                corpus=corpus,
                                                tagged=tagged,
                                                stem=stem,
                                                lemmatize=lemmatize,
                                                remove_stopwords=remove_stopwords,
                                                window_size=window_size,
                                                for_evaluation=True)

        vectors = vectorizer.vectorize()
        clusterer = KMeansClusterer(50, euclidean_distance, avoid_empty_clusters=True)
        clusterer.cluster_vectorspace(list(vectors.values()))

        appended = []
        for word in target_words:
            appended.append(word)

        for word in target_words:
            appended.append(word[::-1])

        vectors = list(vectors.values())
        correct = 0
        result = []
        for i in range(0, len(target_words)):
            word_vector = vectors[i]
            reversed_word_vector = vectors[i + len(target_words)]
            result.append([{appended[i]: clusterer.classify(word_vector)}, {appended[i + len(target_words)]: clusterer.classify(reversed_word_vector)}])
            if (clusterer.classify(word_vector) == clusterer.classify(reversed_word_vector)):
                correct += 1

        sum += (correct / len(target_words)) * 100
    
    return sum / 5

In [4]:
jit_evaluate = jit()(evaluate)

In [6]:
output = {}
for window_size in range(2, 11):
    output[window_size] = str(jit_cluster_and_evaluate(window_size)) + '%'

output
# pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
# manager = multiprocessing.Manager()
# evaluation_result = manager.dict()

# for window_size in range(2, 10):
#     pool.apply_async(cluster_and_evaluate, args=(window_size, evaluation_result))

# pool.close()
# pool.join()
    
# evaluation_result

{2: '64.0%',
 3: '46.0%',
 4: '36.0%',
 5: '32.0%',
 6: '32.0%',
 7: '28.000000000000004%',
 8: '26.0%',
 9: '26.0%'}

In [None]:
for window_size in range(2, 11):
    average_accuracy = jit_evaluate(vectorizer_type='context',
                                    target_words=target_words,
                                    corpus=brown,
                                    window_size=window_size)
    print('(context based, brown corpus, window size = ' + window_size + 'words) average accuracy = ' + str(average_accuracy) + '%')

Compilation is falling back to object mode WITH looplifting enabled because Function "evaluate" failed type inference due to: Untyped global name 'DocumentBasedVectorizer': Cannot determine Numba type of <class 'type'>

File "<ipython-input-3-3a156c67b650>", line 5:
def evaluate(vectorizer_type, target_words, corpus, tagged=False, weighting='binary', window_size=10, stem=False, lemmatize=False, remove_stopwords=False):
    <source elided>
        if (vectorizer_type == 'document'):
            vectorizer = DocumentBasedVectorizer(target_words=target_words,
            ^

  def evaluate(vectorizer_type, target_words, corpus, tagged=False, weighting='binary', window_size=10, stem=False, lemmatize=False, remove_stopwords=False):
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "evaluate" failed type inference due to: Cannot determine Numba type of <class 'numba.core.dispatcher.LiftedLoop'>

File "<ipython-input-3-3a156c67b650>", line 3:
def evaluate(