In [None]:
import gensim
import json
import os
import csv

#
# Train Model
#

# Path of the CSV file with the training sequences; train_model passes it to
# load_data. Left empty here and must be filled in before running.
TRAIN_FILE = ""
# Path prefix for the saved models; train_model appends the embedding size
# (str(dim)) to it to build each model's file name.
OUTPUT_MODEL_FILE = ""

def load_data(train_file):
    """Read a CSV file and return its records as a list of rows.

    Args:
        train_file: Path of the CSV file to read, or None.

    Returns:
        A list with one entry per CSV record, each entry being the list of
        that record's field strings, in file order. An empty list is
        returned when ``train_file`` is None.

    Raises:
        OSError: If the file cannot be opened.
    """
    if train_file is None:
        # Return an empty list rather than the int 0: callers iterate over
        # the result, and iterating over an int raises TypeError.
        return []
    # newline='' is what the csv module asks for so that it can handle line
    # endings (including newlines inside quoted fields) itself.
    with open(train_file, newline='') as csvfile:
        return list(csv.reader(csvfile))

def train_model(dimensions):
    """Train one Word2Vec model per embedding size and save each to disk.

    The training sequences are read from TRAIN_FILE with load_data. Each
    model is saved to ``OUTPUT_MODEL_FILE + str(dim)``.

    Args:
        dimensions: Iterable of embedding sizes; one model is trained and
            saved for every entry. Nothing is loaded or trained when it is
            empty.
    """
    # Iterate over the argument itself. The loop used to read a module-level
    # `dims` name, so the parameter was ignored and the function only worked
    # when that global happened to exist.
    dimensions = list(dimensions)
    if not dimensions:
        return

    # The training data does not depend on the embedding size, so read the
    # file once instead of once per model.
    sentences = load_data(TRAIN_FILE)

    for dim in dimensions:
        # NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 renamed
        # it to `vector_size`); kept because the rest of this file uses the
        # 3.x API -- confirm the installed gensim version.
        model = gensim.models.Word2Vec(
            sentences, size=dim, window=5, min_count=5, workers=10
        )
        model.save(OUTPUT_MODEL_FILE + str(dim))
    
if __name__ == '__main__':
    # Embedding sizes to sweep: 10, 20, ..., 100.
    dims = list(range(10, 101, 10))
    train_model(dims)
    print("Done")
        

In [None]:
import gensim
import json
import os
import csv

#
# Evaluate Model
#

# Path of the CSV file with the test sequences, read via load_data.
TEST_FILE = ""
# Path handed to Word2Vec.load; evaluate_accuracy appends str(dim) to it,
# while the MRR and recall evaluations load it unchanged.
model_path = ""

# ind  - index, within each test row, of the item being evaluated
# prev - index of the item just before it (the query item for accuracy/MRR)
# maxl - minimum row length for index `ind` to exist
ind = 5
prev, maxl = ind - 1, ind + 1

def evaluate_accuracy(dimensions):
    """Print the top-1 next-item accuracy of the saved model of each size.

    For every embedding size the model at ``model_path + str(dim)`` is
    loaded. For each test row long enough to contain index ``ind``, the item
    at ``prev`` is the query and the item at ``ind`` is the expected answer;
    a hit is counted when the single most similar item equals the expected
    one.

    Every row of the test file counts towards the denominator, including
    rows that are too short or whose query item is not in the model's
    vocabulary.

    Args:
        dimensions: Iterable of embedding sizes to evaluate. Nothing is
            loaded or printed when it is empty.
    """
    # Iterate over the argument itself; the loop used to read a module-level
    # `dims` name and ignore the parameter.
    dimensions = list(dimensions)
    if not dimensions:
        return

    # The test rows are the same for every model, so read them only once.
    lines = load_data(TEST_FILE)

    for dim in dimensions:
        model = gensim.models.Word2Vec.load(model_path + str(dim))
        positive = 0.0
        total = 0
        for line in lines:
            total += 1
            if len(line) < maxl:
                continue
            word = line[prev]
            actual = line[ind]
            if word not in model.wv.vocab:
                continue
            # The first result is the most similar item to the query.
            guess, _ = model.wv.most_similar(positive=word)[0]
            if guess == actual:
                positive += 1

        # An empty test set would otherwise raise ZeroDivisionError.
        accuracy = positive / total if total else 0.0

        print("Model size: %d" % dim)
        print("Embedding 1N:-- %.8f" % accuracy)

          
def evaluate_recommended_MRR(num):
    """Print the mean reciprocal rank of the expected item in the top ``num``.

    The model is loaded from ``model_path``. For each test row long enough
    to contain index ``ind``, the item at ``prev`` is the query and the item
    at ``ind`` is the expected answer. When the expected item appears among
    the ``num`` most similar items, ``1 / rank`` (rank starting at 1) is
    added to the score.

    Every row of the test file counts towards the denominator, including
    rows that are too short or whose query item is not in the vocabulary.

    Args:
        num: How many of the most similar items to consider per query.
    """
    # NOTE(review): this loads `model_path` as-is, whereas evaluate_accuracy
    # loads `model_path + str(dim)` -- confirm which model file is intended.
    model = gensim.models.Word2Vec.load(model_path)
    lines = load_data(TEST_FILE)
    total = 0
    mrr = 0.0
    for line in lines:
        total += 1
        if len(line) < maxl:
            continue
        word = line[prev]
        actual = line[ind]
        if word not in model.wv.vocab:
            continue
        # Request `num` neighbours explicitly. most_similar returns only 10
        # by default, so slicing its default result with [:num] never gave
        # more than 10 candidates.
        candidates = model.wv.most_similar(positive=word, topn=num)
        for rank, (guess, _) in enumerate(candidates, start=1):
            if guess == actual:
                mrr += 1.0 / rank
                # Each item appears at most once in the ranking.
                break

    # An empty test set would otherwise raise ZeroDivisionError.
    mrr = mrr / total if total else 0.0
    print("Embedding Recommender MRR:-- %.8f" % mrr)
                     
def evaluate_recall(num):
    """Print the mean overlap between the top ``num`` guesses and the next items.

    The model is loaded from ``model_path``. For each test row with at least
    ``maxl + num`` items, the item at ``ind`` is the query and the ``num``
    items that follow it (``line[maxl:maxl + num]``) are the reference set.
    The row's score is the number of distinct items shared by the ``num``
    most similar items and the reference set, divided by ``num``.

    Every row of the test file counts towards the denominator, including
    rows that are too short or whose query item is not in the vocabulary.

    Args:
        num: Number of guesses to request and number of following items to
            compare against. Must be at least 1.

    Raises:
        ValueError: If ``num`` is less than 1.
    """
    if num < 1:
        raise ValueError("num must be at least 1, got %r" % (num,))

    # NOTE(review): this loads `model_path` as-is, whereas evaluate_accuracy
    # loads `model_path + str(dim)` -- confirm which model file is intended.
    model = gensim.models.Word2Vec.load(model_path)
    lines = load_data(TEST_FILE)
    total = 0
    recall = 0.0
    for line in lines:
        total += 1
        if len(line) < maxl + num:
            continue
        word = line[ind]
        rest_row = line[maxl:maxl + num]
        if word not in model.wv.vocab:
            continue
        # Request `num` neighbours explicitly. The previous code sliced the
        # default result (10 items) with a hard-coded [0:20] and divided by
        # 20.0, so `num` was ignored and at most 10 guesses were compared.
        guesses = {
            guess for guess, _ in model.wv.most_similar(positive=word, topn=num)
        }
        recall += len(guesses & set(rest_row)) / float(num)

    # An empty test set would otherwise raise ZeroDivisionError.
    recall = recall / total if total else 0.0
    print("Embedding Recall:-- %.8f" % recall)

if __name__ == '__main__':
    # Embedding sizes whose saved models are evaluated: 10, 20, ..., 100.
    dims = list(range(10, 101, 10))
    evaluate_accuracy(dims)

    # Ranking metrics computed over the top 20 most similar items.
    top_n = 20
    evaluate_recommended_MRR(top_n)
    evaluate_recall(top_n)