In [1]:
import fasttext
import math
import pandas as pd

These functions for assessing word-vector similarity are adapted from [these worked examples](https://medium.com/swlh/playing-with-word-vectors-308ab2faa519), with the type-checking cruft removed; there are likely quicker ways to compute them!

In [2]:
def vector_len(v):
    """Return the Euclidean length of ``v``: the square root of the sum of its squared components."""
    squared_components = (component * component for component in v)
    return math.sqrt(sum(squared_components))

def dot_product(v1, v2):
    """Return the sum of the element-wise products of ``v1`` and ``v2``.

    Both vectors must have the same length; an AssertionError is raised
    otherwise.
    """
    assert len(v1) == len(v2)
    pairwise_products = (a * b for a, b in zip(v1, v2))
    return sum(pairwise_products)

def cosine_similarity(v1, v2):
    """
    Returns the cosine of the angle between the two vectors.
    Results range from -1 (very different) to 1 (very similar).

    The cosine is undefined when either vector has zero length. In that
    case 0.0 is returned rather than dividing by zero, so that callers
    which sort on the result never see an exception, nan or inf.

    Raises AssertionError (from dot_product) if the vectors differ in length.
    """
    # Compute the dot product first so the length check always runs,
    # even when one of the vectors turns out to be all zeros.
    numerator = dot_product(v1, v2)
    denominator = vector_len(v1) * vector_len(v2)
    if denominator == 0:
        return 0.0
    return numerator / denominator

def sorted_by_similarity(model, base_vector):
    """Rank every word in ``model.words`` by cosine similarity to ``base_vector``.

    Returns a list of ``(similarity, word)`` tuples, most similar first.
    """
    scored_words = []
    for candidate in model.words:
        candidate_vector = model.get_word_vector(candidate)
        scored_words.append((cosine_similarity(base_vector, candidate_vector), candidate))
    # Higher cosine similarity (closer to 1) means more alike, so sort descending.
    scored_words.sort(key=lambda pair: pair[0], reverse=True)
    return scored_words

def related(model, text, limit=20):
    """Print the words in ``model`` whose vectors are most similar to that of ``text``.

    The query vector is ``model[text]``. Words that equal ``text`` when
    compared case-insensitively are left out, and the remaining words are
    printed on one line, comma-separated, most similar first.

    Parameters:
        model: object providing ``words``, ``get_word_vector`` and item lookup
            by word (used here with the fastText model trained in this notebook).
        text: the query word.
        limit: maximum number of words to print (default 20); ``None`` prints
            every word.
    """
    # Lower-case the query once instead of once per vocabulary word.
    query = text.lower()
    neighbours = [
        word for (_similarity, word) in
            sorted_by_similarity(model, model[text])
            if word.lower() != query
        ]
    print(', '.join(neighbours[:limit]))
    


In [None]:
# Dump the phrases to a plain-text file, one per line, and train
# skip-gram word vectors on that file.
data = pd.read_csv('output/results.csv')
# Missing phrases are read as NaN (a float), which would make str.join raise
# TypeError, so drop them and make sure every remaining entry is a string.
phrases = data['phrase'].dropna().astype(str)
# fastText expects UTF-8 input; do not rely on the platform default encoding.
with open('phrases.txt', 'w', encoding='utf-8') as text:
    text.write('\n'.join(phrases))

model = fasttext.train_unsupervised('phrases.txt', model='skipgram')

In [None]:
# Write the trained model to microbial_mats.bin (path is relative to the working directory).
model.save_model("microbial_mats.bin")

Given a word from the phrase collection, find others that have similar vectors. 

This is almost certainly restating the obvious, but you can see how we might combine the topic clusters and the word vectors to dig around a much larger corpus, compare distances between geochemical terms and different concepts, or compare how those distances differ between phrase collections that are each pinned to a geographic area. There is likely to be a zone of related-but-not-too-related concepts that could turn up interesting things.

In [None]:
# Print the vocabulary words whose vectors are most similar to 'microbialaminites'.
related(model, 'microbialaminites')