In [1]:
from gensim.models.fasttext import load_facebook_vectors
from gensim.test.utils import datapath
import numpy as np
import matplotlib.pyplot as plt
import sklearn.cluster
import scipy.stats
import scipy.spatial.distance
import pandas as pd

# Load word embeddings (FastText model)

In [2]:
## Choose which FastText binary to load.
#
# Options bundled with the gensim package (resolved through `datapath`):
#   datapath("crime-and-punishment.bin")
#   datapath("lee_fasttext.bin")          <- currently selected
#
# Option downloaded from https://fasttext.cc/docs/en/pretrained-vectors.html
# and saved to the "data" directory:
#   "data/wiki/wiki.en.bin"
cap_path = datapath("lee_fasttext.bin")

## Load the embeddings from the selected file.
fbkv = load_facebook_vectors(cap_path)

## Read the wind-terms file one row at a time.
# Every row becomes its own single-row DataFrame: for row k we skip the k rows
# above it and the (n_rows - k - 1) rows below it.
# NOTE(review): presumably this is done because rows hold differing numbers of
# ", "-separated terms, which a single read_csv call would reject -- confirm.
# `skipfooter` (and the multi-character separator) require engine="python".
n_rows = 13  # number of rows expected in wind_terms_no_commas.txt
wind_terms = [
    pd.read_csv(
        "wind_terms_no_commas.txt",
        sep=", ",
        skiprows=row_idx,
        skipfooter=n_rows - row_idx - 1,
        header=None,
        engine="python",
    )
    for row_idx in range(n_rows)
]

#### Look at a few embedding examples

In [7]:
# Compare two phrases against the same reference phrase, first by Pearson
# correlation of the embedding components, then by cosine distance.
# NOTE(review): the lookup keys contain a space; presumably FastText treats each
# as one out-of-vocabulary token assembled from character n-grams rather than
# combining the per-word vectors -- confirm that this is the intended embedding.
reference_vec = fbkv["heavy gale"]
compared_vecs = [fbkv["strong gust"], fbkv["light breeze"]]

print("correlation coefficients")
for phrase_vec in compared_vecs:
    # pearsonr returns (statistic, p-value); only the coefficient is shown.
    print(scipy.stats.pearsonr(phrase_vec, reference_vec)[0])
print()
print("cosine distances")
for phrase_vec in compared_vecs:
    print(scipy.spatial.distance.cosine(phrase_vec, reference_vec))

correlation coefficients
0.8776445185363406
0.9384719648907758

cosine distances
0.11786442995071411
0.06028550863265991


# Make training data by embedding the logbook data
(Below, random vectors stand in for the real embeddings.)

In [8]:
n_samples = 1000
embedding_size = 10 # a.k.a. "n_features"

# Fixed seed so that "Restart Kernel -> Run All" reproduces the same training
# matrix (and therefore the same downstream results) on every run.
RANDOM_SEED = 0

## fasttext: log entries -> embeddings
# Placeholder: standard-normal random vectors of shape
# (n_samples, embedding_size) are used instead of real embeddings.
X_train = np.random.default_rng(RANDOM_SEED).standard_normal(
    (n_samples, embedding_size)
)

# Train clustering model on embedded data

In [9]:
## initialize (untrained) model
# random_state is pinned so that centroid initialisation -- and hence the
# cluster assignments -- are identical on every run for the same X_train.
# NOTE(review): the wind-terms file is read as 13 rows (n_rows = 13) but only
# 9 clusters are requested here -- confirm that 9 is the intended number.
kmeans_model = sklearn.cluster.KMeans(n_clusters=9, n_init="auto", random_state=0)

## fit the model to training data
kmeans_model.fit(X_train)

## cluster the training data
# The result is one integer cluster id in [0, n_clusters) per row of X_train.
# K-means ids are arbitrary labels: they carry no ordering, so they must be
# mapped to wind-force levels explicitly before being read as such.
beaufort_predictions = kmeans_model.predict(X_train)