In [1]:
from gensim.models import FastText
from gensim.models.fasttext import load_facebook_vectors
from gensim.test.utils import datapath
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.decomposition import PCA 
from sklearn.cluster import KMeans 
import numpy as np
import matplotlib.pyplot as plt
import sklearn.cluster
import scipy.stats
import scipy.spatial.distance
import pandas as pd

# Load word embeddings (FastText model)

In [2]:
## specify path for embeddings file

# # these embeddings come with the gensim package
# cap_path = datapath("crime-and-punishment.bin")
cap_path = datapath("lee_fasttext.bin")

# # The following embeddings were downloaded from:
# # https://fasttext.cc/docs/en/pretrained-vectors.html
# # and saved to "data" directory.
# cap_path = "data/wiki/wiki.en.bin" 

## load the embeddings
fbkv = load_facebook_vectors(cap_path)

## load the wind terms
# The file is parsed one line at a time: skiprows/skipfooter isolate line i,
# so every entry of wind_terms is a single-row DataFrame with one column per
# ", "-separated term (presumably because the lines hold different numbers of
# terms -- TODO confirm).
wind_terms_path = "wind_terms_no_commas.txt"
n_rows = 13  # number of lines read from the wind-terms file
wind_terms = [
    pd.read_csv(
        wind_terms_path,
        sep=", ",
        skiprows=i,
        skipfooter=n_rows - (i + 1),
        header=None,
        engine="python",
    )
    for i in range(n_rows)
]

#categories = ["calm", "light air", "light breeze", "gentle breeze", "moderate breeze", "fresh breeze",
    #"strong breeze", "near gale", "gale", "strong gale", "storm", "violent storm", "hurricane"]

## embed the wind terms
# Indexing fbkv with a DataFrame would iterate over the frame's column labels
# (0, 1, 2, ...) rather than its cell values, so the term strings are pulled
# out of each single-row frame before the lookup. Each entry of fbkv_list is
# the stack of vectors for one line of the file.
fbkv_list = [
    fbkv[[str(value) for value in term_frame.iloc[0]]]
    for term_frame in wind_terms
]

# remove all the previous existing rows, make it one big vector:
# a flat list holding one embedding vector per term, in file order
fbkv_vector = [vector for block in fbkv_list for vector in block]

#### Look at a few embedding examples

In [3]:
# Pairs of phrases whose embeddings are compared below.
term_pairs = [("strong gust", "heavy gale"), ("light breeze", "heavy gale")]

# Pearson correlation between the two embedding vectors of each pair.
print("correlation coefficients")
for first_term, second_term in term_pairs:
    correlation = scipy.stats.pearsonr(fbkv[first_term], fbkv[second_term])[0]
    print(correlation)
print()
# Cosine distance between the same vectors.
print("cosine distances")
for first_term, second_term in term_pairs:
    distance = scipy.spatial.distance.cosine(fbkv[first_term], fbkv[second_term])
    print(distance)

correlation coefficients
0.8776445185363406
0.9384719648907758

cosine distances
0.11786442995071411
0.06028550863265991


# Make training data by embedding the logbook data
(below, the training data are the FastText vectors collected in `fbkv_vector` above, not random embeddings)

In [4]:
# Training data for the clustering step: the flat list of embedding vectors
# assembled in the embedding-loading cell (same object, not a copy).
X_train = fbkv_vector

# Train clustering model on embedded data

In [5]:
## initialize (untrained model)
# random_state fixes the centroid initialisation so the cluster labels are the
# same on every Restart & Run All; without it each run may number (and split)
# the clusters differently.
# NOTE(review): 9 clusters are requested while 13 lines of wind terms are
# loaded -- confirm that 9 is the intended number of groups.
kmeans_model = sklearn.cluster.KMeans(n_clusters=9, n_init="auto", random_state=0)

## fit the model to training data
kmeans_model.fit(X_train)

## cluster the training data
# one integer cluster label per training vector
beaufort_predictions = kmeans_model.predict(X_train)

beaufort_predictions

array([6, 3, 8, ..., 3, 2, 5], dtype=int32)

# Plot the cluster