In [1]:
with open(file='/kaggle/input/moby-dick-herman-melville/melville-moby_dick.txt', mode='r', encoding='utf-8', ) as input_fp:
    data = input_fp.read()

Our text is oddly formatted; we need to parse it into sentences to use gensim. So we use a sentence parser from spacy.

In [2]:
from arrow import now
from spacy import load
from os.path import exists
time_start = now()
outfile = '/kaggle/working/moby-dick-formatted.txt'
if exists(path=outfile):
    with open(file=outfile, mode='r', encoding='utf-8') as input_fp:
        documents = input_fp.readlines()
else:
    spacy_model = load('en_core_web_sm')
    spacy_model.max_length = 1200000
    spacy_result = spacy_model(data.replace('\n', ' '))
    documents = [item.text for item in spacy_result.sents]
    with open(file=outfile, mode='w', encoding='utf-8') as output_fp:
        for document in documents:
            print(document, file=output_fp)
print(now() - time_start)

0:00:56.826030


In [3]:
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import preprocess_documents
# todo figure out how to remove the stemmer
texts = preprocess_documents(docs=documents)
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
print(dictionary)

Dictionary<10228 unique tokens: ['chapter', 'loom', 'ishmael', 'ago', 'have']...>


In [4]:
# this determines our runtime and also helps determine how many low-frequency tokens we keep
MAX_VOCAB_SIZE = 5000

In [5]:
from gensim.models import Word2Vec
time_start = now()
word2vec_model = Word2Vec(sentences=texts, vector_size=100, window=5, min_count=1, workers=4, seed=2023, max_vocab_size=MAX_VOCAB_SIZE)
print('vocabulary size: {}'.format(len(word2vec_model.wv)))
print(now() - time_start)

vocabulary size: 3913
0:00:00.560897


In [6]:
word2vec_model.wv.most_similar(topn=10, positive=['air'])

[('natur', 0.9998328685760498),
 ('gener', 0.9998181462287903),
 ('wild', 0.9998044967651367),
 ('half', 0.9998036623001099),
 ('place', 0.9997857213020325),
 ('heart', 0.9997849464416504),
 ('came', 0.9997835755348206),
 ('like', 0.9997822046279907),
 ('mark', 0.9997820854187012),
 ('world', 0.9997813701629639)]

In [7]:
from pandas import DataFrame
from sklearn.manifold import TSNE
time_start = now()
init = ['pca', 'random'][1] # choose this to see different shapes
tsne = TSNE(random_state=2023, n_iter=1000, verbose=1, init='random')
tsne_result = tsne.fit_transform(X=word2vec_model.wv.vectors)
tsne_df = DataFrame(data=tsne_result, columns=['x', 'y'])
tsne_df['word'] = list(word2vec_model.wv.key_to_index.keys())
tsne_df['weight'] = tsne_df['word'].apply(func=lambda x: word2vec_model.wv[x].sum())
print(now() - time_start)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 3913 samples in 0.002s...
[t-SNE] Computed neighbors for 3913 samples in 0.345s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3913
[t-SNE] Computed conditional probabilities for sample 2000 / 3913
[t-SNE] Computed conditional probabilities for sample 3000 / 3913
[t-SNE] Computed conditional probabilities for sample 3913 / 3913
[t-SNE] Mean sigma: 0.020899
[t-SNE] KL divergence after 250 iterations with early exaggeration: 61.295914
[t-SNE] KL divergence after 1000 iterations: 1.123957
0:00:19.241210


In [8]:
import pandas as pd
from plotly.express import scatter
scatter(data_frame=tsne_df, x='x', y='y', hover_name='word', color='weight')

Not surprisingly our TSNE model tracks our vector weights.

In [9]:
from sklearn.cluster import KMeans
N_CLUSTERS = 50
kmeans_model = KMeans(n_clusters=N_CLUSTERS, verbose=0, max_iter=1000, random_state=2023, n_init='auto')
kmeans_result = kmeans_model.fit_transform(X=word2vec_model.wv.vectors)
kmeans_df = tsne_df.copy()
kmeans_df['cluster'] = kmeans_model.labels_
scatter(data_frame=kmeans_df, x='x', y='y', hover_name='word', color='cluster')