# 5. Historical word embeddings

In [None]:
import glob
from collections import Counter

import numpy as np
import pandas as pd

from gensim.models import FastText

To make the embeddings more reproducible (although only partially so), we fix the random seeds:

In [None]:
# Seed NumPy's global random number generator.
np.random.seed(76352)
# IPython magic: sets PYTHONHASHSEED in this process's environment.
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
# presumably only affects processes launched afterwards and not the hash
# ordering of the already-running kernel -- confirm.
%env PYTHONHASHSEED=76352

## Diachronic word embeddings

In [None]:
# Load one DataFrame per era. File names are sorted so that the eras end up
# in a stable order; each path is echoed right before it is read.
era_files = sorted(glob.glob('../data/eras_normalized/*.csv'))

eras = []
for fn in era_files:
    print(fn)
    era_frame = pd.read_csv(fn)
    eras.append(era_frame)

In [None]:
# --- Notebook-wide settings -------------------------------------------------

# Random state handed to the PCA / t-SNE projections.
seed = 12345

# How many of the most frequent items to take from the base era.
mfi = 250
# How many nearest neighbours to retrieve per era model.
num_era_neighbors = 100

# NOTE(review): the settings below are not used in the visible cells;
# ft_size is presumably the FastText vector size -- confirm.
ft_size = 100
num_clust = 8

# Output locations (presumably for figures and saved models -- confirm).
fig_dir, model_dir = 'output', 'models'

In [None]:
# Two projection objects sharing the same random state: a 30-component PCA
# and a 2-component t-SNE.
# NOTE(review): PCA and TSNE are not imported in any visible cell --
# presumably sklearn.decomposition.PCA and sklearn.manifold.TSNE; confirm
# and add the imports.
pca, tsne = (
    PCA(n_components=30, random_state=seed),
    TSNE(n_components=2, random_state=seed),
)

The model of the last era serves as the base model; all other eras are aligned to it.

In [None]:
# Frequency table of the normalized tokens in the last (base) era.
base_cnt = Counter(eras[-1]['normalized'])

# NOTE: `mfi` goes in as the requested number of items and is rebound to the
# list of those items, so this cell is meant to run once after the parameter
# cell has (re)set `mfi` to an integer.
top_items = base_cnt.most_common(mfi)
mfi = [pair[0] for pair in top_items]
print('Most frequent items (base model):', mfi)

In [None]:
def extract_windows(era, window_size=None):
    """Cut an era's token sequence into consecutive, non-overlapping windows.

    Parameters
    ----------
    era : mapping or DataFrame
        Object whose ``'normalized'`` entry is the sequence of tokens.
    window_size : int, optional
        Number of tokens per window. When omitted, the notebook-level
        ``rnd_window_size`` is used, as before.

    Returns
    -------
    list of tuple
        Full windows only, in order; a trailing remainder shorter than
        ``window_size`` is dropped.

    Raises
    ------
    ValueError
        If the window size is not positive (the previous loop would never
        have terminated in that case).
    """
    if window_size is None:
        # Looked up at call time so the notebook-level setting still applies.
        window_size = rnd_window_size
    if window_size <= 0:
        raise ValueError('window size must be positive, got %r' % (window_size,))

    tokens = tuple(era['normalized'])
    # Last start index that still leaves room for a complete window.
    last_start = len(tokens) - window_size
    return [tokens[start:start + window_size]
            for start in range(0, last_start + 1, window_size)]

In [None]:
# Train one FastText model per era on that era's token windows.
# NOTE(review): min_count, vector_size and n_epochs are not assigned in any
# visible cell (the parameter cell only defines ft_size = 100, which looks
# like the intended vector size) -- confirm they are defined before running.
models = []
for era in eras:
    windows = extract_windows(era)
    print(len(windows))  # number of training windows for this era
    model = FastText(
        sentences=windows,
        vector_size=vector_size,
        min_count=min_count,
        epochs=n_epochs,
        seed=65973273,
    )
    models.append(model)

In [None]:
# Replace every era model except the last one by the result of aligning it
# to the last (base) model; the base model itself is left untouched.
last_idx = len(models) - 1
for idx, model in enumerate(models):
    print(idx)
    if idx != last_idx:
        models[idx] = procrustes.smart_procrustes_align_gensim(models[-1], model)

In [None]:
# Target word whose neighbourhood is traced across the era models.
trg = 'valuable'

# Collect the nearest neighbours of the target in every era model.
neighborhoods = []
for model in models:
    neighbors = model.wv.most_similar(trg, topn=num_era_neighbors)
    print(neighbors)
    # keep only the words, drop the similarity scores
    neighbors = [w for w, _ in neighbors]
    neighborhoods.append(neighbors)

# Flatten and de-duplicate. The result is sorted because plain set iteration
# order for strings depends on hash randomisation and may differ between
# interpreter launches; since this tuple fixes the row order of the matrix
# that is projected below, a stable order keeps the projection reproducible.
all_neighbors = tuple(sorted({w for hood in neighborhoods for w in hood}))
all_neighbors

In [None]:
# Start the data matrix with the base (last) model's vector for every word of
# the flattened neighbourhood; labels[i] names the word behind X[i].
labels = list(all_neighbors)
X = [models[-1].wv[word] for word in labels]

In [None]:
# Append the target word's vector from every era model and remember the row
# each one lands on, so the eras can later be connected with arrows.
# NOTE: this extends X and labels in place, so re-running the cell without
# rebuilding them first adds the target rows a second time.
arrow_idxs = []
for era_no, model in enumerate(models, start=1):
    arrow_idxs.append(len(labels))  # row index of the entry appended next
    X.append(model.wv[trg])
    labels.append(trg + '\nSP' + str(era_no))

In [None]:
# Stack the collected vectors into a single array (one row per label).
X = np.array(X)
# First reduce with the PCA object configured above ...
pca_X = pca.fit_transform(X)
# ... then map the PCA output to 2 dimensions with t-SNE for plotting.
tsne_X = tsne.fit_transform(pca_X)

In [None]:
# Plot the 2-D projection: neighbourhood words as black text, the per-era
# positions of the target word as red text on a grey disc, and arrows that
# connect consecutive era positions of the target.
fig, ax1 = plt.subplots(figsize=(14, 14))
ax1.axis('equal')

x1, x2 = tsne_X[:, 0], tsne_X[:, 1]
# Invisible scatter: only used to make the axes span all points, the visible
# marks are the text labels added below.
ax1.scatter(x1, x2, edgecolors='none', facecolors='none')

for x, y, w in zip(x1, x2, labels):
    # Target entries were labelled as trg + '\nSP<n>' when they were collected.
    is_target = w.startswith(trg + '\n')
    ax1.text(x, y, w, ha='center', va="center",
             color='red' if is_target else 'black')
    if is_target:
        circle = plt.Circle((x, y), radius=1,
                            facecolor='lightgrey',
                            edgecolor='grey',
                            alpha=.9)
        ax1.add_artist(circle)

# The title and the arrows are drawn once, after all labels. Previously they
# sat inside the label loop and were redrawn for every single label.
plt.title(trg, fontdict={'family': 'Arial', 'size': 32})

# One arrow from each era's target position to the next era's position.
for src, dst in zip(arrow_idxs, arrow_idxs[1:]):
    ax1.annotate('', xy=tsne_X[dst], xytext=tsne_X[src],
                 arrowprops=dict(facecolor='darkgrey', shrink=0.05,
                                 width=2, headwidth=12, edgecolor=None))