In [2]:
import glob
import re
import pandas as pd
import spacy
import os
import gensim
from gensim.models.word2vec import Word2Vec
import itertools 

In [3]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [18]:
import numpy as np
import gensim
from gensim.models.keyedvectors import KeyedVectors
from scipy.spatial.distance import cosine as cosine_distance

In [19]:
def smart_procrustes_align_gensim(base_embed, other_embed, words=None):
    """Procrustes-align ``other_embed`` onto ``base_embed`` so that the same word
    can be compared across the two models.

    Code ported from HistWords <https://github.com/williamleif/histwords> by
    William Hamilton <wleif@stanford.edu>. (With help from William. Thank you!)

    Steps:
      1. Make sure both models carry an L2-normalised matrix (``syn0norm``).
      2. Intersect the vocabularies via ``intersection_align_gensim`` so that row
         i refers to the same word in both models (optionally restricted to
         ``words``).
      3. Compute the orthogonal matrix from the SVD of ``other.T @ base``.
      4. Replace ``other_embed.syn0`` and ``other_embed.syn0norm`` with the
         rotated vectors.

    Side effects: both models are modified in place. Their vocabularies and
    matrices are reduced to the shared words by step 2, and ``other_embed`` is
    additionally rotated.

    Args:
        base_embed: model that defines the target space (not rotated).
        other_embed: model whose vectors are rotated onto ``base_embed``.
        words: optional list/set; when given, only these words are kept.

    Returns:
        ``other_embed`` (the same object that was passed in).
    """
    # A model that was just loaded has no normalised matrix yet (syn0norm is
    # None until init_sims() runs), and everything below reads syn0norm.
    # Computing it here removes the hidden requirement that a similarity query
    # must have been executed on both models before aligning.
    for embed in (base_embed, other_embed):
        if getattr(embed, 'syn0norm', None) is None:
            embed.init_sims()

    # make sure vocabulary and indices are aligned
    in_base_embed, in_other_embed = intersection_align_gensim(base_embed, other_embed, words=words)

    # get the (row-aligned, normalised) embedding matrices
    base_vecs = in_base_embed.syn0norm
    other_vecs = in_other_embed.syn0norm

    # cross-covariance between the two spaces
    m = other_vecs.T.dot(base_vecs)
    # np.linalg.svd returns (U, singular values, V transposed)
    u, _, v = np.linalg.svd(m)
    # orthogonal matrix built from the two SVD factors
    ortho = u.dot(v)
    # Replace original arrays with the rotated ones; syn0 and syn0norm end up
    # pointing at the same array.
    other_embed.syn0norm = other_embed.syn0 = (other_embed.syn0norm).dot(ortho)
    return other_embed
	
def intersection_align_gensim(m1,m2, words=None):
    """
    Intersect two gensim word2vec models, m1 and m2, in place.

    Only the shared vocabulary between them is kept.
    If 'words' is set (as list or set), then the vocabulary is intersected with this list as well.
    Indices are re-organized from 0..N in order of descending frequency (=sum of counts from both m1 and m2).
    These indices correspond to the new syn0 and syn0norm objects in both gensim models:
        -- so that Row 0 of m1.syn0 will be for the same word as Row 0 of m2.syn0
        -- you can find the index of any word on the .index2word list: model.index2word.index(word) => 2
    The .vocab dictionary is also updated for each model, preserving the count but updating the index.

    Note that after this call syn0 and syn0norm of each model are the same
    array, built from the rows of the previous syn0norm.

    Returns the tuple (m1, m2) -- the same objects that were passed in.
    """

    # Get the vocab for each model
    vocab_m1 = set(m1.vocab.keys())
    vocab_m2 = set(m2.vocab.keys())

    # Find the common vocabulary
    common_vocab = vocab_m1 & vocab_m2
    if words:
        common_vocab &= set(words)

    # Skip the rebuild only when nothing has to change: both models hold exactly
    # the common vocabulary AND every word sits on the same row in both.
    # Identical vocabulary sets alone are not enough, because two models can
    # store the same words in different row order.
    same_vocab = not vocab_m1 - common_vocab and not vocab_m2 - common_vocab
    if same_vocab and all(m1.vocab[w].index == m2.vocab[w].index for w in common_vocab):
        return (m1,m2)

    # Otherwise sort by frequency (summed for both)
    common_vocab = sorted(common_vocab, key=lambda w: m1.vocab[w].count + m2.vocab[w].count, reverse=True)

    # Then for each model...
    for m in [m1,m2]:
        # Replace old syn0norm array with new one (rows picked in common-vocab order)
        indices = [m.vocab[w].index for w in common_vocab]
        new_arr = m.syn0norm[indices]
        m.syn0norm = m.syn0 = new_arr

        # Replace old index2word with a private copy of the new ordering, so a
        # later change to one model's list cannot leak into the other model.
        m.index2word = list(common_vocab)

        # Replace old vocab dictionary with new one (same counts, new indices)
        old_vocab = m.vocab
        m.vocab = {
            word: gensim.models.word2vec.Vocab(index=new_index, count=old_vocab[word].count)
            for new_index, word in enumerate(common_vocab)
        }

    return (m1,m2)

In [89]:
# Load the saved KeyedVectors for three consecutive time slices (1950-54,
# 1955-59 and 1960-64 according to the file names). Paths are relative to the
# notebook's working directory.
embeddings_t0 = KeyedVectors.load('embeddings/year1950_1954')
embeddings_t1 = KeyedVectors.load('embeddings/year1955_1959')
embeddings_t2 = KeyedVectors.load('embeddings/year1960_1964')

2018-11-20 15:51:59,437 : INFO : loading Word2VecKeyedVectors object from embeddings/year1950_1954
2018-11-20 15:52:00,680 : INFO : loading vectors from embeddings/year1950_1954.vectors.npy with mmap=None
2018-11-20 15:52:01,038 : INFO : setting ignored attribute vectors_norm to None
2018-11-20 15:52:01,051 : INFO : loaded embeddings/year1950_1954
2018-11-20 15:52:01,057 : INFO : loading Word2VecKeyedVectors object from embeddings/year1955_1959
2018-11-20 15:52:02,731 : INFO : loading vectors from embeddings/year1955_1959.vectors.npy with mmap=None
2018-11-20 15:52:03,054 : INFO : setting ignored attribute vectors_norm to None
2018-11-20 15:52:03,055 : INFO : loaded embeddings/year1955_1959
2018-11-20 15:52:03,055 : INFO : loading Word2VecKeyedVectors object from embeddings/year1960_1964
2018-11-20 15:52:04,945 : INFO : loading vectors from embeddings/year1960_1964.vectors.npy with mmap=None
2018-11-20 15:52:05,477 : INFO : setting ignored attribute vectors_norm to None
2018-11-20 15:5

In [90]:
embeddings_t0.most_similar('efficiency', topn=10)

2018-11-20 15:52:06,168 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('opvoering_productiviteit', 0.6525278091430664),
 ('specialisatie', 0.6410766839981079),
 ('arbeidsmethoden', 0.625503420829773),
 ('scanlonplan', 0.6220992207527161),
 ('contactgroep', 0.6033715009689331),
 ('vakopleiding', 0.5976119637489319),
 ('researchwerk', 0.5959469676017761),
 ('productiviteitsbevordering', 0.5956337451934814),
 ('bedrijfseconomische', 0.592207670211792),
 ('sociologisch', 0.5915866494178772)]

In [123]:
embeddings_t1.most_similar('verlegen', topn=10)

  if np.issubdtype(vec.dtype, np.int):


[('ninette', 0.6749332547187805),
 ('wildvreemde', 0.6445732116699219),
 ('buiging', 0.6306825876235962),
 ('duifje', 0.6290490627288818),
 ('agnese', 0.6255329251289368),
 ('halfgesloten', 0.6245224475860596),
 ('timothy', 0.6174279451370239),
 ('bevend', 0.6103349328041077),
 ('aude', 0.6077667474746704),
 ('fluisterde', 0.6060428619384766)]

In [92]:
# Cosine distance between the two periods' vectors for the same word, measured
# BEFORE the Procrustes alignment cell is run (baseline for the later comparison).
cosine_distance(embeddings_t0['efficiency'], embeddings_t1['efficiency'])

1.0033900181297213

In [93]:
np.linalg.norm(embeddings_t0['efficiency'] - embeddings_t1['efficiency'])

3.26904

In [94]:
embeddings_t1.similar_by_vector(embeddings_t0['efficiency'] - embeddings_t0['mechanisatie']  + embeddings_t1['efficiency'])

  if np.issubdtype(vec.dtype, np.int):


[('efficiency', 0.6797057390213013),
 ('produktiviteitsverhoging', 0.40548431873321533),
 ('bedrijfsvoering', 0.3934084475040436),
 ('specialisatie', 0.38317373394966125),
 ('kunnen_bijdragen', 0.38296619057655334),
 ('rentabiliteit', 0.38255566358566284),
 ('investeringsplannen', 0.38196367025375366),
 ('kostprijs', 0.3814176917076111),
 ('rationalisatie', 0.37920260429382324),
 ('bijdraagt', 0.3788200616836548)]

In [95]:
embeddings_t0.syn0.shape

  """Entry point for launching an IPython kernel.


(148610, 300)

In [96]:
embeddings_t1.syn0.shape

  """Entry point for launching an IPython kernel.


(193355, 300)

In [97]:
# Rotate the 1955-59 space onto the 1950-54 space.
# NOTE: this modifies BOTH models in place -- their vocabularies and matrices
# are cut down to the shared words -- so cells run before this one give
# different results if re-run afterwards.
embeddings_t1 = smart_procrustes_align_gensim(embeddings_t0, embeddings_t1)

  app.launch_new_instance()


In [98]:
embeddings_t0.most_similar('efficiency', topn=10)

  if np.issubdtype(vec.dtype, np.int):


[('specialisatie', 0.6410766839981079),
 ('arbeidsmethoden', 0.625503420829773),
 ('contactgroep', 0.6033715009689331),
 ('vakopleiding', 0.5976119637489319),
 ('researchwerk', 0.5959469676017761),
 ('bedrijfseconomische', 0.592207670211792),
 ('sociologisch', 0.5915866494178772),
 ('kuylaars', 0.5900152325630188),
 ('isonevo', 0.587445855140686),
 ('arbeidsverdeling', 0.5862185955047607)]

In [99]:
embeddings_t1.most_similar('efficiency', topn=10)

  if np.issubdtype(vec.dtype, np.int):


[('bedrijfsvoering', 0.5882034301757812),
 ('specialisatie', 0.5584152936935425),
 ('mechanisering', 0.5562975406646729),
 ('conjunctureel', 0.5534877777099609),
 ('rationalisatie', 0.5502891540527344),
 ('wollenstoffenindustrie', 0.5411832332611084),
 ('zelffinanciering', 0.5397161841392517),
 ('nive', 0.5385679006576538),
 ('afremming', 0.530324399471283),
 ('doelmatig', 0.5275052189826965)]

In [100]:
# Same cross-period cosine distance as measured earlier, now AFTER alignment.
cosine_distance(embeddings_t0['efficiency'], embeddings_t1['efficiency'])

0.5153142809867859

In [101]:
np.linalg.norm(embeddings_t0['efficiency'] - embeddings_t1['efficiency'])

1.015199

In [102]:
embeddings_t1.similar_by_vector(embeddings_t0['efficiency'] - embeddings_t0['mechanisatie']  + embeddings_t1['efficiency'])

  if np.issubdtype(vec.dtype, np.int):


[('efficiency', 0.6923547983169556),
 ('rdinerend', 0.509129524230957),
 ('bedrijfspsychologie', 0.4966466426849365),
 ('beroepskeuzewerk', 0.4787091910839081),
 ('nive', 0.4739752411842346),
 ('salari', 0.46876242756843567),
 ('voorlichtende', 0.46149322390556335),
 ('rdineren', 0.4612298309803009),
 ('overheidsbeleid', 0.4576718807220459),
 ('vragenlijst', 0.45497041940689087)]

In [103]:
# Word list (Dutch terms) used as one group in the variance and word-count cells.
christianity = ['doop',
                'messias', 
                'katholicisme', 
                'christendom', 
                'verlossing',
                'protestant', 
                'jezus', 
                'christus', 
                'kerk']

In [104]:
# Variance over all components of the word list's vectors, one value per period.
# Presumably indexing a model with a list returns the stacked vectors -- TODO confirm.
# NOTE(review): embeddings_t2 is never passed through smart_procrustes_align_gensim
# in the visible cells, so unlike t0/t1 its vectors are neither reduced to the
# shared vocabulary nor replaced by normalised ones; its value is not directly
# comparable with the other two.
print(embeddings_t0[christianity].var())
print(embeddings_t1[christianity].var())
print(embeddings_t2[christianity].var())

0.0033330207
0.0033333316
0.026532503


In [105]:
# Models in chronological order; the calc_distance_over_time* helpers iterate over this list.
# NOTE(review): only t0 and t1 went through the alignment cell above; t2 is used
# as loaded -- confirm this is intended before comparing distances across periods.
embeddings = [embeddings_t0, embeddings_t1, embeddings_t2]

In [106]:
def get_counts_dictionary(vocabd, neutwords):
    """Collect, for every word in ``neutwords``, its count in each entry of ``vocabd``.

    Args:
        vocabd: sequence of mappings (word -> count), one per period, or None.
        neutwords: iterable of words to look up.

    Returns:
        dict mapping each word to a list with one count per entry of ``vocabd``
        (0 where the word is missing). An empty dict is returned when
        ``vocabd`` is None, empty, or contains a None entry.
    """
    if vocabd is None or len(vocabd) == 0:
        return {}

    periods = range(len(vocabd))
    # A single missing period makes the whole table unusable.
    if any(vocabd[period] is None for period in periods):
        return {}

    return {
        word: [vocabd[period].get(word, 0) for period in periods]
        for word in neutwords
    }

In [107]:
# For each period: take the variance of every vector dimension across the word
# list, then average those per-dimension variances into one number.
# The result is collected in `variances` but not displayed in this cell.
variances = []
for embedding in embeddings: 
    # rows = words of the list, columns = embedding dimensions
    avgvar = np.mean(np.var(np.array([embedding[word] for word in christianity]), axis = 0))
    variances.append(avgvar)



In [108]:
print("vocab_size: {}".format(len(embeddings_t0.vocab)))

vocab_size: 112043


In [109]:
# Corpus frequency of every word in the list, read from the first period's
# vocabulary. `.vocab` is accessed directly on the model, as in the
# vocab_size cell, instead of going through the deprecated `.wv` alias.
word_counts = {}
for word in christianity:
    word_counts[word] = embeddings_t0.vocab[word].count

  This is separate from the ipykernel package so we can avoid doing imports until


In [110]:
word_counts

{'doop': 142,
 'messias': 36,
 'katholicisme': 278,
 'christendom': 665,
 'verlossing': 127,
 'protestant': 151,
 'jezus': 528,
 'christus': 1796,
 'kerk': 7249}

In [111]:
# Result accumulators: each is a list holding one (initially empty) list per
# entry of `embeddings`. None of them is filled in the visible part of the notebook.
toset = [[] for _ in range(len(embeddings))]
toset_cossim = [[] for _ in range(len(embeddings))]

toset_averageothersetfirst = [[] for _ in range(len(embeddings))]
toset_cossim_averageothersetfirst = [[] for _ in range(len(embeddings))]

toset_averagetargetsetfirst = [[] for _ in range(len(embeddings))]
toset_cossim_averagetargetsetfirst = [[] for _ in range(len(embeddings))]

In [120]:
def cossim(v1, v2, signed = True):
    """Cosine similarity of two vectors.

    Args:
        v1, v2: vectors accepted by ``np.dot`` / ``np.linalg.norm``.
        signed: when False, the absolute value of the similarity is returned.

    Returns:
        The dot product divided by the norm of ``v1`` and then by the norm of ``v2``.
    """
    similarity = np.dot(v1, v2) / np.linalg.norm(v1) / np.linalg.norm(v2)
    return similarity if signed else abs(similarity)

def calc_distance_between_vectors(vec1, vec2, distype = 'norm'):
    """Compare two vectors.

    Args:
        vec1, vec2: vectors of equal length.
        distype: ``'norm'`` for the Euclidean norm of the difference; any other
            value selects ``cossim``.

    Returns:
        The Euclidean distance for ``'norm'``. Otherwise the cosine
        *similarity* (larger means closer), despite the function's name.
    """
    # Compare by value: `is` tests object identity, which only matches equal
    # strings when the interpreter happens to intern them.
    if distype == 'norm':
        return np.linalg.norm(np.subtract(vec1, vec2))
    return cossim(vec1, vec2)

def calc_distance_between_words(vectors, word1, word2, distype = 'norm'):
    """Compare the vectors of two words taken from one embedding.

    Args:
        vectors: container supporting ``word in vectors`` and ``vectors[word]``.
        word1, word2: the words to compare.
        distype: ``'norm'`` for the Euclidean norm of the difference; any other
            value selects ``cossim``.

    Returns:
        ``np.nan`` if either word is missing from ``vectors``. Otherwise the
        Euclidean distance for ``'norm'``, or the cosine *similarity* (larger
        means closer) for any other ``distype``.
    """
    if word1 not in vectors or word2 not in vectors:
        return np.nan
    # Compare by value: `is` tests object identity, which only matches equal
    # strings when the interpreter happens to intern them.
    if distype == 'norm':
        return np.linalg.norm(np.subtract(vectors[word1], vectors[word2]))
    return cossim(vectors[word1], vectors[word2])

def calc_distance_over_time(vectors_over_time, word1, word2, distype = 'norm', vocabd = None, word1lims = [50, 1e25], word2lims = [50, 1e25]):
    """Distance between two words in each embedding of a sequence.

    Args:
        vectors_over_time: sequence of embeddings, one per period.
        word1, word2: the words to compare.
        distype: forwarded to ``calc_distance_between_words``.
        vocabd: optional sequence (one entry per period) of word -> count
            mappings; an entry may be None.
        word1lims, word2lims: ``[min, max]`` count bounds for each word. They are
            only read, never modified.

    Returns:
        A list with one value per period. The value is ``np.nan`` when counts
        are available for BOTH words in that period and either count lies
        outside its bounds. In every other case (no ``vocabd``, no entry for
        the period, or a word missing from the counts) it is whatever
        ``calc_distance_between_words`` returns.
    """
    distances = []
    for period, vectors in enumerate(vectors_over_time):
        counts = None if vocabd is None else vocabd[period]

        # The frequency filter only applies when both words have a count.
        filtered_out = False
        if counts is not None and word1 in counts and word2 in counts:
            filtered_out = (
                counts[word1] < word1lims[0]
                or counts[word2] < word2lims[0]
                or counts[word1] > word1lims[1]
                or counts[word2] > word2lims[1]
            )

        if filtered_out:
            distances.append(np.nan)
        else:
            distances.append(calc_distance_between_words(vectors, word1, word2, distype))

    return distances

def calc_distance_over_time_averagevectorsfirst(vectors_over_time, words_to_average_1, words_to_average_2, distype = 'norm', vocabd = None, word1lims = [50, 1e25], word2lims = [50, 1e25]):
    """Group-level distances between two word lists in each embedding of a sequence.

    For every period the usable words of each list are selected first. A word
    is usable if it is present in the embedding and, when counts exist for the
    period, it also has a count inside the given bounds. Unlike in
    ``calc_distance_over_time``, a word without a count is dropped when counts
    exist for the period.

    Args:
        vectors_over_time: sequence of embeddings, one per period.
        words_to_average_1, words_to_average_2: the two word lists.
        distype: forwarded to ``calc_distance_between_vectors``.
        vocabd: optional sequence (one entry per period) of word -> count
            mappings; an entry may be None.
        word1lims, word2lims: ``[min, max]`` count bounds for the first and the
            second list. They are only read, never modified.

    Returns:
        Three lists with one value per period (``np.nan`` when either list has
        no usable word):
          1. distance between the two list averages;
          2. mean distance from the first list's average to each usable word of
             the second list;
          3. mean distance from each usable word of the first list to the
             second list's average.
    """
    def _usable_words(candidates, period, vectors, lims):
        # Keep the words that exist in this period's embedding and, if counts
        # are available for the period, fall inside the count bounds.
        kept = []
        for word in candidates:
            counts = None if vocabd is None else vocabd[period]
            if counts is None:
                if word in vectors:
                    kept.append(word)
            elif word in counts and word in vectors:
                if not (counts[word] < lims[0] or counts[word] > lims[1]):
                    kept.append(word)
        return kept

    both_averaged = []
    first_averaged = []
    second_averaged = []

    for period, vectors in enumerate(vectors_over_time):
        group1 = _usable_words(words_to_average_1, period, vectors, word1lims)
        group2 = _usable_words(words_to_average_2, period, vectors, word2lims)

        # Without at least one usable word per list there is nothing to compare.
        if not group1 or not group2:
            both_averaged.append(np.nan)
            first_averaged.append(np.nan)
            second_averaged.append(np.nan)
            continue

        centroid1 = np.mean(np.array([vectors[word] for word in group1]), axis = 0)
        centroid2 = np.mean(np.array([vectors[word] for word in group2]), axis = 0)

        both_averaged.append(calc_distance_between_vectors(centroid1, centroid2, distype))
        first_averaged.append(
            np.mean([calc_distance_between_vectors(centroid1, vectors[word], distype) for word in group2]))
        second_averaged.append(
            np.mean([calc_distance_between_vectors(vectors[word], centroid2, distype) for word in group1]))

    return both_averaged, first_averaged, second_averaged

In [119]:
calc_distance_over_time(embeddings, 'oorlog', 'vrede')

[1.1355627, 1.0508329, 2.3080525]

In [121]:
calc_distance_over_time_averagevectorsfirst(embeddings, ['oorlog', 'bommen'], ['vrede', 'zingen'])

([0.9678258, 0.97814745, 2.3160548],
 [1.1714698, 1.1677287, 2.7886038],
 [1.1386201, 1.1396813, 2.639306])

In [114]:
calc_distance_between_vectors(embeddings_t0['oorlog'], embeddings_t0['auto'])

1.3883764

In [115]:
np.linalg.norm(embeddings_t0['oorlog'] - embeddings_t0['auto'])

1.3883764

In [14]:
# For each (earlier, later) pair of word2vec binary files: load both models,
# normalise them, align the later one onto the earlier one and save both next
# to the inputs with an "_aligned" suffix.
# NOTE: this rebinds the notebook-level names embeddings_t0 / embeddings_t1.
for t0_path, t1_path in [('embeddings/vecs_1945-55.bin', 'embeddings/vecs_1965-75.bin')]:
    embeddings_t0 = KeyedVectors.load_word2vec_format(t0_path, binary=True, unicode_errors='ignore')
    embeddings_t1 = KeyedVectors.load_word2vec_format(t1_path, binary=True, unicode_errors='ignore')
    # The alignment reads syn0norm, so the normalised vectors must exist first.
    # NOTE(review): replace=True presumably overwrites the raw vectors with the
    # normalised ones -- confirm against the gensim documentation.
    embeddings_t0.init_sims(replace=True)
    embeddings_t1.init_sims(replace=True)
    # Modifies both models in place (shared vocabulary only) and rotates t1 onto t0.
    embeddings_t1 = smart_procrustes_align_gensim(embeddings_t0, embeddings_t1)
    embeddings_t0.save('{0}_aligned'.format(t0_path))
    embeddings_t1.save('{0}_aligned'.format(t1_path))

2018-10-04 10:33:27,077 : INFO : loading projection weights from embeddings/vecs_1945-55.bin


















2018-10-04 10:33:30,064 : INFO : duplicate words detected, shrinking matrix size from 81063 to 80352
2018-10-04 10:33:30,064 : INFO : loaded (80352, 100) matrix from embeddings/vecs_1945-55.bin
2018-10-04 10:33:30,082 : INFO : loading projection weights from embeddings/vecs_1965-75.bin












2018-10-04 10:33:32,692 : INFO : duplicate words detected, shrinking matrix size from 99886 to 99374
2018-10-04 10:33:32,693 : INFO : loaded (99374, 100) matrix from embeddings/vecs_1965-75.bin
2018-10-04 10:33:32,708 : INFO : precomputing L2-norms of word weight vectors
2018-10-04 10:33:33,376 : INFO : precomputing L2-norms of word weight vectors
  app.launch_new_instance()
2018-10-04 10:33:34,919 : INFO : saving Word2VecKeyedVectors object under embeddings/vecs_1945-55.bin_aligned, separately None


2018-10-04 10:33:34,919 : INFO : not storing attribute vectors_norm
2018-10-04 10:33:35,283 : INFO : saved embeddings/vecs_1945-55.bin_aligned
2018-10-04 10:33:35,285 : INFO : saving Word2VecKeyedVectors object under embeddings/vecs_1965-75.bin_aligned, separately None
2018-10-04 10:33:35,287 : INFO : not storing attribute vectors_norm
2018-10-04 10:33:35,594 : INFO : saved embeddings/vecs_1965-75.bin_aligned
