### install dependencies
only do this once

In [1]:
# One-time environment setup, kept commented out so that "Run All" does not reinstall anything.
# NOTE(review): IPython runs every `!` line in its own subshell, so `source venv/bin/activate`
# does not carry over to the lines after it — run these commands in a terminal instead.
#! python3 -m venv venv
#! source venv/bin/activate
#! pip3 install -r requirements.txt
#! pip install ipykernel
#! python -m ipykernel install --user
#! pip install -e ./
#! pip freeze

### imports

In [2]:
from builtins import range
import numpy as np
import pandas as pd
import os
import logging

In [3]:
from src.embedding_utilities import euclidean_dist, cosine_dist
from src.embedding import Embedding

In [4]:
# Configure the root logger at DEBUG level so the loader's INFO messages show up in the cell output.
logging.basicConfig(level=logging.DEBUG)

# define paths

In [5]:
# S3 bucket and object names; in the visible cells they are only used by commented-out S3 loading code.
# NOTE(review): boto3 is not imported in this notebook, so the client line cannot simply be uncommented.
#s3_client = boto3.client('s3')
BUCKET_NAME = 'ma-2021-07-word-embeddings'
# NOTE(review): this names the 300d file, while the local path loaded below is glove.6B.100d.txt — confirm which one is intended.
FILE_NAME_EN = 'glove.6B.300d.txt'
FILE_NAME_DE = 'GloVe_vectors_de_50000.txt'

In [6]:
# Local embedding files, one list of paths per language (passed to Embedding as `path_list`).
glove_path_en = ['../data/glove.6B.100d.txt']
# Alternative German source, split across two part files:
#glove_path_de = ['../data/GloVe_vectors_de_part_01.txt', '../data/GloVe_vectors_de_part_02.txt']
glove_path_de = ['../data/GloVe_vectors_de_50000.txt']

### the cosine distance does not change when doubling the vectors
Cosine distance only takes the angle between the vectors into account (the smaller the angle, the higher the similarity).

Euclidean distance also depends on the absolute lengths of the vectors.

In [7]:
# Two small example vectors; both distances are computed first and reported afterwards.
a, b = pd.Series([1, 1]), pd.Series([1, 2])
euclidean_ab = euclidean_dist(a, b)
cosine_ab = cosine_dist(a, b)
print(f'euclidian distance {euclidean_ab}')
print(f'cosine distance {cosine_ab}')

euclidian distance 1.0
cosine distance 0.05131670194948623


In [8]:
# Scale both example vectors by the same factor and measure both distances again.
a_double, b_double = 2 * a, 2 * b
euclidean_doubled = euclidean_dist(a_double, b_double)
cosine_doubled = cosine_dist(a_double, b_double)
print(f'euclidian distance {euclidean_doubled}')
print(f'cosine distance {cosine_doubled}')

euclidian distance 2.0
cosine distance 0.05131670194948623


In [9]:
# Select the distance function and its metric label (cosine).
dist = cosine_dist
metric = 'cosine'
# alternative: dist, metric = euclidean_dist, 'euclidean'

### load pretrained word vectors
source: https://www.kaggle.com/anindya2906/glove6b

In [None]:
# Build the English embedding from the local file list; %time reports how long the load takes.
%time glove_embedding_en = Embedding(language='en', path_list=glove_path_en)

INFO:root:loading word embeddings word to vec from path ['../data/glove.6B.100d.txt']


In [None]:
# Loading from S3 works but is extremely slow:
# local: about 10 s for 400k entries
# read from AWS S3: about 40 s for 4k entries => about 4000 s for 400k => roughly 400x slower
# NOTE(review): the call below passes `path=`, while the other Embedding calls in this notebook use `path_list=[...]` — adjust before re-enabling.
#%time glove_embedding_en = Embedding(language='en', path=f's3://{BUCKET_NAME}/{FILE_NAME_EN}')

In [None]:
# Report the dimensions of the English embedding table, then display its first row.
en_table = glove_embedding_en.embedding
print(en_table.shape)
en_table.head(1)

In [None]:
# Load timings (presumably before/after a change to the loader — TODO confirm):
# 1.3 million entries (1309281) => about 1000 s on the local machine before, 360 s after
# 50k entries => 9 s on the local machine before, 6 s after

# 50k entries => about 9 min when reading line by line from AWS

# AWS
#%time glove_embedding_de = Embedding(language='de', path_list = [f's3://{BUCKET_NAME}/{FILE_NAME_DE}'])

# local
%time glove_embedding_de = Embedding(language='de', path_list=glove_path_de)

In [None]:
# Report the dimensions of the German embedding table, then display its first two rows.
de_table = glove_embedding_de.embedding
print(de_table.shape)
de_table.head(2)

In [None]:
# Print the first ten components of the vector stored for 'tree' (English) and 'baum' (German).
for glove_embedding, word in ((glove_embedding_en, 'tree'), (glove_embedding_de, 'baum')):
    print(glove_embedding.word2vec[word][:10])

In [None]:
# Print the word stored at index 1 of each language's index-to-word mapping.
for glove_embedding in (glove_embedding_en, glove_embedding_de):
    print(glove_embedding.index_to_word[1])

In [None]:
# Print the English and German embedding table shapes, one per line.
en_shape = glove_embedding_en.embedding.shape
de_shape = glove_embedding_de.embedding.shape
print(en_shape, de_shape, sep='\n')

# Best of

In [None]:
# English vectors: analogy query for the word triple man / fun / woman.
glove_embedding_en.find_analogies('man', 'fun', 'woman')

In [None]:
# German vectors: analogy query for the word triple mann / spass / frau.
glove_embedding_de.find_analogies('mann', 'spass', 'frau')

In [None]:
# English vectors: analogy query for the word triple woman / love / man.
glove_embedding_en.find_analogies('woman', 'love', 'man')

In [None]:
# German vectors: analogy query for the word triple frau / liebe / mann.
glove_embedding_de.find_analogies('frau', 'liebe', 'mann')

In [None]:
# English vectors: analogy query for the word triple man / motivation / woman.
glove_embedding_en.find_analogies('man', 'motivation', 'woman')

In [None]:
# German vectors: analogy query for the word triple mann / motivation / frau.
glove_embedding_de.find_analogies('mann', 'motivation', 'frau')

In [None]:
# English vectors: the motivation query with the gender words swapped (woman / motivation / man).
glove_embedding_en.find_analogies('woman', 'motivation', 'man')

In [None]:
# German vectors: the motivation query with the gender words swapped (frau / motivation / mann).
glove_embedding_de.find_analogies('frau', 'motivation', 'mann')

In [None]:
# English vectors: analogy query for the word triple man / respect / woman.
glove_embedding_en.find_analogies('man', 'respect', 'woman')

In [None]:
# German vectors: analogy query for the word triple mann / respekt / frau.
glove_embedding_de.find_analogies('mann', 'respekt', 'frau')

In [None]:
# Same analogy triple (man / success / woman) queried in English and in German.
glove_embedding_en.find_analogies('man', 'success', 'woman')
# The German embedding must be queried with the German word 'frau'; the English 'woman' was passed here before.
glove_embedding_de.find_analogies('mann', 'erfolg', 'frau')

In [None]:
# Same analogy triple (man / love / woman) queried in English and in German.
glove_embedding_en.find_analogies('man', 'love', 'woman')
glove_embedding_de.find_analogies('mann', 'liebe', 'frau')

In [None]:
# In-law analogies in English and German.
# GloVe text files hold one whitespace-delimited token per entry, so a phrase containing spaces
# ('mother in law') cannot be a vocabulary key; the hyphenated token is used instead.
glove_embedding_en.find_analogies('woman', 'mother-in-law', 'man')
glove_embedding_de.find_analogies('frau', 'schwiegermutter', 'mann')
glove_embedding_de.find_analogies('mann', 'schwiegervater', 'frau')

In [None]:
# Same analogy triple (man / great / woman) queried in English and in German.
glove_embedding_en.find_analogies('man', 'great', 'woman')
glove_embedding_de.find_analogies('mann', 'großartig', 'frau')

In [None]:
# Same analogy triple (woman / kids / man) queried in English and in German.
# The English call previously mixed plural 'women' with singular 'man'; it now uses the singular
# 'woman' so that it matches the German 'frau' / 'mann' query.
glove_embedding_en.find_analogies('woman', 'kids', 'man')
glove_embedding_de.find_analogies('frau', 'kinder', 'mann')

In [None]:
# Same analogy triple (man / friends / woman, singular forms) queried in English and in German.
glove_embedding_en.find_analogies('man', 'friends', 'woman')
glove_embedding_de.find_analogies('mann', 'freunde', 'frau')

In [None]:
# The friends query again, this time with the plural forms (men / women, männer / frauen).
glove_embedding_en.find_analogies('men', 'friends', 'women')
glove_embedding_de.find_analogies('männer', 'freunde', 'frauen')

In [None]:
# Same analogy triple (woman / safety / man) queried in English and in German.
glove_embedding_en.find_analogies('woman', 'safety', 'man')
glove_embedding_de.find_analogies('frau', 'sicherheit', 'mann')

In [None]:
# Same analogy triple (women / orgasm / men, plural forms) queried in English and in German.
glove_embedding_en.find_analogies('women', 'orgasm', 'men')
glove_embedding_de.find_analogies('frauen', 'orgasmus', 'männer')

In [None]:
# Same analogy triple (woman / money / man) queried in English and in German.
glove_embedding_en.find_analogies('woman', 'money', 'man')
glove_embedding_de.find_analogies('frau', 'geld', 'mann')

In [None]:
# These work right away: woman / man paired with a series of female English terms.
# NOTE(review): assumes find_analogies emits its own output — the original cell of bare calls relied on that as well.
for female_term in ('she', 'aunt', 'sister', 'actress', 'mother'):
    glove_embedding_en.find_analogies('woman', 'man', female_term)

In [None]:
# These work right away: frau / mann paired with a series of female German terms.
# NOTE(review): assumes find_analogies emits its own output — the original cell of bare calls relied on that as well.
for female_term in ('sie', 'tante', 'schwester', 'schauspielerin', 'mutter'):
    glove_embedding_de.find_analogies('frau', 'mann', female_term)

In [None]:
# These work right away: geography analogies on the English vectors.
# NOTE(review): assumes find_analogies emits its own output — the original cell of bare calls relied on that as well.
geography_triples_en = [
    ('paris', 'france', 'london'),
    ('paris', 'france', 'rome'),
    ('france', 'paris', 'italy'),
    ('florida', 'miami', 'texas'),
]
for first, second, third in geography_triples_en:
    glove_embedding_en.find_analogies(first, second, third)

In [None]:
# Geography analogies on the German vectors, in the same argument order as the English
# geography cell (paris/france/london, paris/france/rome, france/paris/italy, florida/miami/texas).
# The first two arguments were previously swapped relative to the English queries, so the
# German calls asked a different question than their English counterparts.
glove_embedding_de.find_analogies('paris', 'frankreich', 'london')
glove_embedding_de.find_analogies('paris', 'frankreich', 'rom')
glove_embedding_de.find_analogies('frankreich', 'paris', 'italien')
glove_embedding_de.find_analogies('florida', 'miami', 'texas')

In [None]:
# Works right away: month analogy (november / december / march) in English and in German.
glove_embedding_en.find_analogies('november', 'december', 'march')
glove_embedding_de.find_analogies('november', 'dezember', 'märz')

In [None]:
# Works right away: person / profession analogy. The third word differs between the two
# languages ('dali' in English, 'merkel' in German).
glove_embedding_en.find_analogies('einstein', 'scientist', 'dali')
glove_embedding_de.find_analogies('einstein', 'wissenschaftler', 'merkel')

In [None]:
# Just for fun: country / food analogy (china / rice / germany) in English and in German.
glove_embedding_en.find_analogies('china', 'rice', 'germany')
glove_embedding_de.find_analogies('china', 'reis', 'deutschland')

In [None]:
# Same analogy triple (germany / beer / usa) queried in English and in German.
glove_embedding_en.find_analogies('germany', 'beer', 'usa')
glove_embedding_de.find_analogies('deutschland', 'bier', 'usa')

In [None]:
# Same analogy triple (usa / freedom / germany) queried in English and in German.
glove_embedding_en.find_analogies('usa', 'freedom', 'germany')
glove_embedding_de.find_analogies('usa', 'freiheit', 'deutschland')

In [None]:
# Occupation / product analogies applied to 'politician': builder / building and writer / book,
# first on the English vectors, then the German counterparts.
glove_embedding_en.find_analogies('builder', 'building', 'politician')
glove_embedding_en.find_analogies('writer', 'book', 'politician')
glove_embedding_de.find_analogies('bauarbeiter', 'baustelle', 'politiker')
glove_embedding_de.find_analogies('schriftsteller', 'buch', 'politiker')

In [None]:
# Same analogy triple (hacker / code / politician) queried in English and in German.
glove_embedding_en.find_analogies('hacker', 'code', 'politician')
glove_embedding_de.find_analogies('hacker', 'code', 'politiker')

In [None]:
# Same analogy triple (trump / democracy / hitler) queried in English and in German.
glove_embedding_en.find_analogies('trump', 'democracy', 'hitler')
glove_embedding_de.find_analogies('trump', 'demokratie', 'hitler')

In [None]:
# Inspect what the German embedding returns for 'trump'; the result is shown as the cell output.
glove_embedding_de.get_embedding('trump')

# nearest neighbors

In [None]:
# English vectors: nearest-neighbour query for 'france'.
glove_embedding_en.nearest_neighbors('france')

In [None]:
# German vectors: nearest-neighbour query for 'frankreich'.
glove_embedding_de.nearest_neighbors('frankreich')

In [None]:
# Nearest-neighbour query for 'einstein' on the English and on the German vectors.
glove_embedding_en.nearest_neighbors('einstein')
glove_embedding_de.nearest_neighbors('einstein')

In [None]:
# Nearest-neighbour query for 'trump' on the English and on the German vectors.
glove_embedding_en.nearest_neighbors('trump')
glove_embedding_de.nearest_neighbors('trump')

In [None]:
# Nearest-neighbour query for 'merkel' on the English and on the German vectors.
glove_embedding_en.nearest_neighbors('merkel')
glove_embedding_de.nearest_neighbors('merkel')

In [None]:
# Nearest-neighbour query for 'procrastination' (English) and 'prokrastination' (German).
glove_embedding_en.nearest_neighbors('procrastination')
glove_embedding_de.nearest_neighbors('prokrastination')

In [None]:
# Nearest-neighbour query for 'religion' on the English and on the German vectors.
glove_embedding_en.nearest_neighbors('religion')
glove_embedding_de.nearest_neighbors('religion')

In [None]:
# Nearest neighbours of a few person-related English words.
# The free function `nearest_neighbors` and the bare `index_to_word` / `word2vec` names are not
# defined or imported in the visible cells, so the old calls fail on a fresh kernel; the loaded
# English Embedding object's method is used instead.
glove_embedding_en.nearest_neighbors('woman')
glove_embedding_en.nearest_neighbors('man')
glove_embedding_en.nearest_neighbors('nephew')

In [None]:
# Nearest neighbours of a few assorted English words.
# The free function `nearest_neighbors` and the bare `index_to_word` / `word2vec` names are not
# defined or imported in the visible cells, so the old calls fail on a fresh kernel; the loaded
# English Embedding object's method is used instead.
glove_embedding_en.nearest_neighbors('february')
glove_embedding_en.nearest_neighbors('success')
glove_embedding_en.nearest_neighbors('money')
glove_embedding_en.nearest_neighbors('love')

In [None]:
# Nearest neighbours of 'procrastination' on the English vectors.
# The free function `nearest_neighbors` and the bare `index_to_word` / `word2vec` names are not
# defined or imported in the visible cells, so the old call fails on a fresh kernel; the loaded
# English Embedding object's method is used instead.
glove_embedding_en.nearest_neighbors('procrastination')