In [1]:
# Dataset is glove 6B 50T
# udemy nlp with python

### install dependencies
only do this once

In [2]:
#! python3 -m venv venv
#! source venv/bin/activate
#! pip3 install -r requirements.txt
#! pip install ipykernel
#! python -m ipykernel install --user
#! pip install -e ./
#! pip freeze

### imports

In [3]:
from builtins import range
import numpy as np
import pandas as pd
import os

In [4]:
from src.embedding_utilities import euclidean_dist, cosine_dist
from src.embedding import Embedding

In [5]:
import boto3
from botocore.exceptions import ClientError

# connect to s3

In [6]:
#s3_client = boto3.client('s3')
BUCKET_NAME = 'ma-2021-07-word-embeddings'
FILE_NAME_EN = 'glove.6B.300d.txt'
FILE_NAME_DE = 'GloVe_vectors_de.txt'

In [7]:
glove_path_en = '../data/glove.6B.100d.txt'
glove_path_de = '../data/GloVe_vectors_de.txt'

### the cosine distance does not change when doubeling
cosine distance only takes into account angle (the smaller the higher the similarity.)

euclidian looks at absolute length

In [8]:
a = pd.Series([1,1])
b = pd.Series([1,2])
print(f'euclidian distance {euclidean_dist(a,b)}')
print(f'cosine distance {cosine_dist(a,b)}')

euclidian distance 1.0
cosine distance 0.05131670194948623


In [9]:
a_double = a*2
b_double = b*2
print(f'euclidian distance {euclidean_dist(a_double,b_double)}')
print(f'cosine distance {cosine_dist(a_double,b_double)}')

euclidian distance 2.0
cosine distance 0.05131670194948623


In [10]:
dist, metric = cosine_dist, 'cosine'
#alternative: dist, metric = dist1, 'euclidean'

### load pretrained word vectors
source: https://www.kaggle.com/anindya2906/glove6b

In [11]:
%time glove_embedding_en = Embedding(language='en', path=glove_path_en)

loading word embeddings word to vec from path ../data/glove.6B.100d.txt, num lines: 0


400000it [00:07, 52615.12it/s]


total number of entries found:  400000. Dimension: 100
CPU times: user 21.5 s, sys: 1.01 s, total: 22.5 s
Wall time: 22.5 s


In [12]:
# working but incredible slow:
# lokal: ca 10 sec for 400k
# read from aws s3: ca 40 sek for 4k => 4000 sek für 400k => faktor 400 schlechter
#%time glove_embedding_en = Embedding(language='en', path=f's3://{BUCKET_NAME}/{FILE_NAME_EN}')

In [13]:
print(glove_embedding_en.embedding.shape)
glove_embedding_en.embedding.head(1)

(400000, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
the,-0.038194,-0.24487,0.72812,-0.39961,0.083172,0.043953,-0.39141,0.3344,-0.57545,0.087459,...,0.016215,-0.017099,-0.38984,0.87424,-0.72569,-0.51058,-0.52028,-0.1459,0.8278,0.27062


In [14]:
glove_embedding_de = Embedding(language='de', path=glove_path_de)
print(glove_embedding_de.embedding.shape)
glove_embedding_de.embedding.head(1)

loading word embeddings word to vec from path ../data/GloVe_vectors_de.txt, num lines: 0


1309281it [01:04, 20310.47it/s]


total number of entries found:  1309281. Dimension: 300
(1309281, 300)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
der,-0.28062,0.74426,0.257364,-0.070038,-0.07914,-0.134922,0.418708,0.373415,0.074699,-0.791163,...,-0.138816,-0.45058,-0.136059,-0.005833,-0.425212,-0.04491,0.159463,-0.4145,0.046645,0.069972


In [15]:
print(glove_embedding_en.word2vec['tree'][:10])
print(glove_embedding_de.word2vec['baum'][:10])

[-0.95006   1.1876   -0.15589  -0.074026  0.5079    0.31601  -0.54892
  0.067332  0.15188  -0.5513  ]
[-0.400554  0.139754 -0.357781  0.45136  -0.178334  0.052695 -0.75287
 -0.285123  0.048494  0.015669]


In [16]:
print(glove_embedding_en.index_to_word[1])
print(glove_embedding_de.index_to_word[1])

,
und


In [17]:
print(glove_embedding_en.embedding.shape)
print(glove_embedding_de.embedding.shape)

(400000, 100)
(1309281, 300)


# Best of

In [18]:
glove_embedding_en.find_analogies('man', 'fun', 'woman')
glove_embedding_de.find_analogies('mann', 'spass', 'frau')

man - fun = woman - kids
mann - spass = frau - schwiegermutter


'schwiegermutter'

In [19]:
glove_embedding_en.find_analogies('woman', 'love', 'man')
glove_embedding_de.find_analogies('frau', 'liebe', 'mann')

woman - love = man - me
frau - liebe = mann - mein


'mein'

In [20]:
glove_embedding_en.find_analogies('man', 'motivation', 'woman')
glove_embedding_de.find_analogies('mann', 'motivation', 'frau')

man - motivation = woman - desire
mann - motivation = frau - beeinflussen


'beeinflussen'

In [21]:
glove_embedding_en.find_analogies('woman', 'motivation', 'man')
glove_embedding_de.find_analogies('frau', 'motivation', 'mann')

woman - motivation = man - motive
frau - motivation = mann - stärke


'stärke'

In [22]:
glove_embedding_en.find_analogies('man', 'respect', 'woman')
glove_embedding_de.find_analogies('mann', 'respekt', 'frau')

man - respect = woman - dignity
mann - respekt = frau - wertschätzung


'wertschätzung'

In [23]:
glove_embedding_en.find_analogies('man', 'success', 'woman')
glove_embedding_de.find_analogies('mann', 'erfolg', 'woman')

man - success = woman - achieved
mann - erfolg = woman - hit


'hit'

In [24]:
glove_embedding_en.find_analogies('man', 'love', 'woman')
glove_embedding_de.find_analogies('mann', 'liebe', 'frau')

man - love = woman - mother
mann - liebe = frau - mutter


'mutter'

In [25]:
glove_embedding_en.find_analogies('woman', 'mother in law', 'man')
glove_embedding_de.find_analogies('frau', 'schwiegermutter', 'mann')

mother in law not in dictionary
frau - schwiegermutter = mann - steinreicher


'steinreicher'

In [26]:
glove_embedding_en.find_analogies('man', 'great', 'woman')
glove_embedding_de.find_analogies('mann', 'großartig', 'frau')

man - great = woman - little
mann - großartig = frau - langweilig


'langweilig'

In [27]:
glove_embedding_en.find_analogies('women', 'kids', 'man')
glove_embedding_de.find_analogies('frau', 'kinder', 'mann')

women - kids = man - kid
frau - kinder = mann - kindern


'kindern'

In [28]:
glove_embedding_en.find_analogies('man', 'friends', 'woman')
glove_embedding_de.find_analogies('mann', 'freunde', 'frau')

man - friends = woman - parents
mann - freunde = frau - freundin


'freundin'

In [29]:
glove_embedding_en.find_analogies('men', 'friends', 'women')
glove_embedding_de.find_analogies('männer', 'freunde', 'frauen')

men - friends = women - parents
männer - freunde = frauen - ihrer


'ihrer'

In [30]:
glove_embedding_en.find_analogies('woman', 'safety', 'man')
glove_embedding_de.find_analogies('frau', 'sicherheit', 'mann')

woman - safety = man - security
frau - sicherheit = mann - ausrüstung


'ausrüstung'

In [31]:
glove_embedding_en.find_analogies('women', 'orgasm', 'men')
glove_embedding_de.find_analogies('frauen', 'orgasmus', 'männer')

women - orgasm = men - ejaculation
frauen - orgasmus = männer - hengzt


'hengzt'

In [32]:
glove_embedding_en.find_analogies('woman', 'money', 'man')
glove_embedding_de.find_analogies('frau', 'geld', 'mann')

woman - money = man - cash
frau - geld = mann - bezahlen


'bezahlen'

In [33]:
# klappt einfach
glove_embedding_en.find_analogies('woman', 'man', 'she')
glove_embedding_en.find_analogies('woman', 'man', 'aunt')
glove_embedding_en.find_analogies('woman', 'man', 'sister')
glove_embedding_en.find_analogies('woman', 'man', 'actress')
glove_embedding_en.find_analogies('woman', 'man', 'mother')

woman - man = she - he
woman - man = aunt - uncle
woman - man = sister - brother
woman - man = actress - actor
woman - man = mother - father


'father'

In [34]:
# klappt einfach
glove_embedding_de.find_analogies('frau', 'mann', 'sie')
glove_embedding_de.find_analogies('frau', 'mann', 'tante')
glove_embedding_de.find_analogies('frau', 'mann', 'schwester')
glove_embedding_de.find_analogies('frau', 'mann', 'schauspielerin')
glove_embedding_de.find_analogies('frau', 'mann', 'mutter')

frau - mann = sie - aber
frau - mann = tante - freund
frau - mann = schwester - bruder
frau - mann = schauspielerin - schauspieler
frau - mann = mutter - vater


'vater'

In [35]:
#klappt einfach
glove_embedding_en.find_analogies('prais', 'france', 'london')
glove_embedding_en.find_analogies('paris', 'france', 'rome')
glove_embedding_en.find_analogies('france', 'paris', 'italy')
glove_embedding_en.find_analogies('florida', 'miami', 'texas')

prais not in dictionary
paris - france = rome - italy
france - paris = italy - rome
florida - miami = texas - dallas


'dallas'

In [36]:
glove_embedding_de.find_analogies('frankreich', 'paris', 'london')
glove_embedding_de.find_analogies('frankreich', 'paris', 'rom')
glove_embedding_de.find_analogies('paris', 'frankreich', 'italien')
glove_embedding_de.find_analogies('miami', 'florida', 'texas')

frankreich - paris = london - york
frankreich - paris = rom - florenz
paris - frankreich = italien - spanien
miami - florida = texas - alabama


'alabama'

In [37]:
#klappt einfach
glove_embedding_en.find_analogies('november', 'december', 'march')
glove_embedding_de.find_analogies('november', 'dezember', 'märz')

november - december = march - july
november - dezember = märz - februar


'februar'

In [38]:
#klappt einfach
glove_embedding_en.find_analogies('einstein', 'scientist', 'painter')
glove_embedding_de.find_analogies('einstein', 'wissenschaftler', 'maler')

einstein - scientist = painter - sculptor
einstein - wissenschaftler = maler - künstler


'künstler'

In [39]:
# fun
glove_embedding_en.find_analogies('rice', 'china', 'germany')
glove_embedding_de.find_analogies('reis', 'china', 'deutschland')

rice - china = germany - austria
reis - china = deutschland - volksrepublik


'volksrepublik'

In [40]:
glove_embedding_en.find_analogies('beer', 'germany', 'usa')
glove_embedding_de.find_analogies('bier', 'deutschland', 'usa')

beer - germany = usa - switzerland
bier - deutschland = usa - staaten


'staaten'

In [41]:
glove_embedding_en.find_analogies('freedom', 'usa', 'germany')
glove_embedding_de.find_analogies('freiheit', 'usa', 'deutschland')

freedom - usa = germany - switzerland
freiheit - usa = deutschland - kanada


'kanada'

In [42]:
glove_embedding_en.find_analogies('building', 'builder', 'politician')
glove_embedding_en.find_analogies('book', 'writer', 'politician')
glove_embedding_de.find_analogies('baustelle', 'bauarbeiter', 'politiker')
glove_embedding_de.find_analogies('buch', 'schriftsteller', 'politiker')

building - builder = politician - jurist
book - writer = politician - businessman
baustelle - bauarbeiter = politiker - jurist
buch - schriftsteller = politiker - jurist


'jurist'

In [43]:
glove_embedding_en.find_analogies('code', 'hacker', 'politician')
glove_embedding_de.find_analogies('code', 'hacker', 'politiker')

code - hacker = politician - businessman
code - hacker = politiker - jurist


'jurist'

In [44]:
# addition
glove_embedding_de.get_embedding('Frau')

sorry, word Frau not in index


# nearest neighbors

In [45]:
glove_embedding_en.nearest_neighbors('france')

neighbors of: france
	belgium
	french
	britain
	spain
	paris


array([2975,  348,  695, 1029, 1035])

In [46]:
glove_embedding_de.nearest_neighbors('frankreich')

neighbors of: frankreich
	italien
	französischen
	großbritannien
	spanien
	belgien


array([ 480,  457, 1095,  804, 1423])

In [47]:
glove_embedding_en.nearest_neighbors('einstein')
glove_embedding_de.nearest_neighbors('einstein')

neighbors of: einstein
	relativity
	freud
	physics
	bohr
	theory
neighbors of: einstein
	einsteins
	physik
	heisenberg
	poincaré
	mileva


array([ 48160,   2371,  28521,  31601, 250921])

In [48]:
glove_embedding_en.nearest_neighbors('trump')
glove_embedding_de.nearest_neighbors('trump')

neighbors of: trump
	nows
	hilton
	casino
	ivanka
	mih
neighbors of: trump
	trumps
	obama
	barack
	bush
	clinton


array([43972, 13007, 19208,  7553,  9118])

In [49]:
glove_embedding_en.nearest_neighbors('merkel')
glove_embedding_de.nearest_neighbors('merkel')

neighbors of: merkel
	schroeder
	sarkozy
	gerhard
	kohl
	chirac
neighbors of: merkel
	bundeskanzlerin
	bundeskanzler
	angela
	schröder
	kanzlerin


array([29678, 10167,  5671,  4245, 81235])

In [50]:
glove_embedding_en.nearest_neighbors('procrastination')
glove_embedding_de.nearest_neighbors('prokrastination')

neighbors of: procrastination
	obstinacy
	stubbornness
	intransigence
	inflexibility
	obfuscation
neighbors of: prokrastination
	huraa
	trichodynie
	browserweichen
	rangierfeld
	terekura


array([1264153, 1162555, 1177566, 1047241, 1161246])

In [51]:
glove_embedding_en.nearest_neighbors('religion')
glove_embedding_de.nearest_neighbors('religion')

neighbors of: religion
	religions
	christianity
	religious
	beliefs
	spirituality
neighbors of: religion
	christentum
	religionen
	glauben
	islam
	glaube


array([4607, 6201, 2933, 3476, 7079])

In [52]:
nearest_neighbors('woman',index_to_word, word2vec)
nearest_neighbors('man',index_to_word, word2vec)
nearest_neighbors('nephew',index_to_word, word2vec)

NameError: name 'nearest_neighbors' is not defined

In [None]:
nearest_neighbors('february',index_to_word, word2vec)
nearest_neighbors('success',index_to_word, word2vec)
nearest_neighbors('money',index_to_word, word2vec)
nearest_neighbors('love',index_to_word, word2vec)

In [None]:
nearest_neighbors('procrastination', index_to_word, word2vec)