# Tema 3: Embeddings Estáticos: Extensiones de Word2Vec

## Ejercicio 1: Sense2Vec

### Ejercicio 1.1

In [1]:
from sense2vec import Sense2Vec
from pathlib import Path

# Set up paths: the "models" directory sits in the parent of the current working directory.
PATH_MODELS = Path.cwd().parent / 'models'

# Load the Sense2Vec vectors stored under <PATH_MODELS>/s2v_old.
s2v_model_dir = PATH_MODELS / "s2v_old"
s2v = Sense2Vec().from_disk(str(s2v_model_dir))

In [2]:
# Show the first five components of the vector stored for each POS-tagged
# sense of "watch".
for pos_tag in ("NOUN", "VERB"):
    print(f'Watch as a {pos_tag}:', s2v[f"watch|{pos_tag}"][:5], '...')

Watch as a NOUN: [-0.03000872 -0.07183316 -0.31021914  0.00982919 -0.4163286 ] ...
Watch as a VERB: [-0.18492278  0.05598612 -0.22331624 -0.15254268 -0.2843019 ] ...


### Ejercicio 1.2 y 1.3

In [3]:
import gensim.downloader as api
import spacy


# Model identifiers are named so each can be swapped in one place.
W2V_MODEL_NAME = "word2vec-google-news-300"
SPACY_MODEL_NAME = "en_core_web_sm"

# Word2Vec vectors obtained through gensim's downloader API.
w2v = api.load(W2V_MODEL_NAME)
# spaCy pipeline; later cells use it to tokenize sentences and read POS tags.
nlp = spacy.load(SPACY_MODEL_NAME)

In [4]:
sentences = [
    "You must watch the light movie.",
    "He gave me a watch with a light."
]

# For every content word, compare the nearest neighbours of its POS-tagged
# sense (Sense2Vec) with those of the plain surface form (Word2Vec).
for sent in sentences:
    print('------------------------')
    print(sent)
    print('------------------------')
    for token in nlp(sent):
        # Stop words and punctuation are skipped.
        if token.is_stop or token.is_punct:
            continue

        word = f'{token.text}|{token.pos_}'
        print(word)

        # Both most_similar calls raise on out-of-vocabulary keys; check
        # membership first so one unknown word does not abort the whole cell.
        if word in s2v:
            print('\t-> Sense2Vec:', s2v.most_similar(word, n=3))
        else:
            print('\t-> Sense2Vec: (key not in vocabulary)')

        if token.text in w2v:
            print('\t-> Word2Vec:', w2v.most_similar(token.text, topn=3))
        else:
            print('\t-> Word2Vec: (key not in vocabulary)')
    print()

------------------------
You must watch the light movie.
------------------------
watch|VERB
	-> Sense2Vec: [('watching|VERB', np.float32(0.8933)), ('watching|NOUN', np.float32(0.8348)), ('watched|VERB', np.float32(0.823))]
	-> Word2Vec: [('watching', 0.7835854291915894), ('watched', 0.6677262783050537), ('Watching', 0.6385796666145325)]
light|ADJ
	-> Sense2Vec: [('dark|ADJ', np.float32(0.8024)), ('bright|ADJ', np.float32(0.7952)), ('light|NOUN', np.float32(0.7695))]
	-> Word2Vec: [('lights', 0.550593912601471), ('yellowish_glow', 0.5484952926635742), ('illumination', 0.5342711806297302)]
movie|NOUN
	-> Sense2Vec: [('whole_movie|NOUN', np.float32(0.9032)), ('good_movie|NOUN', np.float32(0.9019)), ('terrible_movie|NOUN', np.float32(0.8995))]
	-> Word2Vec: [('film', 0.8676770329475403), ('movies', 0.8013108372688293), ('films', 0.7363011837005615)]

------------------------
He gave me a watch with a light.
------------------------
gave|VERB
	-> Sense2Vec: [('Gave|VERB', np.float32(0.8435

### Ejercicio 1.4

In [5]:
sentences = [
    "I went to the bank to deposit my money.",
    "The land along the river bank has vegetation."
]

# For every content word, compare the nearest neighbours of its POS-tagged
# sense (Sense2Vec) with those of the plain surface form (Word2Vec).
for sent in sentences:
    print('------------------------')
    print(sent)
    print('------------------------')
    for token in nlp(sent):
        # Stop words and punctuation are skipped.
        if token.is_stop or token.is_punct:
            continue

        word = f'{token.text}|{token.pos_}'
        print(word)

        # Both most_similar calls raise on out-of-vocabulary keys; check
        # membership first so one unknown word does not abort the whole cell.
        if word in s2v:
            print('\t-> Sense2Vec:', s2v.most_similar(word, n=3))
        else:
            print('\t-> Sense2Vec: (key not in vocabulary)')

        if token.text in w2v:
            print('\t-> Word2Vec:', w2v.most_similar(token.text, topn=3))
        else:
            print('\t-> Word2Vec: (key not in vocabulary)')
    print()

------------------------
I went to the bank to deposit my money.
------------------------
went|VERB
	-> Sense2Vec: [('went|ADJ', np.float32(0.8834)), ('came|VERB', np.float32(0.8803)), ("wen't|VERB", np.float32(0.8792))]
	-> Word2Vec: [('came', 0.7141857743263245), ('ran', 0.671501874923706), ('gone', 0.6404926776885986)]
bank|NOUN
	-> Sense2Vec: [('local_bank|NOUN', np.float32(0.8859)), ('bank_account|NOUN', np.float32(0.8653)), ('same_bank|NOUN', np.float32(0.8536))]
	-> Word2Vec: [('banks', 0.7440759539604187), ('banking', 0.690161406993866), ('Bank', 0.6698698401451111)]
deposit|VERB
	-> Sense2Vec: [('depositing|VERB', np.float32(0.8951)), ('Deposit|VERB', np.float32(0.8719)), ('deposite|VERB', np.float32(0.7975))]
	-> Word2Vec: [('deposits', 0.8111314177513123), ('Deposit', 0.7686398029327393), ('Deposits', 0.6425759196281433)]
money|NOUN
	-> Sense2Vec: [('_money|NOUN', np.float32(0.9145)), ('even_more_money|NOUN', np.float32(0.892)), ('own_money|NOUN', np.float32(0.8899))]
	-> Wo

### Ejercicio 1.5

In [6]:
from sklearn.metrics.pairwise import cosine_similarity


# Sense vectors for the two readings of "watch" and one reference word each.
watch_noun, watch_verb = s2v["watch|NOUN"], s2v["watch|VERB"]
clock_noun, view_verb = s2v["clock|NOUN"], s2v["view|VERB"]

# Comparisons are grouped; a blank line is printed between groups.
comparison_groups = [
    [('watch_noun vs watch_verb', watch_noun, watch_verb)],
    [
        ('watch_noun vs clock_noun', watch_noun, clock_noun),
        ('watch_noun vs view_verb', watch_noun, view_verb),
    ],
    [
        ('watch_verb vs clock_noun', watch_verb, clock_noun),
        ('watch_verb vs view_verb', watch_verb, view_verb),
    ],
]

for group_idx, group in enumerate(comparison_groups):
    if group_idx > 0:
        print()
    for label, left_vec, right_vec in group:
        # cosine_similarity takes 2D inputs, hence the single-row wrapping.
        print(label, cosine_similarity([left_vec], [right_vec]))

watch_noun vs watch_verb [[0.47977123]]

watch_noun vs clock_noun [[0.5055921]]
watch_noun vs view_verb [[0.38052857]]

watch_verb vs clock_noun [[0.32959634]]
watch_verb vs view_verb [[0.43523753]]


## Ejercicio 2: Doc2Vec

### Ejercicio 2.1 y 2.2

In [7]:
from pprint import pprint

from datasets import load_dataset
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [8]:
# Load the English Wikipedia abstracts and keep the first 10,000 training rows.
dataset = load_dataset("laion/Wikipedia-Abstract", "English")
subset = dataset["train"].select(range(10000))

texts, titles = subset["Abstract"], subset["Title"]

# One TaggedDocument per abstract: the tokenized abstract is the word list and
# the article title is its single tag.
documents = [
    TaggedDocument(words=word_tokenize(abstract), tags=[doc_title])
    for abstract, doc_title in zip(texts, titles)
]



In [9]:
# Doc2Vec hyperparameters gathered in one place.
doc2vec_params = {"vector_size": 50, "epochs": 40, "seed": 42}

model = Doc2Vec(**doc2vec_params)
# Build the vocabulary from the tagged documents, then train for the
# configured number of epochs.
model.build_vocab(documents)
model.train(
    documents,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

### Ejercicio 2.3

In [10]:
# Article titles (document tags) whose nearest neighbours are inspected.
wiki_titles = [
    "Federico García Lorca",
    "Flag of Europe",
    "Super Mario 64",
]

# One loop replaces three copy-pasted blocks. The printed output is the same:
# a header per title, its ten most similar documents, and a blank line
# between consecutive titles.
for title_idx, wiki_1 in enumerate(wiki_titles):
    if title_idx > 0:
        print()
    print('-------------------------------------------------')
    print(wiki_1)
    print('-------------------------------------------------')
    pprint(model.dv.most_similar(positive=[wiki_1], topn=10))

-------------------------------------------------
Federico García Lorca
-------------------------------------------------
[('Mohammad-Hossein Shahriar', 0.7323083281517029),
 ('Jacques de Mahieu', 0.7190210223197937),
 ('Esoteric Neo-Nazism', 0.6896402835845947),
 ('Victor Leemans', 0.6675578355789185),
 ('El Cid', 0.6634325385093689),
 ("Ken'ichi Enomoto", 0.6590982675552368),
 ('Benedetto Gennari', 0.6524226665496826),
 ('Jan Philip van Thielen', 0.6470993757247925),
 ('Albrecht Adam', 0.6456478238105774),
 ('Giuseppe Mazzuoli (c. 1536 – 1589)', 0.6403161287307739)]

-------------------------------------------------
Flag of Europe
-------------------------------------------------
[('Flag of Bohol', 0.7047033309936523),
 ('Flag of the Federal Territories', 0.6650753617286682),
 ('Accession of Albania to the European Union', 0.6437662839889526),
 ('Euroscepticism', 0.628987193107605),
 ('Serbia Davis Cup team', 0.5973030924797058),
 ('Treaty of Prague (1973)', 0.5958016514778137),
 ('1

### Ejercicio 2.4

In [11]:
text = """Mycology is the branch of biology concerned with the study of fungi, including their taxonomy, genetics, biochemical properties, and use by humans.[1] Fungi can be a source of tinder, food, traditional medicine, as well as entheogens, poison, and infection. Yeasts are among the most heavily utilized members of the Kingdom Fungi, particularly in food manufacturing.[2]

Mycology branches into the field of phytopathology, the study of plant diseases. The two disciplines are closely related, because the vast majority of plant pathogens are fungi. A biologist specializing in mycology is called a mycologist
"""

# Tokenize the passage and infer a vector for it with the trained model.
passage_tokens = word_tokenize(text)
vector = model.infer_vector(passage_tokens)

print(vector.shape, vector)
print()

# Ten document tags whose vectors are most similar to the inferred one.
nearest_documents = model.dv.most_similar([vector], topn=10)
pprint(nearest_documents)

(50,) [-0.4026926   0.581282    0.40740594  0.0430668  -1.0033398  -1.4692134
 -0.43142968  1.0989631   0.42855978 -0.8597266  -0.29421526  0.8062698
  0.17859367  2.3086727  -0.964565   -0.36739975 -1.1685674   1.0733256
  0.6134671   1.4051841   1.7117182   0.18356495  0.32591483 -0.7381212
  0.53039116  0.12287422 -0.07131319 -0.68314594  0.160554    0.61600083
 -0.23626818  0.65588164 -0.66059977  2.33449     0.5977025  -0.07788242
  0.10233663 -0.6689951   0.6470793  -0.52845174  1.3158622   0.07683377
  2.1582968  -0.3269521   0.10931224 -1.3920735   0.61149037  0.13574648
  1.1888418  -1.7444515 ]

[('Phytosulfokine', 0.7384074330329895),
 ('Sulfolobaceae', 0.6606782674789429),
 ('Grebe', 0.6436593532562256),
 ('Iridoid', 0.6341902017593384),
 ('Callus (cell biology)', 0.6155226230621338),
 ('Lyngbya', 0.6137983202934265),
 ('Lightweight programming language', 0.5977791547775269),
 ('Outline of zoology', 0.5888161063194275),
 ('Loxoscelism', 0.585954487323761),
 ('Methanospirill

### Ejercicio 2.5

In [12]:
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show
from sklearn.decomposition import PCA


# Work with the first 500 documents of the subset.
selection_subset = subset.select(range(500))
texts, titles = list(selection_subset["Abstract"]), list(selection_subset["Title"])

# Infer a Doc2Vec vector for each abstract.
doc_vectors = [
    model.infer_vector(word_tokenize(text))
    for text in texts
]

# Project the document vectors to two dimensions for plotting.
pca = PCA(n_components=2)
vectors_2d = pca.fit_transform(doc_vectors)

source = ColumnDataSource({'x': vectors_2d[:, 0], 'y': vectors_2d[:, 1], 'title': titles})

# A title and axis labels let the figure stand on its own when skimmed.
p = figure(
    title="Doc2Vec document vectors (PCA projection to 2D)",
    x_axis_label="Principal component 1",
    y_axis_label="Principal component 2",
    tools="pan,wheel_zoom,reset",
    width=1700,
    height=1000,
)
p.scatter(x='x', y='y', size=10, source=source, fill_alpha=0.6)
# Label every point with its article title, offset slightly from the marker.
p.text(x='x', y='y', text='title', source=source, x_offset=5, y_offset=5)

show(p)