In [3]:
import os

from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import numpy as np

In [4]:
# Mount Google Drive inside the Colab runtime so that files under
# /content/drive (where the embedding files used below are stored) are readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Explore vecmap alignments

In [5]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine
from numpy.linalg import svd

def load_embeddings(file_path, source, target, base_path="/content/drive/MyDrive/analogy/s2orc/20200705v1/full/level1.abstract.api/MUSE/"):
  """Load the source and target embeddings of one community pair.

  Each file name is built by plain string concatenation,
  base_path + file_path + 'vectors-' + <community> + '.txt', so no path
  separator is inserted between the pieces: base_path and file_path must
  already end with one where a directory boundary is intended.

  Args:
    file_path: path fragment appended directly to base_path.
    source: community name used in the source file name.
    target: community name used in the target file name.
    base_path: directory prefix of the embedding files.

  Returns:
    (src_vectors, tgt_vectors) as loaded by
    KeyedVectors.load_word2vec_format in text (non-binary) mode.
  """
  def _read(community):
    # Same concatenation for both communities; only the name differs.
    return KeyedVectors.load_word2vec_format(base_path+file_path+'vectors-'+community+'.txt', binary=False)

  # Source is read before target.
  return _read(source), _read(target)

In [6]:
# Root folder of the dataset on the mounted Google Drive.
base_url = "/content/drive/MyDrive/analogy/s2orc/20200705v1/full"

# Folder holding one sub-directory of embedding files per community pair.
base_url_with_vecmap = base_url + "/level1.abstract.api/vecmap"

# "<src>-<tgt>" -> (sub-directory name, source community, target community).
# The loading loop below reads "vectors-<src>.txt" and "vectors-<tgt>.txt"
# from each listed sub-directory.
paths = {
    'hci-ling': ('participant.v6__hci--ling', 'hci', 'ling'),
    'hci-edu': ('participant.v6__hci--edu', 'hci', 'edu'),
    'ling-hci': ('participant.v6__ling--hci', 'ling', 'hci'),
    'ling-edu': ('participant.v6__ling--edu', 'ling', 'edu'),
    'ling-imm': ('participant.v6__ling--imm', 'ling', 'imm'),
    'edu-hci': ('participant.v6__edu--hci', 'edu', 'hci'),
    'edu-ling': ('participant.v6__edu--ling', 'edu', 'ling'),
    'hci-imm': ('participant.v6__hci--imm', 'hci', 'imm'),
    'hci-eth': ('participant.v6__hci--eth', 'hci', 'eth'),
    'hci-pro': ('participant.v6__hci--pro', 'hci', 'pro'),
    'ling-eth': ('participant.v6__ling--eth', 'ling', 'eth'),
    'ling-pro': ('participant.v6__ling--pro', 'ling', 'pro'),
    'edu-imm': ('participant.v6__edu--imm', 'edu', 'imm'),
    'edu-eth': ('participant.v6__edu--eth', 'edu', 'eth'),
    'edu-pro': ('participant.v6__edu--pro', 'edu', 'pro'),
}
# Every community name that appears as a source or target in `paths`.
community_names = ['hci', 'ling', 'edu', 'pro', 'eth', 'imm']
# Nested lookup filled below: embeds[src][tgt] -> {"src_emb": ..., "tgt_emb": ...}
embeds = {}
for pair, (path, src, tgt) in paths.items():
    print(f"loading {src}-{tgt}...")
    if src not in embeds:
        embeds[src] = {}
    # Register the (initially empty) slot before loading, then fill both sides.
    aligned = embeds[src][tgt] = {}

    for slot, community in (("src_emb", src), ("tgt_emb", tgt)):
        vec_file = os.path.join(base_url_with_vecmap, f"{path}/vectors-{community}.txt")
        aligned[slot] = KeyedVectors.load_word2vec_format(vec_file, binary=False)


loading hci-ling...
loading hci-edu...
loading ling-hci...
loading ling-edu...
loading ling-imm...
loading edu-hci...
loading edu-ling...
loading hci-imm...
loading hci-eth...
loading hci-pro...
loading ling-eth...
loading ling-pro...
loading edu-imm...
loading edu-eth...
loading edu-pro...


In [10]:
def word_most_similar_same_emb(emb, word, n=10):
  """Return the `n` nearest neighbours of `word` within a single embedding.

  The word is looked up verbatim first. If it is absent and contains a
  space, the lookup is retried with every space replaced by '__'.

  Args:
    emb: embedding supporting `in` and `most_similar(positive=..., topn=...)`.
    word: a single token or a space-separated phrase.
    n: number of neighbours to request.

  Returns:
    The result of `emb.most_similar`, or None when neither form of the
    word is present in `emb`.
  """
  key = word
  if key not in emb:
    if ' ' not in word:
      return None
    # Retry with the '__'-joined form of the phrase.
    key = word.replace(' ', '__')
    if key not in emb:
      return None
  return emb.most_similar(positive=[key], topn=n)

def src_word_most_similar_in_tgt(src_emb, tgt_emb, src_word, n=10):
  """Find the entries of `tgt_emb` closest to a word taken from `src_emb`.

  The vector of `src_word` is read from `src_emb` and passed to
  `tgt_emb.most_similar`. The word is looked up verbatim first; if it is
  absent and contains a space, the lookup is retried with every space
  replaced by '__'.

  Args:
    src_emb: embedding supporting `in` and `[]` lookup by word.
    tgt_emb: embedding exposing `most_similar(positive=..., topn=...)`.
    src_word: a single token or a space-separated phrase.
    n: number of neighbours to request.

  Returns:
    The result of `tgt_emb.most_similar`, or None when neither form of the
    word is present in `src_emb`.
  """
  candidates = [src_word]
  if ' ' in src_word:
    candidates.append(src_word.replace(' ', '__'))

  for key in candidates:
    if key in src_emb:
      # Index with the key that was actually found. The phrase branch used to
      # index with the raw `src_word`, which is known to be absent at that point.
      return tgt_emb.most_similar(positive=[ src_emb[key] ], topn=n)
  return None

def src_word_rank_sim_in_tgt(src_emb, tgt_emb, src_word, topn=100000):
  """Locate `src_word` among the target-space neighbours of its own source vector.

  The vector of `src_word` is read from `src_emb`, the `topn` most similar
  entries of `tgt_emb` are requested, and that list is scanned for an entry
  whose word equals `src_word`.

  Args:
    src_emb: embedding supporting `in` and `[]` lookup by word.
    tgt_emb: embedding exposing `most_similar(positive=..., topn=...)`.
    src_word: word to look up; it is matched verbatim.
    topn: how many neighbours to search (defaults to the previously
      hard-coded 100000).

  Returns:
    (rank, sim), where `rank` is the 0-based position of the word in the
    neighbour list and `sim` the similarity reported for it; (None, None)
    when the word is absent from `src_emb` or not found within `topn`.
  """
  if src_word not in src_emb:
    return None, None

  neighbours = tgt_emb.most_similar(positive=[ src_emb[src_word] ], topn=topn)
  for rank, (tgt_word, sim) in enumerate(neighbours):
    if tgt_word == src_word:
      return rank, sim
  # match not found in topn
  return None, None

In [11]:
# Select which loaded pair to inspect; embeds[src][tgt] must have been loaded above.
src_community = "hci"  # choose one of: hci, edu, ling
tgt_community = "eth"  # choose one of: hci, edu, ling, pro, eth, imm

aligned_pair = embeds[src_community][tgt_community]
src_embed = aligned_pair["src_emb"]
tgt_embed = aligned_pair["tgt_emb"]

# Query word from the source community; '__' joins the tokens of a phrase.
word = "AI-mediated__communication"
cross_sim_list = src_word_most_similar_in_tgt(src_embed, tgt_embed, word)
cross_sim_list
# TODO: Make it easy to hook up with db
# if cross_sim_list is None:
#     pass
# else:
#     cross_res = []
#     for w, sim in cross_sim_list:
#         ctx_list = get_ctx_by_word(cur, w, tgt)
#         cross_res.append({'word': w, 'sim': sim, 'ctx': process_ctx_list(ctx_list)})

[('wider', 0.8147139549255371),
 ('Chagrin', 0.8083021640777588),
 ('Falls', 0.8082398772239685),
 ('landscape', 0.7854062914848328),
 ('landmark', 0.7750949263572693),
 ('agenda', 0.7665138244628906),
 ('BAM', 0.7627230882644653),
 ('scientists', 0.7586003541946411),
 ('institutional', 0.7544519901275635),
 ('higher__education', 0.7536154985427856)]