In [5]:
!pip install easydict
!pip install ijson

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ijson
  Downloading ijson-3.2.0.post0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (112 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.5/112.5 KB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ijson
Successfully installed ijson-3.2.0.post0


In [6]:
import pandas as pd
import numpy as np
from google.colab import drive
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
import gensim.downloader
import os
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.preprocessing import normalize
from sklearn import ensemble
import matplotlib.pyplot as plt
import random
import torch
import torch.nn as nn
from tqdm import tqdm
import math
import sys
import nltk
import pickle
import easydict
import requests
import ijson
import re

In [7]:
# Google Drive locations shared by the loader functions in this notebook.
path = "/content/drive/MyDrive/Master Thesis/"
reference_space_path = "/content/drive/MyDrive/Master Thesis/Reference spaces/"
splits_path = f"{path}splits/"

# (Re)mount Drive in the Colab runtime so the paths above can be read and written.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [8]:
def train_test_split(name, test_size, seed=None):
  """Split the key file ``{name}_keys.txt`` into a train and a test key file.

  Reads ``splits_path + "{name}_keys.txt"`` and writes ``train_{name}.txt``
  and ``test_{name}.txt`` into the same directory. Within each output file
  the lines keep the relative order they had in the input.

  Args:
    name: Prefix of the key file (the part before ``_keys.txt``).
    test_size: Fraction of the lines assigned to the test file. The number of
      test lines is ``int(len(lines) * test_size)``, i.e. rounded down.
    seed: Optional seed for a private random generator so that the split is
      reproducible. With ``None`` (default) the module-level ``random`` state
      is used.
  """
  with open(splits_path + f"{name}_keys.txt", 'r', encoding="utf-8") as f:
    lines = f.readlines()

  # A private generator keeps a seeded split from disturbing global random state.
  rng = random if seed is None else random.Random(seed)
  test_n = int(len(lines) * test_size)
  test_indices = set(rng.sample(range(len(lines)), test_n))

  train = [line for i, line in enumerate(lines) if i not in test_indices]
  test = [line for i, line in enumerate(lines) if i in test_indices]

  # Write with the same encoding the keys were read with, so non-ASCII keys
  # round-trip regardless of the platform's default encoding.
  with open(splits_path + f"train_{name}.txt", 'w', encoding="utf-8") as f:
    f.writelines(train)

  with open(splits_path + f"test_{name}.txt", 'w', encoding="utf-8") as f:
    f.writelines(test)

def load_keys(filename, split=False, nltk_filter=None):
    """Load lower-cased keys (one per line) from a text file.

    Args:
        filename: Name of the key file.
        split: When true the file is read from ``splits_path``, otherwise
            from ``path``.
        nltk_filter: Optional collection of NLTK part-of-speech tags. When
            given, each key is tagged on its own with ``nltk.pos_tag`` and
            only keys whose tag is in this collection are kept.

    Returns:
        A ``set`` of keys when ``nltk_filter`` is ``None``; otherwise a
        ``list`` of the keys that passed the filter, in file order.
    """
    source = splits_path + filename if split else path + filename
    # Key files are written and read as UTF-8 elsewhere in this notebook, so
    # decode explicitly rather than relying on the platform default.
    with open(source, "r", encoding="utf-8") as f:
        keys = [line.replace("\n", "").lower() for line in f]

    if nltk_filter is None:
        return set(keys)

    nltk.download('averaged_perceptron_tagger')
    filtered_keys = []
    for key in keys:
        token, tag = nltk.pos_tag([key])[0]
        if tag in nltk_filter:
            filtered_keys.append(token)
    return filtered_keys

# BigGraph

In [12]:
def load_biggraph(keysnames):
  """Build and save a BigGraph reference space for each key set.

  For every ``keysname`` whose output file
  ``biggraph_{keysname}_reference_space.npy`` does not exist yet, this:

  1. loads the wanted keys from ``biggraph_{keysname}_keys.txt`` in
     ``splits_path``;
  2. streams ``wikidata_translation_v1_names.json`` and records the index of
     the first entry matching each wanted key;
  3. takes the rows at those indices from
     ``wikidata_translation_v1_vectors.npy``, appends the key as a last
     column and saves the result to ``reference_space_path``;
  4. overwrites the key file with the keys actually found and calls
     ``train_test_split`` with a test fraction of 0.2.

  Args:
    keysnames: Iterable of key-set names (e.g. ``"20K"``, ``"names"``).
  """
  name = "biggraph"
  for keysname in keysnames:
    output_file = reference_space_path+f"{name}_{keysname}_reference_space.npy"
    if os.path.exists(output_file):
      print(output_file + " exists.")
      continue
    print("Loading model")
    # mmap_mode='r' maps the vector file read-only instead of loading it whole.
    biggraph = np.load(path+'wikidata_translation_v1_vectors.npy', mmap_mode='r')

    all_keys = load_keys(f"{name}_{keysname}_keys.txt", split=True)
    print(len(all_keys))

    aliasses = []
    seen = set()  # O(1) duplicate check; `aliasses` keeps discovery order
    idx = []
    print("Making reference space")
    # The context manager guarantees the names file is closed after streaming.
    with open(path+"wikidata_translation_v1_names.json", 'r', encoding="utf-8") as names_file:
      for i, o in enumerate(ijson.items(names_file, 'item')):
        word = o.lower().replace("\"", "")
        # Drop everything from the first "@" onwards
        # (presumably a language suffix such as "@en" - TODO confirm).
        word = re.sub(r"\@(.*)", "", word)
        if word in all_keys and word not in seen:
          seen.add(word)
          aliasses.append(word)
          idx.append(i)
    reference_space = biggraph[idx]
    print(f"Size: {len(aliasses)}")

    # The key strings become the last column of the saved array.
    reference_space = np.append(reference_space, np.array(aliasses).reshape(len(aliasses),1), axis=1)
    print("Writing file")
    with open(output_file, "wb") as f:
        np.save(f, reference_space)

    # train_test_split reads this file back as UTF-8, so write it as UTF-8.
    with open(splits_path + f"{name}_{keysname}_keys.txt", "w", encoding="utf-8") as f:
      f.write("".join(alias + "\n" for alias in aliasses))

    train_test_split(f"{name}_{keysname}", 0.2)

In [13]:
# Build the BigGraph reference space for each key set listed here.
load_biggraph([
    "20K",
    "names",
    "places",
    "20K_1_to_1_synsets",
    "20K_2_to_3_synsets",
    "20K_4_to_infinity_synsets",
])

/content/drive/MyDrive/Master Thesis/Reference spaces/biggraph_20K_reference_space.npy exists.
/content/drive/MyDrive/Master Thesis/Reference spaces/biggraph_names_reference_space.npy exists.
/content/drive/MyDrive/Master Thesis/Reference spaces/biggraph_places_reference_space.npy exists.
Loading model
2000
Making reference space
Size: 2000
Writing file
Loading model
2000
Making reference space
Size: 2000
Writing file
Loading model
2000
Making reference space
Size: 2000
Writing file


# Load GraphVite

In [14]:
def load_graphvite(name, keysnames):
  """Build and save a GraphVite reference space for each key set.

  For every ``keysname`` whose output file
  ``{name}_{keysname}_reference_space.npy`` does not exist yet, the keys from
  ``{name}_{keysname}_keys.txt`` are mapped through ``alias2entity`` and the
  model's ``entity2id`` to embedding rows. The embeddings, with the key
  appended as a last column, are saved to ``reference_space_path``; the key
  file is overwritten with the keys actually found and ``train_test_split``
  is called with a test fraction of 0.2.

  Args:
    name: Model name; the embeddings are read from ``{name}_wikidata5m.pkl``.
    keysnames: Iterable of key-set names (e.g. ``"20K"``, ``"names"``).
  """
  # Decide what needs building first, so the pickles are only loaded when
  # at least one output is missing.
  pending = []
  for keysname in keysnames:
    output_file = reference_space_path+f"{name}_{keysname}_reference_space.npy"
    if os.path.exists(output_file):
      print(output_file + " exists.")
    else:
      pending.append((keysname, output_file))
  if not pending:
    return

  # SECURITY NOTE: pickle.load can execute arbitrary code; only use these
  # files if they come from a trusted source.
  with open(path + "alias2entity.pickle", "rb") as fin:
    alias2entity = pickle.load(fin)

  embeddings_filename = f"{name}_wikidata5m.pkl"
  print("Loading model")
  with open(path + embeddings_filename, "rb") as fin:
      model = pickle.load(fin)

  # These lookups do not depend on the key set, so resolve them once.
  entity_embeddings = model.solver.entity_embeddings
  entity2id = model.graph.entity2id

  for keysname, output_file in pending:
    all_keys = load_keys(f"{name}_{keysname}_keys.txt", split=True)

    # Keep only keys that resolve to an entity the model has an embedding for.
    aliasses = [
        key for key in all_keys
        if (key in alias2entity) and (alias2entity[key] in entity2id)
    ]

    print(f"Making reference space, size: {len(aliasses)}")
    if not aliasses:
      # An empty embedding list cannot be combined with the key column below.
      print(f"No keys of {name}_{keysname} found in the model, skipping.")
      continue

    reference_space = np.array(
        [entity_embeddings[entity2id[alias2entity[alias]]] for alias in aliasses]
    )
    # The key strings become the last column of the saved array.
    reference_space = np.append(reference_space, np.array(aliasses).reshape(len(aliasses),1), axis=1)

    print("Saving files")
    with open(output_file, "wb") as f:
      np.save(f, reference_space)

    # train_test_split reads this file back as UTF-8, so write it as UTF-8.
    with open(splits_path + f"{name}_{keysname}_keys.txt", "w", encoding="utf-8") as f:
      f.write("".join(alias + "\n" for alias in aliasses))

    train_test_split(f"{name}_{keysname}", 0.2)

In [15]:
# Build reference spaces for both GraphVite models over the same key sets.
for graphvite_model in ("transe", "complex"):
  load_graphvite(graphvite_model, [
      "20K",
      "names",
      "places",
      "20K_1_to_1_synsets",
      "20K_2_to_3_synsets",
      "20K_4_to_infinity_synsets",
  ])

Loading model
/content/drive/MyDrive/Master Thesis/Reference spaces/transe_20K_reference_space.npy exists.
/content/drive/MyDrive/Master Thesis/Reference spaces/transe_names_reference_space.npy exists.
/content/drive/MyDrive/Master Thesis/Reference spaces/transe_places_reference_space.npy exists.
Making reference space, size: 2000
Saving files
Making reference space, size: 2000
Saving files
Making reference space, size: 2000
Saving files
Loading model
/content/drive/MyDrive/Master Thesis/Reference spaces/complex_20K_reference_space.npy exists.
/content/drive/MyDrive/Master Thesis/Reference spaces/complex_names_reference_space.npy exists.
/content/drive/MyDrive/Master Thesis/Reference spaces/complex_places_reference_space.npy exists.
Making reference space, size: 2000
Saving files
Making reference space, size: 2000
Saving files
Making reference space, size: 2000
Saving files


# Load KGvec2go

In [None]:
def load_kgvec2go(keysnames):
  """Fetch KGvec2go Wiktionary vectors for each key set and save them.

  For every ``keysname`` for which the embeddings file or the key file is
  missing, each key from ``{keysname}_keys.txt`` is requested from the
  kgvec2go REST endpoint. Keys for which a vector is returned are stored
  (vector plus the key as a last column) in
  ``path + "kgvec2go_{keysname}_embeddings.npy"``, the keys are written to
  ``splits_path + "kgvec2go_{keysname}_keys.txt"`` and ``train_test_split``
  is called with a test fraction of 0.2.

  NOTE(review): unlike the other loaders in this notebook, the output goes to
  ``path`` with an ``_embeddings.npy`` suffix rather than to
  ``reference_space_path`` - confirm this is intended.

  Args:
    keysnames: Iterable of key-set names (e.g. ``"20K"``).
  """
  request_timeout = 30  # seconds; keeps one stalled request from hanging the run

  for keysname in keysnames:
    reference_space_filename = path+f"kgvec2go_{keysname}_embeddings.npy"
    keys_filename = splits_path + f"kgvec2go_{keysname}_keys.txt"
    if os.path.exists(reference_space_filename) and os.path.exists(keys_filename):
      continue

    all_keys = load_keys(f"{keysname}_keys.txt", split=True)

    embeddings = []
    keys_used = []
    # load_keys returns a set here, which cannot be indexed by position;
    # iterate over it directly (sorted for a stable request order).
    for key in tqdm(sorted(all_keys)):
      URL = f"http://www.kgvec2go.org/rest/get-vector/wiktionary/{key}"
      try:
        r = requests.get(url = URL, timeout=request_timeout)
        vector = r.json()["vector"]
      except (requests.RequestException, ValueError, KeyError, TypeError):
        # Best effort: skip keys the service cannot serve (network error,
        # non-JSON reply, or a reply without a "vector" field).
        continue
      embeddings.append(vector)
      keys_used.append(key)

    if not keys_used:
      # An empty embedding list cannot be combined with the key column below.
      print(f"No vectors retrieved for {keysname}, skipping.")
      continue

    reference_space = np.array(embeddings)
    # The key strings become the last column of the saved array.
    reference_space = np.append(reference_space, np.array(keys_used).reshape(len(keys_used),1), axis=1)

    with open(reference_space_filename, "wb") as f:
      np.save(f, reference_space)

    # train_test_split reads this file back as UTF-8, so write it as UTF-8.
    with open(keys_filename, "w", encoding="utf-8") as f:
      f.write("".join(key + "\n" for key in keys_used))

    train_test_split(f"kgvec2go_{keysname}", 0.2)

# Generate synset keys based on 20K words



In [None]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
import random

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
def count_synsets(word):
  """Return the number of synsets that ``wn.synsets`` yields for ``word``."""
  synsets = wn.synsets(word)
  return len(synsets)

In [None]:
# NOTE(review): this repeats the count_synsets definition from the preceding
# cell; one of the two copies can be removed.
def count_synsets(word):
  """Count the synsets ``wn.synsets`` returns for ``word``."""
  matches = wn.synsets(word)
  return len(matches)

def synset_split(reference_space, splits=((1, 1), (2, 3), (4, sys.maxsize)), n_sample=2000):
  """Sample key sets grouped by how many synsets each key has.

  Loads the keys from ``{reference_space}_20K_keys.txt`` in ``splits_path``,
  counts the synsets of every key with ``count_synsets`` and, for each
  inclusive ``(start, end)`` range in ``splits``, draws ``n_sample`` random
  keys whose count lies in that range. Each sample is written to
  ``{reference_space}_20K_{start}_to_{end}_synsets_keys.txt`` (``end`` is
  spelled ``infinity`` when it equals ``sys.maxsize``) and then split with
  ``train_test_split`` using a test fraction of 0.2.

  Args:
    reference_space: Name prefix of the key file, e.g. ``"biggraph"``.
    splits: Iterable of inclusive ``(start, end)`` synset-count ranges.
    n_sample: Number of keys to draw for every range.

  Raises:
    ValueError: If a range contains fewer than ``n_sample`` keys. Nothing is
      written in that case.
  """
  all_keys = load_keys(f"{reference_space}_20K_keys.txt", split=True)

  # Look up each key's synset count once; the ranges below reuse it.
  synset_counts = {key: count_synsets(key) for key in all_keys}

  # Draw every sample before writing anything, so a range that is too small
  # does not leave a partial set of files behind.
  selected_keys = {}
  for start, end in splits:
    range_label = f"{start}_to_" + ("infinity" if end == sys.maxsize else str(end))
    candidates = [key for key, n_synsets in synset_counts.items() if start <= n_synsets <= end]
    if len(candidates) < n_sample:
      raise ValueError(
          f"Only {len(candidates)} keys have {range_label.replace('_', ' ')} synsets, "
          f"cannot sample {n_sample}."
      )
    selected_keys[range_label] = random.sample(candidates, n_sample)

  for range_label, keys_used in selected_keys.items():
    split_name = f"{reference_space}_20K_{range_label}_synsets"
    # train_test_split reads this file back as UTF-8, so write it as UTF-8.
    with open(splits_path + f"{split_name}_keys.txt", "w", encoding="utf-8") as f:
      f.write("".join(key + "\n" for key in keys_used))

    train_test_split(split_name, 0.2)

In [None]:
# Generate the synset-count key sets for every reference space.
for space_name in ("biggraph", "transe", "complex"):
  synset_split(space_name)