In [1]:
%%capture
import pandas as pd
import numpy as np
%pip install ijson
import ijson
import gensim.downloader       
import random
from sklearn.decomposition import PCA
from scipy.linalg import orthogonal_procrustes
%pip install transformers
from transformers import AutoTokenizer, GPT2Model
import torch
%pip install sentence-transformers
from sentence_transformers import SentenceTransformer
import torch
import transformers
from transformers import BertTokenizer, BertModel
from sklearn.decomposition import PCA
from google.colab import drive
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import matplotlib.pyplot as plt
import os
from sklearn.linear_model import LinearRegression
import math

In [2]:
# Set to True to mount Google Drive (Colab) and read/write everything below it;
# with False all paths are relative to the current working directory.
ISDRIVE = False
path = ""
if ISDRIVE:
    drive.mount('/content/drive', force_remount=True)
    path = "/content/drive/MyDrive/Master Thesis/"

# Directory layout, every entry is rooted at `path`.
log_path = f"{path}Logs/"
splits_path = f"{path}splits/"
reference_space_path = f"{path}Reference spaces/"
pca_embeddings_path = f"{path}Generation of embeddings + experiments/Data/PCA embeddings/"
cache_path = f"{path}cache/"

# Log line format: fields are joined with `splitter`, key and value with `splitter2`.
splitter = "; "
splitter2 = ": "

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Mounted at /content/drive


In [3]:
def reset_log(log_filename):
  """Create `log_filename` if it is missing and leave it empty."""
  # Opening in "w" mode already truncates the file; nothing has to be written.
  with open(log_filename, "w"):
    pass

def write_line(obj, log_filename):
  """Append one record to the log file.

  Each `key`/`value` pair of the mapping `obj` is rendered as
  `key + splitter2 + value`; the pairs are joined with `splitter` and the
  line is terminated with a newline.
  """
  fields = [f"{key}{splitter2}{value}" for key, value in obj.items()]
  record = splitter.join(fields) + "\n"
  with open(log_filename, 'a') as log_file:
    log_file.write(record)

def exists_in_log(models, ks, log_filename):
  """Tell whether the log already holds the results for `models` at `ks`.

  The log is expected to contain lines written by `write_line`: four
  `key: value` fields separated by `splitter`, including a `model` and a
  `k` field.

  Returns True only when every model in `models` appears in the log and the
  set of `k` values logged for those models equals `set(ks)`. A model that
  never gets logged (for example because it is skipped) therefore keeps the
  result at False.

  Returns False when `log_filename` does not exist. Raises AssertionError on
  a non-blank line that does not follow the expected format.
  """
  # A log that was never created cannot contain any result; without this
  # guard the first run with append=True dies with FileNotFoundError.
  if not os.path.exists(log_filename):
    return False

  with open(log_filename, "r") as f:
    content = f.readlines()

  models_target = set()
  ks_target = set()
  for line in content:
    line = line.replace("\n", "")
    # Tolerate blank lines (e.g. from manual edits) instead of asserting on them.
    if not line.strip():
      continue
    fields = line.split(splitter)
    assert len(fields) == 4, f"Malformed log line: {line!r}"
    mapper = {}
    for field in fields:
      # maxsplit=1 keeps a value intact even if it contains the separator.
      parts = field.split(splitter2, 1)
      assert len(parts) == 2, f"Malformed log field: {field!r}"
      key, value = parts
      mapper[key] = value
    assert "model" in mapper and "k" in mapper
    if mapper["model"] in models:
      models_target.add(mapper["model"])
      ks_target.add(int(mapper["k"]))

  return set(models) == models_target and set(ks) == ks_target

In [12]:
def load_keys(filename):
    """Read the split file `splits_path + filename` and return its lines as a list.

    Newline characters are removed from every line; nothing else is stripped.
    """
    with open(splits_path+filename, "r") as split_file:
        return [raw_line.replace("\n", "") for raw_line in split_file.readlines()]

def load_lm_embeddings(model, keys, reference_space_key, key_type_key):
  """Load the PCA embeddings of `model` restricted to `keys`.

  Reads `{model}_{reference_space_key}_{key_type_key}_pca_embeddings.pkl`
  from `pca_embeddings_path`. Each row of the stored table is read as
  (key, vector): column 0 is the key, column 1 the embedding (a list or an
  array). "[CLS] " prefixes and " [SEP]" suffixes are removed from the key
  before it is matched against `keys`.

  Returns a DataFrame indexed by key with one embedding per row. Only the
  first occurrence of a key is kept; rows keep the order of the stored table.
  """
  # NOTE(review): allow_pickle=True unpickles the file, which can execute
  # arbitrary code - only load embedding files from a trusted source.
  embeddings = np.load(pca_embeddings_path + f"{model}_{reference_space_key}_{key_type_key}_pca_embeddings.pkl", allow_pickle=True).to_numpy()

  # Set membership is O(1); testing against the key list for every row made
  # the loop quadratic in the number of keys.
  wanted = set(keys)
  rows = []
  indexes = []
  added = set()
  for row in embeddings:
    # replace() is a no-op when the marker is absent, so no guard is needed.
    key = row[0].replace("[CLS] ", "").replace(" [SEP]", "")
    if key in wanted and key not in added:
      rows.append(np.asarray(row[1], dtype=float))
      indexes.append(key)
      added.add(key)

  return pd.DataFrame(np.array(rows), index=np.array(indexes))

def load_embeddings(model, filename, reference_space_key, key_type_key, clear_cache=False):
  """Return the embeddings of `model` for the keys listed in split file `filename`.

  The result is a DataFrame sorted by index. Unless `clear_cache` is set, a
  cached frame is returned when one exists; otherwise the frame is rebuilt
  and written to the cache. Raises Exception("Unknown model") for a model
  name outside the supported list.
  """
  supported_models = (
    "BERT-Tiny", "BERT-Mini", "BERT-Small", "BERT-Medium", "BERT-Base",
    'pythia-70m', 'pythia-160m', 'pythia-410m', 'pythia-1b', 'pythia-2.8b', 'pythia-6.9b',
    "XLNet-base", "XLNet-large",
    "gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl", "ada-002",
    "t5-small", "t5-base", "t5-large", "t5-3b",
    "opt-125m", "opt-350m", "opt-1.3b", "opt-2.7b", "opt-6.7b",
  )

  # The cache file name is derived from the split file name.
  cache_key = filename.replace(".txt", f"{model}_{reference_space_key}_{key_type_key}_embeddings_procrustes_cache.pkl")
  if not clear_cache:
    cached = try_load_df_cache(cache_key)
    if cached is not None:
      return cached

  keys = load_keys(filename)
  if model in supported_models:
    embeddings = load_lm_embeddings(model, keys, reference_space_key, key_type_key)
  else:
    embeddings = None
  if embeddings is None:
    raise Exception("Unknown model")

  sorted_embeddings = embeddings.sort_index()
  write_to_df_cache(sorted_embeddings, cache_key)
  return sorted_embeddings

def try_load_df_cache(filename):
  """Return the pickled DataFrame stored at `cache_path + filename`, or None.

  None is returned when the file does not exist. `filename` must contain
  ".pkl", otherwise an AssertionError is raised.
  """
  assert(".pkl" in filename)
  cached_file = cache_path + filename
  return pd.read_pickle(cached_file) if os.path.exists(cached_file) else None

def write_to_df_cache(df, filename):
  """Pickle `df` to `cache_path + filename`.

  The directory part of the target path is created first when it is
  missing, so the first run on a fresh checkout or runtime does not fail.
  """
  target = cache_path + filename
  parent = os.path.dirname(target)
  # dirname is "" when the target sits in the working directory; makedirs("") would raise.
  if parent:
    os.makedirs(parent, exist_ok=True)
  df.to_pickle(target)

def load_reference_space(filename, reference_space_key, key_type_key, clear_cache=False):
  """Return the reference-space vectors for the keys listed in split file `filename`.

  Reads `{reference_space_key}_{key_type_key}_reference_space.npy` from
  `reference_space_path`. In every row of that array the last column is
  used as the key and the preceding columns, converted to float, as the
  vector. Only the first occurrence of a key is kept.

  The result is a DataFrame indexed by key and sorted by index. Unless
  `clear_cache` is set, a cached frame is returned when one exists;
  otherwise the frame is rebuilt and written to the cache.
  """
  cache_key = filename.replace(".txt", f"{reference_space_key}_{key_type_key}_reference_space_procrustes_cache.pkl")
  if not clear_cache:
    cache = try_load_df_cache(cache_key)
    if cache is not None:
      return cache

  # Set membership is O(1); testing against the key list for every row made
  # the loop quadratic in the number of keys.
  wanted = set(load_keys(filename))
  reference_space_filename = f'{reference_space_key}_{key_type_key}_reference_space.npy'
  X = np.load(reference_space_path + reference_space_filename, mmap_mode='r')

  rows = []
  indexes = []
  added = set()
  key_col = X.shape[1] - 1
  for row in X:
    key = row[key_col]
    if key in wanted and key not in added:
      rows.append(np.array(row[:key_col].astype(float)))
      indexes.append(key)
      added.add(key)

  df = pd.DataFrame(np.array(rows), index=np.array(indexes))
  df = df.sort_index()
  write_to_df_cache(df, cache_key)
  return df

def cosine(a,b):
  """Cosine similarity of `a` and `b`: their dot product over the product of their norms."""
  norm_product = np.linalg.norm(a) * np.linalg.norm(b)
  return np.dot(a, b) / norm_product

def precision_at_k_analysis(source, target, k):
  """Compute precision@k of `source` rows against `target` rows.

  For every row label of `source`, the `k` nearest rows of `target` are
  retrieved and the row counts as a hit when a target row with the same
  label is among them. This is done once with cosine similarity (highest
  first) and once with Euclidean distance (lowest first).

  `source` and `target` are DataFrames; matching is done on their index
  labels. Returns the tuple (cosine precision, Euclidean precision).
  """
  def _hit_rate(scores, ascending):
    # Fraction of source labels found among the k best-scoring target labels.
    hits = []
    for key in scores.index:
      top_k = scores.loc[key].sort_values(ascending=ascending)[:k]
      hits.append(1 if key in top_k.index else 0)
    return sum(hits) / len(hits)

  # Column j of both matrices belongs to target row j, so the columns carry
  # target's labels; labelling them with source.index is only right when the
  # two indexes are identical and fails outright when their sizes differ.
  distances_euclidian = pd.DataFrame(euclidean_distances(source, target), index=source.index, columns=target.index)
  precision_euclidian = _hit_rate(distances_euclidian, ascending=True)

  distances_cosine = pd.DataFrame(cosine_similarity(source.values, target.values), index=source.index, columns=target.index)
  precision_cosine = _hit_rate(distances_cosine, ascending=False)

  return precision_cosine, precision_euclidian

def translate(df):
  """Centre the columns of `df` and scale the whole matrix to unit norm.

  The column means are subtracted and the result is divided by
  `np.linalg.norm` of the centred matrix. The normalised values are written
  back into `df` (callers in this notebook discard the return value and
  rely on that) and are also returned as a new DataFrame with the same
  index.
  """
  # Compute on an explicit float copy. The array handed out by to_numpy()
  # may be read-only (pandas Copy-on-Write), which makes in-place arithmetic
  # raise, or may be a copy, which would leave `df` silently untouched.
  normalized = df.to_numpy(dtype=float, copy=True)
  normalized -= normalized.mean(axis=0)
  normalized /= np.linalg.norm(normalized)

  # Update the caller's frame explicitly instead of through a shared buffer.
  df.iloc[:, :] = normalized
  return pd.DataFrame(normalized, index=df.index)

def transform(df, R, sca):
  """Map `df` through the matrix `R` and multiply the result by the scalar `sca`."""
  rotated = df.dot(R)
  return rotated * sca

def cosine_test(a,b):
  """Cosine similarity of `a` and `b` (same formula as `cosine`)."""
  numerator = np.dot(a, b)
  denominator = np.linalg.norm(a)*np.linalg.norm(b)
  return numerator/denominator


# Experiments

In [13]:
# Model families; each list is processed as one group by the experiment loop.
bert_models = [
    "BERT-Tiny",
    "BERT-Mini",
    "BERT-Small",
    "BERT-Medium",
    "BERT-Base",
]
pynthia_models = [
    'pythia-70m',
    'pythia-160m',
    'pythia-410m',
    'pythia-1b',
    'pythia-2.8b',
    'pythia-6.9b',
]
t5_models = ["t5-small", "t5-base", "t5-large", "t5-3b"]
opt_models = ["opt-125m", "opt-350m", "opt-1.3b", "opt-2.7b", "opt-6.7b"]
gpt_models = ['gpt2', 'gpt2-medium', 'gpt2-large', "gpt2-xl", "ada-002"]
XLNet_models = ["XLNet-base", "XLNet-large"]

# Groups that are actually run. t5_models and XLNet_models are defined above
# but not part of the run; to run a single family, use e.g. [pynthia_models].
all_models = [bert_models, opt_models, gpt_models, pynthia_models]

# Values of k for which precision@k is reported.
ks = [1, 10, 20, 50]

In [14]:
def run_experiment(models, train_filename, test_filename, reference_space_key, key_type_key, log_filename):
  """Fit and evaluate a Procrustes mapping for every model in `models`.

  Nothing is computed when `exists_in_log` reports that the log already
  covers `models` for the global `ks`. Otherwise, for each model whose PCA
  embedding file exists:

  1. train/test embeddings and reference-space vectors are loaded and
     normalised with `translate`;
  2. `orthogonal_procrustes` is fitted on the training pair;
  3. the test embeddings are mapped with `transform` and precision@k
     (cosine and Euclidean) is appended to `log_filename` for every k in `ks`.

  Models without an embedding file are skipped with a message.
  """
  if exists_in_log(models, ks, log_filename):
    print(f"Already exists in log: models: {reference_space_key}, {key_type_key}, {models}")
    return

  print(f"Experiment for: models: {models}")
  print("Loading referfence space")
  # NOTE(review): with clear_cache=True the cached frames are never read,
  # only rewritten - confirm this is intended before switching it off.
  clear_cache = True
  y_train = load_reference_space(train_filename, reference_space_key, key_type_key, clear_cache=clear_cache)
  y_test = load_reference_space(test_filename, reference_space_key, key_type_key, clear_cache=clear_cache)

  # Normalise the reference space once, and use translate's return value
  # rather than depending on it modifying its argument. It used to be
  # re-normalised on every model iteration.
  y_train = translate(y_train)
  y_test = translate(y_test)

  for model in models:
    lm_filepath = pca_embeddings_path + f"{model}_{reference_space_key}_{key_type_key}_pca_embeddings.pkl"
    if not os.path.exists(lm_filepath):
      print(f"Skipping {model}")
      continue

    print(f"Running model: {model}")
    print("Loading embeddings")
    X_train = load_embeddings(model, train_filename, reference_space_key, key_type_key, clear_cache=clear_cache)
    X_test = load_embeddings(model, test_filename, reference_space_key, key_type_key, clear_cache=clear_cache)
    print(f"Train size: {len(X_train.index.to_list())}, test_size: {len(X_test.index.to_list())}")
    # Duplicate keys would make the label lookups in the evaluation ambiguous.
    assert(len(set(X_train.index.to_list())) == len(X_train.index.to_list()))
    assert(len(set(X_test.index.to_list())) == len(X_test.index.to_list()))

    print("Doing procrustes")
    X_train = translate(X_train)
    print(X_train.shape, y_train.shape)
    # Rows are paired by position; both frames come back sorted by key.
    R, sca = orthogonal_procrustes(X_train, y_train)

    X_test = translate(X_test)
    y_test_pred = transform(X_test, R, sca)

    print("Evaluating")
    for k in ks:
      test_acc_cosine, test_acc_euclidian = precision_at_k_analysis(y_test_pred, y_test, k)
      write_line({"model": model, "k": k, "test P@K cosine": test_acc_cosine, "test P@K euclidian": test_acc_euclidian}, log_filename)

In [15]:
def run_all(reference_space_key, key_type_key, append=False):
  """Run the experiment for every model group of one reference space / key type.

  Results go to `log_procrustes_{reference_space_key}_{key_type_key}.txt`
  in `log_path`. With `append=False` an existing log is treated as a
  finished experiment and nothing is run. With `append=True` an existing
  log is kept and only the model groups not yet covered by it are run.
  """
  log_filename = log_path + f"log_procrustes_{reference_space_key}_{key_type_key}.txt"
  test_filename = f"test_{reference_space_key}_{key_type_key}.txt"
  train_filename = f"train_{reference_space_key}_{key_type_key}.txt"

  if not append and os.path.exists(log_filename):
    print(f"Experiment data exists: {reference_space_key}, {key_type_key}")
    return

  # Make sure the log exists before it is read back. This also covers
  # append=True on a first run, where the missing file used to surface as a
  # FileNotFoundError while checking the log. An existing log is never
  # truncated here: without append we returned above if it existed.
  if not os.path.exists(log_filename):
    reset_log(log_filename)

  for models in all_models:
    run_experiment(models, train_filename, test_filename, reference_space_key, key_type_key, log_filename)

In [16]:
reference_spaces = ["biggraph", "transe", "complex"]
key_type_keys = ["20K", "places", "names", "20K_1_to_1_synsets", "20K_2_to_3_synsets", "20K_4_to_infinity_synsets"]

# Run every (reference space, key type) combination, appending to existing logs.
for reference_space in reference_spaces:
  for key_type_key in key_type_keys:
    try:
      run_all(reference_space, key_type_key, append=True)
    except Exception:
      # Show which combination failed, then re-raise; a bare `raise` keeps
      # the original traceback without adding this frame a second time.
      print(key_type_key, reference_space)
      raise

Experiment for: models: ['BERT-Tiny', 'BERT-Mini', 'BERT-Small', 'BERT-Medium', 'BERT-Base']
Loading referfence space
Skipping BERT-Tiny
Running model: BERT-Mini
Loading embeddings
Train size: 11711, test_size: 2927
Doing procrustes
(11711, 200) (11711, 200)
Evaluating
Running model: BERT-Small
Loading embeddings
Train size: 11711, test_size: 2927
Doing procrustes
(11711, 200) (11711, 200)
Evaluating
Running model: BERT-Medium
Loading embeddings
Train size: 11711, test_size: 2927
Doing procrustes
(11711, 200) (11711, 200)
Evaluating
Running model: BERT-Base
Loading embeddings
Train size: 11711, test_size: 2927
Doing procrustes
(11711, 200) (11711, 200)
Evaluating
Already exists in log: models: biggraph, 20K, ['opt-125m', 'opt-350m', 'opt-1.3b', 'opt-2.7b', 'opt-6.7b']
Already exists in log: models: biggraph, 20K, ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl', 'ada-002']
Already exists in log: models: biggraph, 20K, ['pythia-70m', 'pythia-160m', 'pythia-410m', 'pythia-1b', 'pythia-2.8