# Set-up

In [None]:
%%capture
!pip install transformers
!pip install -U sentence-transformers
!pip install openai

In [None]:
import torch

import transformers
from transformers import BertTokenizer, BertModel
from transformers import GPT2Tokenizer, GPT2Model
from transformers import XLNetTokenizer, XLNetModel
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Model
from transformers import AutoTokenizer, AutoModel
from transformers import GPTNeoXForCausalLM, AutoTokenizer # Pythia

from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
import openai
import time
import os
from torch.nn.utils.rnn import pad_sequence
from sklearn.decomposition import PCA

In [None]:
# Set ISDRIVE to True when running on Google Colab with the data on Google Drive.
# local_path is the prefix that the cells below prepend to their data paths.
ISDRIVE=False
local_path = ""
if ISDRIVE:
    # `drive` is Colab-only and was never imported anywhere in the notebook, so a
    # fresh kernel raised NameError here. Import it only when it is needed.
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    local_path = "/content/drive/MyDrive/Master Thesis/"

Mounted at /content/drive


In [None]:
# Hugging Face checkpoint identifiers, grouped by model family.
# Within each family the checkpoints are listed in the order the embedding
# functions below pair them with their display names.
models = {
    "BERT": [
        "google/bert_uncased_L-2_H-128_A-2",
        "google/bert_uncased_L-4_H-256_A-4",
        "google/bert_uncased_L-4_H-512_A-8",
        "google/bert_uncased_L-8_H-512_A-8",
        "google/bert_uncased_L-12_H-768_A-12",
    ],
    "GPT-2": [
        "gpt2",
        "gpt2-medium",
        "gpt2-large",
        "gpt2-xl",
    ],
    "OPT": [
        "facebook/opt-125m",
        "facebook/opt-350m",
        "facebook/opt-1.3b",
        "facebook/opt-2.7b",
        "facebook/opt-6.7b",
    ],
    "XLNet": [
        "xlnet-base-cased",
        "xlnet-large-cased",
    ],
    "Pythia": [
        "EleutherAI/pythia-70m",
        "EleutherAI/pythia-160m",
        "EleutherAI/pythia-410m",
        "EleutherAI/pythia-1b",
        "EleutherAI/pythia-2.8b",
        "EleutherAI/pythia-6.9b",
    ],
}
                      
def load_keys(filename):
    """Read a key file from the "splits/" folder under `local_path`.

    Returns a list with one string per line of the file: the line with all
    newline characters removed and converted to lower case.
    """
    keys_file = local_path + "splits/" + filename
    with open(keys_file, "r") as handle:
        raw_lines = handle.readlines()
    return [raw.replace("\n", "").lower() for raw in raw_lines]

# BERT

In [None]:
def bert(PATH, CORPUS, reference_space_key, key_type_key, analogy = False):
  """Embed every entry of CORPUS with each BERT checkpoint in models['BERT'].

  For each checkpoint the entries are tokenized in batches of 128 (special
  tokens included), the last hidden state is mean-pooled per entry, and a
  DataFrame with the columns 'key' and 'embedding' is pickled to
  PATH + file name. A checkpoint whose pickle already exists is skipped.

  Padding positions are excluded from the mean, so the vector of an entry no
  longer depends on the length of the other entries in its batch. Pickles
  written before this change averaged over padding as well; delete them if
  old and new files have to be comparable.

  Args:
    PATH: Output prefix; concatenated directly with the file name.
    CORPUS: List of strings to embed.
    reference_space_key: Part of the file name when `analogy` is False.
    key_type_key: Part of the file name when `analogy` is False.
    analogy: If True the file is named "<name>_analogy_embeddings.pkl".
  """
  names = ['BERT-Tiny', 'BERT-Mini', 'BERT-Small', 'BERT-Medium', 'BERT-Base']
  batch_size = 128
  # Fall back to the CPU so the cell still runs (slowly) on a machine without a GPU.
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  for model_name, name in zip(models['BERT'], names):
    if analogy:
      embedding_name = f"{name}_analogy_embeddings.pkl"
    else:
      embedding_name = f"{name}_{reference_space_key}_{key_type_key}_embeddings.pkl"

    if os.path.exists(PATH + embedding_name):
      print(PATH + embedding_name + " already exists.")
      continue
    print(f"Creating embeddings for model: {name}")
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
    model.to(device)
    model.eval()

    # The embedding width is taken from the model output, so no hand-maintained
    # list of hidden sizes has to stay in sync with the checkpoint list.
    pooled_batches = []
    for i in range(0, len(CORPUS), batch_size):
      batch = CORPUS[i:i+batch_size]
      tokens = tokenizer(batch, padding=True, return_tensors='pt', add_special_tokens=True).to(device)

      with torch.no_grad():
        last_hidden_state = model(**tokens).last_hidden_state
        # attention_mask is 1 for real tokens and 0 for padding: zero out the
        # padded positions and divide by the number of real tokens per entry.
        mask = tokens['attention_mask'].unsqueeze(-1).to(last_hidden_state.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        avg_embedding = (last_hidden_state * mask).sum(dim=1) / token_counts

      pooled_batches.append(avg_embedding.cpu())

    if pooled_batches:
      embeddings = torch.cat(pooled_batches).numpy()
    else:
      # Empty corpus: still write a well-formed (empty) frame.
      embeddings = np.zeros((0, 0), dtype=np.float32)

    result = pd.DataFrame(list(zip(CORPUS, embeddings)), columns=['key', 'embedding'])
    result.to_pickle(PATH + embedding_name)

## GPT-2

In [None]:
def gpt(PATH, CORPUS, reference_space_key, key_type_key, analogy=False):
  """Embed every entry of CORPUS with each GPT-2 checkpoint in models['GPT-2'].

  For each checkpoint the entries are tokenized in batches of 128 (the EOS
  token doubles as padding token), the last hidden state is mean-pooled per
  entry, and a DataFrame with the columns 'key' and 'embedding' is pickled to
  PATH + file name. A checkpoint whose pickle already exists is skipped.

  Padding positions are excluded from the mean, so the vector of an entry no
  longer depends on the length of the other entries in its batch. Pickles
  written before this change averaged over padding as well; delete them if
  old and new files have to be comparable.

  Args:
    PATH: Output prefix; concatenated directly with the file name.
    CORPUS: List of strings to embed.
    reference_space_key: Part of the file name when `analogy` is False.
    key_type_key: Part of the file name when `analogy` is False.
    analogy: If True the file is named "<name>_analogy_embeddings.pkl".
  """
  names = ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl']
  batch_size = 128
  # Fall back to the CPU so the cell still runs (slowly) on a machine without a GPU.
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  for model_name, name in zip(models['GPT-2'], names):
    if analogy:
      embedding_name = f"{name}_analogy_embeddings.pkl"
    else:
      embedding_name = f"{name}_{reference_space_key}_{key_type_key}_embeddings.pkl"
    if os.path.exists(PATH + embedding_name):
      print(PATH + embedding_name + " already exists.")
      continue
    print("Generating embeddings for: ", name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # GPT-2 ships without a padding token; reuse EOS so batches can be padded.
    tokenizer.pad_token = tokenizer.eos_token
    model = GPT2Model.from_pretrained(model_name)
    model.to(device)
    model.eval()

    # The embedding width is taken from the model output, so no hand-maintained
    # list of hidden sizes has to stay in sync with the checkpoint list.
    pooled_batches = []
    for i in range(0, len(CORPUS), batch_size):
      batch = CORPUS[i:i+batch_size]
      tokens = tokenizer(batch, padding=True, truncation=True, return_tensors='pt').to(device)

      with torch.no_grad():
        last_hidden_state = model(**tokens).last_hidden_state
        # attention_mask is 1 for real tokens and 0 for padding: zero out the
        # padded positions and divide by the number of real tokens per entry.
        mask = tokens['attention_mask'].unsqueeze(-1).to(last_hidden_state.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        avg_embedding = (last_hidden_state * mask).sum(dim=1) / token_counts

      pooled_batches.append(avg_embedding.detach().cpu().numpy())

    if pooled_batches:
      # float64 matches the dtype of the buffer the pickles were written from before.
      embeddings = np.concatenate(pooled_batches).astype(np.float64)
    else:
      # Empty corpus: still write a well-formed (empty) frame.
      embeddings = np.zeros((0, 0))

    result = pd.DataFrame(list(zip(CORPUS, embeddings)), columns=['key', 'embedding'])
    result.to_pickle(PATH + embedding_name)

## Facebook/OPT

In [None]:
def opt(PATH, CORPUS, reference_space_key, key_type_key, analogy=False):
  """Embed every entry of CORPUS with each OPT checkpoint in models['OPT'].

  For each checkpoint the entries are tokenized in batches of 128, the last
  hidden state is mean-pooled per entry, and a DataFrame with the columns
  'key' and 'embedding' is pickled to PATH + file name. A checkpoint whose
  pickle already exists is skipped.

  Padding positions are excluded from the mean, so the vector of an entry no
  longer depends on the length of the other entries in its batch. Pickles
  written before this change averaged over padding as well; delete them if
  old and new files have to be comparable.

  Args:
    PATH: Output prefix; concatenated directly with the file name.
    CORPUS: List of strings to embed.
    reference_space_key: Part of the file name when `analogy` is False.
    key_type_key: Part of the file name when `analogy` is False.
    analogy: If True the file is named "<name>_analogy_embeddings.pkl".
  """
  names = ['opt-125m', 'opt-350m', 'opt-1.3b', 'opt-2.7b', 'opt-6.7b']
  batch_size = 128
  # Fall back to the CPU so the cell still runs (slowly) on a machine without a GPU.
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  for model_name, name in zip(models['OPT'], names):
    if analogy:
      embedding_name = f"{name}_analogy_embeddings.pkl"
    else:
      embedding_name = f"{name}_{reference_space_key}_{key_type_key}_embeddings.pkl"
    if os.path.exists(PATH + embedding_name):
      print(PATH + embedding_name + " already exists.")
      continue

    print(f"Creating embeddings using: {model_name} ({name})")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.to(device)
    model.eval()

    # The embedding width is taken from the model output itself rather than
    # from a hand-maintained list that has to stay in sync with the checkpoints.
    pooled_batches = []
    for i in range(0, len(CORPUS), batch_size):
      batch = CORPUS[i:i+batch_size]
      tokens = tokenizer(batch, padding=True, truncation=True, return_tensors='pt').to(device)

      with torch.no_grad():
        last_hidden_state = model(**tokens).last_hidden_state
        # attention_mask is 1 for real tokens and 0 for padding: zero out the
        # padded positions and divide by the number of real tokens per entry.
        mask = tokens['attention_mask'].unsqueeze(-1).to(last_hidden_state.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        avg_embedding = (last_hidden_state * mask).sum(dim=1) / token_counts

      pooled_batches.append(avg_embedding.detach().cpu().numpy())

    if pooled_batches:
      # float64 matches the dtype of the buffer the pickles were written from before.
      embeddings = np.concatenate(pooled_batches).astype(np.float64)
    else:
      # Empty corpus: still write a well-formed (empty) frame.
      embeddings = np.zeros((0, 0))

    result = pd.DataFrame(list(zip(CORPUS, embeddings)), columns=['key', 'embedding'])
    result.to_pickle(PATH + embedding_name)

## OPENAI

In [None]:
def ada(PATH, CORPUS, reference_space_key, key_type_key, analogy=False):
  """Embed every entry of CORPUS with OpenAI's text-embedding-ada-002 model.

  The corpus is sent in batches of 2000 entries. A rate-limit error makes the
  function wait 10 seconds and retry the same batch. The result is a DataFrame
  with the columns 'key' and 'embedding', pickled to PATH + file name. Nothing
  is requested when that file already exists.

  The API key is read from the OPENAI_API_KEY environment variable (or from
  `openai.api_key` if it was set beforehand).

  Args:
    PATH: Output prefix; concatenated directly with the file name.
    CORPUS: List of strings to embed.
    reference_space_key: Part of the file name when `analogy` is False.
    key_type_key: Part of the file name when `analogy` is False.
    analogy: If True the file is named "ada-002_analogy_embeddings.pkl".

  Raises:
    RuntimeError: If no API key is configured.
  """
  name = "ada-002"
  if analogy:
    embedding_name = f"{name}_analogy_embeddings.pkl"
  else:
    embedding_name = f"{name}_{reference_space_key}_{key_type_key}_embeddings.pkl"
  if os.path.exists(PATH + embedding_name):
    print(PATH + embedding_name + " already exists.")
    return

  # SECURITY: the key used to be hard-coded in this cell. A key that has been
  # saved in a notebook must be treated as leaked and revoked; keys belong in
  # the environment, never in the source.
  api_key = os.environ.get("OPENAI_API_KEY") or getattr(openai, "api_key", None)
  if not api_key:
    raise RuntimeError("No OpenAI API key found: set the OPENAI_API_KEY environment variable.")
  openai.api_key = api_key

  batch_size = 2000
  embeddings = []
  for i in range(0, len(CORPUS), batch_size):
    batch = CORPUS[i:i+batch_size]
    while True:
      try:
        data = openai.Embedding.create(input=batch, model="text-embedding-ada-002")["data"]
        break
      except openai.error.RateLimitError:
        print("Sleeping...")
        time.sleep(10)
        print("Let's try again")
    # Each returned item carries the position of its input in the request;
    # order by it so every vector is paired with the right key.
    data = sorted(data, key=lambda row: row["index"])
    embeddings.extend(row["embedding"] for row in data)

  result = pd.DataFrame(list(zip(CORPUS, np.array(embeddings))), columns=['key', 'embedding'])
  result.to_pickle(PATH + embedding_name)
      

## Pythia

In [None]:
def pythia(PATH, CORPUS, reference_space_key, key_type_key, analogy=False):
  """Embed every entry of CORPUS with each Pythia checkpoint in models['Pythia'].

  Every checkpoint is loaded at revision "step143000". The entries are
  tokenized in batches of 128 (the EOS token doubles as padding token), the
  last entry of the model's hidden states is mean-pooled per corpus entry, and
  a DataFrame with the columns 'key' and 'embedding' is pickled to
  PATH + file name. A checkpoint whose pickle already exists is skipped.

  Padding positions are excluded from the mean, so the vector of an entry no
  longer depends on the length of the other entries in its batch. Pickles
  written before this change averaged over padding as well; delete them if
  old and new files have to be comparable.

  Args:
    PATH: Output prefix; concatenated directly with the file name.
    CORPUS: List of strings to embed.
    reference_space_key: Part of the file name when `analogy` is False.
    key_type_key: Part of the file name when `analogy` is False.
    analogy: If True the file is named "<name>_analogy_embeddings.pkl".
  """
  names = ['pythia-70m', 'pythia-160m', 'pythia-410m', 'pythia-1b', 'pythia-2.8b', 'pythia-6.9b']
  batch_size = 128
  # Fall back to the CPU so the cell still runs (slowly) on a machine without a GPU.
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  for model_name, name in zip(models['Pythia'], names):
    if analogy:
      embedding_name = f"{name}_analogy_embeddings.pkl"
    else:
      embedding_name = f"{name}_{reference_space_key}_{key_type_key}_embeddings.pkl"
    if os.path.exists(PATH + embedding_name):
      print(PATH + embedding_name + " already exists.")
      continue
    print("Generating embeddings for: ", name)

    model = GPTNeoXForCausalLM.from_pretrained(
                  model_name,
                  revision="step143000",
                  cache_dir=f"./{name}/step143000",
                  output_hidden_states=True,
                  return_dict=True
                )

    tokenizer = AutoTokenizer.from_pretrained(
                      model_name,
                      revision="step143000",
                      cache_dir=f"./{name}/step143000",
                    )

    # The tokenizer has no padding token of its own; reuse EOS so batches can be padded.
    tokenizer.pad_token = tokenizer.eos_token
    model.to(device)
    model.eval()

    # The embedding width is taken from the model output, so no hand-maintained
    # list of hidden sizes has to stay in sync with the checkpoint list.
    pooled_batches = []
    for i in range(0, len(CORPUS), batch_size):
      batch = CORPUS[i:i+batch_size]
      tokens = tokenizer(batch, padding=True, truncation=True, return_tensors='pt').to(device)

      with torch.no_grad():
        outputs = model(**tokens)
        # Address the hidden states by name: the positional outputs[2] only
        # points at them while the optional fields in front happen to be present.
        last_hidden_state = outputs.hidden_states[-1]
        # attention_mask is 1 for real tokens and 0 for padding: zero out the
        # padded positions and divide by the number of real tokens per entry.
        mask = tokens['attention_mask'].unsqueeze(-1).to(last_hidden_state.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        avg_embedding = (last_hidden_state * mask).sum(dim=1) / token_counts

      pooled_batches.append(avg_embedding.detach().cpu().numpy())

    if pooled_batches:
      # float64 matches the dtype of the buffer the pickles were written from before.
      embeddings = np.concatenate(pooled_batches).astype(np.float64)
    else:
      # Empty corpus: still write a well-formed (empty) frame.
      embeddings = np.zeros((0, 0))

    result = pd.DataFrame(list(zip(CORPUS, embeddings)), columns=['key', 'embedding'])
    result.to_pickle(PATH + embedding_name)

# Run all

In [None]:
def run_all(reference_space_key, key_type_key, analogy=False):
  """Build the corpus and generate embeddings with every enabled model family.

  With `analogy` False the corpus is the key list loaded from
  "<reference_space_key>_<key_type_key>_keys.txt" and results go to the
  "Regular embeddings" folder. With `analogy` True the corpus consists of the
  words of all analogy relations whose four items are single words, and
  results go to the "analogy embeddings" folder. Both folders are resolved
  relative to `local_path`.

  Args:
    reference_space_key: Name of the reference space; ignored for analogies.
    key_type_key: Name of the key subset; ignored for analogies.
    analogy: Selects the analogy corpus instead of a key list.
  """
  if not analogy:
    CORPUS = load_keys(f"{reference_space_key}_{key_type_key}_keys.txt")
    # The f-string placeholder had been mangled into "{}Glocal_patheneration",
    # which is a SyntaxError; this restores the intended "{local_path}Generation".
    PATH  = f"{local_path}Generation of embeddings + experiments/Data/Regular embeddings/"

  else:
    print("Loading analogies")

    # Same repair as above: the placeholder is {local_path}, not the literal text.
    dataset_path = f"{local_path}analogy_contexts_splits/analogy_all_en_contexts.csv.train"

    def read_dataset(path):
      """Return the four analogy items of every second line, starting at line index 1."""
      splitter = ";"
      data_length = 14
      with open(path, 'r') as f:
        lines = f.readlines()
      items = []
      for i in range(1, len(lines), 2):
        data = lines[i].split(splitter)
        assert len(data) == data_length, f"line {i}: expected {data_length} fields, got {len(data)}"
        # Fields 0, 3, 6 and 9 hold the four items of the relation.
        items.append((data[0], data[3], data[6], data[9]))
      return items

    data = read_dataset(dataset_path)

    # Keep only the relations in which none of the four items contains a space.
    one_word_relations = [
        relation for relation in data
        if all(len(item.split(' ')) == 1 for item in relation)
    ]

    # Flatten the relations into one word list (duplicates are kept).
    CORPUS = [word for relation in one_word_relations for word in relation]

    PATH  = f"{local_path}Generation of embeddings + experiments/Data/analogy embeddings/"

  bert(PATH, CORPUS, reference_space_key, key_type_key, analogy=analogy)
  #xlnet(PATH, CORPUS, reference_space_key, key_type_key, analogy=analogy)
  gpt(PATH, CORPUS, reference_space_key, key_type_key, analogy=analogy)
  #t5(PATH, CORPUS, reference_space_key, key_type_key, analogy=analogy)
  opt(PATH, CORPUS, reference_space_key, key_type_key, analogy=analogy)
  ada(PATH, CORPUS, reference_space_key, key_type_key, analogy=analogy)
  pythia(PATH, CORPUS, reference_space_key, key_type_key, analogy=analogy)


In [None]:
# Entry point: generate either the analogy embeddings or the regular embeddings
# for every combination of reference space and key subset.
analogy = False
if not analogy:
  reference_spaces = ["biggraph", "transe", "complex"]
  key_type_keys = ["20K", "places", "names", "20K_1_to_1_synsets", "20K_2_to_3_synsets", "20K_4_to_infinity_synsets"]

  for reference_space in reference_spaces:
    for key_type_key in key_type_keys:
      run_all(reference_space, key_type_key, analogy=analogy)
else:
  # The analogy corpus does not depend on a reference space or key subset.
  run_all(None, None, analogy=analogy)

Loading analogies
Creating embeddings for model: BERT-Tiny


Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Creating embeddings for model: BERT-Mini


Some weights of the model checkpoint at google/bert_uncased_L-4_H-256_A-4 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Creating embeddings for model: BERT-Small


Some weights of the model checkpoint at google/bert_uncased_L-4_H-512_A-8 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Creating embeddings for model: BERT-Medium


Some weights of the model checkpoint at google/bert_uncased_L-8_H-512_A-8 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Creating embeddings for model: BERT-Base


Some weights of the model checkpoint at google/bert_uncased_L-12_H-768_A-12 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


/content/drive/MyDrive/Master Thesis/Generation of embeddings + experiments/Data/analogy embeddings/gpt2_analogy_embeddings.pkl already exists.
/content/drive/MyDrive/Master Thesis/Generation of embeddings + experiments/Data/analogy embeddings/gpt2-medium_analogy_embeddings.pkl already exists.
/content/drive/MyDrive/Master Thesis/Generation of embeddings + experiments/Data/analogy embeddings/gpt2-large_analogy_embeddings.pkl already exists.
/content/drive/MyDrive/Master Thesis/Generation of embeddings + experiments/Data/analogy embeddings/gpt2-xl_analogy_embeddings.pkl already exists.
/content/drive/MyDrive/Master Thesis/Generation of embeddings + experiments/Data/analogy embeddings/opt-125m_analogy_embeddings.pkl already exists.
/content/drive/MyDrive/Master Thesis/Generation of embeddings + experiments/Data/analogy embeddings/opt-350m_analogy_embeddings.pkl already exists.
/content/drive/MyDrive/Master Thesis/Generation of embeddings + experiments/Data/analogy embeddings/opt-1.3b_ana

# PCA

In [None]:
def transform_pca():
  """Reduce every pickle in the "Regular embeddings" folder with PCA.

  The target dimension is chosen from the file name: 512 for "transe" and
  "complex", 200 for "biggraph". The reduced vectors are written to the
  "PCA embeddings" folder as a DataFrame with the columns 'key' and
  'embedding', under the input name with "_embeddings" replaced by
  "_pca_embeddings". Inputs whose output already exists are skipped, as are
  inputs with fewer features or fewer rows than the target dimension.

  Both folders are resolved relative to `local_path`, like the folders the
  embeddings are written to.

  Raises:
    ValueError: If a file name contains none of the known reference spaces.
  """
  data_root = f"{local_path}Generation of embeddings + experiments/Data/"
  directory = data_root + "Regular embeddings/"
  pca_directory = data_root + "PCA embeddings/"
  os.makedirs(pca_directory, exist_ok=True)

  # Sorted so the processing order does not depend on the file system.
  for filename in sorted(os.listdir(directory)):
    if "transe" in filename or "complex" in filename:
      target_d = 512
    elif "biggraph" in filename:
      target_d = 200
    else:
      raise ValueError(f"Unexpected file in {directory}: {filename!r} (cannot infer the target dimension).")
    pca_filename = pca_directory + filename.replace("_embeddings", "_pca_embeddings")
    if os.path.exists(pca_filename):
      print(f"{pca_filename} already exists.")
      continue

    # NOTE: unpickling executes code from the file; only load pickles produced by this notebook.
    frame = pd.read_pickle(os.path.join(directory, filename))
    keys = np.array(frame.iloc[:, 0].tolist())
    embeddings = np.array(list(frame.iloc[:, 1]))

    # PCA needs n_components <= min(n_samples, n_features); checking only the
    # feature count let small key sets crash inside fit_transform.
    if embeddings.ndim != 2 or min(embeddings.shape) < target_d:
      print(f"Skipped: {filename}, shape {embeddings.shape} is too small for {target_d} components")
      continue

    # A fixed seed keeps the result reproducible when scikit-learn picks its randomized solver.
    pca = PCA(n_components=target_d, random_state=0)
    reduced_embeddings = pca.fit_transform(embeddings)

    result = pd.DataFrame(list(zip(keys, reduced_embeddings)), columns=['key', 'embedding'])
    result.to_pickle(pca_filename)
    print(f"Transformed: {filename}, {reduced_embeddings.shape}")

In [None]:
# Write the PCA-reduced version of every regular embedding file; files whose
# output already exists are skipped.
transform_pca()

pythia-70m_biggraph_20K_embeddings.pkl
/content/drive/MyDrive/Master Thesis/Generation of embeddings + experiments/Data/PCA embeddings/pythia-70m_biggraph_20K_pca_embeddings.pkl already exists.
pythia-160m_biggraph_20K_embeddings.pkl
/content/drive/MyDrive/Master Thesis/Generation of embeddings + experiments/Data/PCA embeddings/pythia-160m_biggraph_20K_pca_embeddings.pkl already exists.
pythia-410m_biggraph_20K_embeddings.pkl
/content/drive/MyDrive/Master Thesis/Generation of embeddings + experiments/Data/PCA embeddings/pythia-410m_biggraph_20K_pca_embeddings.pkl already exists.
pythia-1b_biggraph_20K_embeddings.pkl
/content/drive/MyDrive/Master Thesis/Generation of embeddings + experiments/Data/PCA embeddings/pythia-1b_biggraph_20K_pca_embeddings.pkl already exists.
pythia-2.8b_biggraph_20K_embeddings.pkl
/content/drive/MyDrive/Master Thesis/Generation of embeddings + experiments/Data/PCA embeddings/pythia-2.8b_biggraph_20K_pca_embeddings.pkl already exists.
pythia-6.9b_biggraph_20K_e