In [1]:
# This is Oliver's second attempt at sort of baseline model to predict colexification in CLICS.
# Here, I was trying to predict p(second word is b | first word is a) in a "randomly selected" pair of two coelexified senses. 
# More specifically, I'm finding the "target" p(b | a) by assuming this is proportional to the colexification frequency of a and b for senses a and b
# Here I'm trying to predict the probability distributions p(? | a) from a sense a, where the target is the distribution described the line above. I'm doing this with a neural network.
# I use KL Divergence to measure the loss between the predicted and "actual" dsitributions
# (Doesn't matter if using this outside of colab) Click "show code" to show code and double click on the white bar to the left of code to hide it

In [None]:
#@title
from google.colab import drive
! [ -e /content ] && pip install -Uqq fastbook
! pip install torch-lr-finder
! pip install -U sentence-transformers
import fastbook
fastbook.setup_book()
from fastai.tabular.all import *

In [3]:
#@title
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from fastbook import *
import gensim.downloader as gs
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
import random as r
from torch_lr_finder import LRFinder
from scipy import stats

In [4]:
#@title
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('whaleloops/phrase-bert')

Downloading:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.41k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/670 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/632 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
wv = gs.load('word2vec-google-news-300') # These are the word2vec embeddings we are using

In [5]:
#@title
NUM_LANGUAGE = 3156 # This is the number of languages used in the CLICS database
df = pd.read_csv("gdrive/MyDrive/clics-colexification-data.csv") # Change path as needed
df.head()

Unnamed: 0,colex.freq,Concepticon_Gloss.xo,Concepticon_Gloss.yo,vision,assoc,affec,tax,fully_covered
0,340,TREE,WOOD,1,1,1,1,1
1,326,LEG,FOOT,1,1,1,1,1
2,296,MOON,MONTH,0,1,1,1,0
3,291,GO,WALK,0,1,1,1,0
4,284,HAND,ARM,1,1,1,1,1


In [6]:
#@title
dumb_british_spellings = {"armour":"armor", "grey":"gray", "mould":"mold", "neighbour":"neighbor", "axe":"ax", "moustache":"mustache", "plough":"plow", "mandarine":"mandarin"}
obscure_words = {"shoulderblade":"shoulder blade", "spearthrower":"spear thrower", "ridgepole":"ridge pole", "pimpleface":"pimple face", "tumpline":"backpack", "cushma":"clothing", "curassow":"tropical bird", "banisterium":"plant", "paca":"rodent", "netbag":"net bag", "muntjacs":"barking deer"}

# This converts a sense to its word2vec embeddings. More specifically, for each sense I remove punctuation and then add the vectors of each individual word. 
# I also replace some obscure words and british spellings not recognized by word2vec to similar phrases that word2vec can recognize
def stringToVec(s):  
  s = s.lower()
  for i in "(),\t\n":
    s = s.replace(i, "")
  for i in "-":
    s = s.replace(i, " ")
  for i in dumb_british_spellings:
    s = s.replace(i, dumb_british_spellings[i])
  for i in obscure_words:
    s = s.replace(i, obscure_words[i])
  
  words = s.split(" ")
  vec_defined = False
  num_words = 0
  for i in words:
    try:      # try except gives an error if we can't convert the 
      if vec_defined:
        vec += np.array(wv[i])
      else:
        vec = np.array(wv[i])
        vec_defined = True
      num_words += 1
    except:
      continue
  if not vec_defined:
    raise Exception("Word cannot be converted to vector")
  return np.array(vec)/num_words #This will return the string "error" if there is not 

# Converts a number or array to a tensor of floats
def toTensor(arr):
  return torch.tensor(arr, dtype = torch.float32) #Function to easily convert arrays to tensors

In [7]:
#@title
embed_length = len(model.encode("yeet")) # Length of embeddings we are using.

vec_dic = {} # This will be a dictionary that easily allows us to access the embedding for all of our senses, saving time. 
error_senses = set()  # This represents the set of senses for which there was a problem converting them to embeddings. 

for i in range(len(df)): # Here we loop through each row of our dataframe, and if we can convert a sense s to an embedding then we set vec_dic[s] = embedding
  row = df.iloc[i]
  x = row["Concepticon_Gloss.xo"]
  y = row["Concepticon_Gloss.yo"]

  try:   
    if x not in vec_dic:
      xvec = np.array(model.encode(x))
      vec_dic[x] = xvec
  except:
    error_senses.add(x)

  try:  
    if y not in vec_dic:
      yvec = np.array(model.encode(y))
      vec_dic[y] = yvec
  except: 
    error_senses.add(y)

error_senses = list(error_senses) # List of all senses that could not be converted to embeddings. Should be empty right now with phrase BERT
senses = list(vec_dic.keys()) # List of all concepts

sense_indices = {senses[i]:i for i in range(len(senses))} # sense_indices is a dictionary where its keys are senses and its values are the indices for which the senses appear in our list of senses.

In [8]:
#@title
# In this cell we set up the dependent variables - probability distributions p(?|s) for each sense s, where p(a|b) is (roughly) proportional to how frequently they are colexified in our data set.  

# Computes cosine between two vectors
def getSimilarity(vec, vec2): 
  x = vec
  y = vec2
  return np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))

# I added the option to add a small epsilon term, so the probabilities p(b|a) are not actually proportional to the colexification frequency
# of a and b, but are proportional to colex.freq + epsilon. The reason for this is if a and b don't appear as colexified in our 
# data set, there is probably a very small but not actually zero probability they colexify
# Thus, this epsilon term would "smooth" our probability distributions
epsilon = 0.00000

amts = {} # Measures total frequency each sense colexifies, i.E., amts[s] is sum of colex.freq(s, i) over all senses i
senseToFreq = {} # Dictionary that returns the frequency of each sense
senseToProbVec = {} # This is a dictionary with keys of senses a and values of lists representing the probability distributions p(?|a). More specifically, if sense_indices[s] = i, then the ith element of senseToProbVec[a] is p(s|a)

for s in senses: # Initializes each dictionary with an empty array or 0 for each sense
  senseToFreq[s] = []
  senseToProbVec[s] = []
  amts[s] = 0
  for s2 in senses:
    senseToFreq[s].append(0)
    senseToProbVec[s].append(epsilon)

for i in range(len(df)): # Here we initialize the vectors senseToProvVec[a] so that their ith element is colex.freq of a and the ith sense + epsilon. 
  row = df.iloc[i]
  x = row["Concepticon_Gloss.xo"]
  y = row["Concepticon_Gloss.yo"]
  amt = int(row["colex.freq"])
  if x in vec_dic and y in vec_dic:
    senseToProbVec[x][sense_indices[y]] += amt
    senseToProbVec[y][sense_indices[x]] += amt 
    senseToFreq[x][sense_indices[y]] += amt
    senseToFreq[y][sense_indices[x]] += amt 
    amts[x] += amt
    amts[y] += amt
for s in senses:
  senseToProbVec[s] = np.array(senseToProbVec[s])
  senseToProbVec[s] = toTensor(senseToProbVec[s]/sum(senseToProbVec[s])) # Here we normalize the vectors to have sums 1

print("Amount each sense appears: ", amts[0:5])
print("Probability vector for TREE: ", senseToProbVec["TREE"])

Amount each sense appears:  {'TREE': 1022, 'WOOD': 961, 'LEG': 822, 'FOOT': 622, 'MOON': 568, 'MONTH': 438, 'GO': 919, 'WALK': 971, 'HAND': 656, 'ARM': 659, 'WOMAN': 724, 'WIFE': 697, 'OLD': 658, 'OLD (AGED)': 450, 'SON-IN-LAW (OF WOMAN)': 414, 'SON-IN-LAW (OF MAN)': 410, 'SPEAK': 1185, 'TALK': 363, 'LEATHER': 640, 'SKIN': 1009, 'KNIFE': 452, 'KNIFE (FOR EATING)': 334, 'MEAT': 722, 'FLESH': 503, 'DAUGHTER-IN-LAW (OF WOMAN)': 378, 'DAUGHTER-IN-LAW (OF MAN)': 405, 'CLAW': 541, 'FINGERNAIL': 564, 'SAY': 1060, 'LAND': 761, 'COUNTRY': 594, 'BARK': 533, 'CARRY ON BACK': 293, 'CARRY': 1144, 'BLUE': 372, 'GREEN': 492, 'PERSON': 560, 'MAN': 836, 'HOW MUCH': 266, 'HOW MANY PIECES': 258, 'FATHER-IN-LAW (OF WOMAN)': 352, 'FATHER-IN-LAW (OF MAN)': 357, 'MOTHER-IN-LAW (OF WOMAN)': 333, 'MOTHER-IN-LAW (OF MAN)': 347, 'EARTH (SOIL)': 1049, 'RIVER': 580, 'WATER': 801, 'HIS OR HER': 390, 'HIS (GENITIVE OF HE)': 361, 'LIE DOWN': 619, 'SLEEP': 722, 'HUSBAND': 661, 'BOAT': 435, 'CANOE': 403, 'DAUGHTER': 44

In [9]:
#@title
# In this cell we set up slightly different probability distributions. The only difference is that we initialize the frequencies of each pair of senses (before they are normalized) to be 
# small and proportional to similarity. This produces better rankings of likelihood of senses for senses with low predicted probability.

senseToProbVecSim = {} # This is a dictionary for the slightly different target distributions
for s in senses:
  senseToProbVecSim[s] = []
  for s2 in senses:
    vec1 = np.array(vec_dic[s])
    vec2 = np.array(vec_dic[s2])
    sim = getSimilarity(vec1, vec2)
    senseToProbVecSim[s].append(sim)
  senseToProbVecSim[s] = np.array(senseToProbVecSim[s])
  senseToProbVecSim[s] = senseToProbVecSim[s]/(sum(senseToProbVecSim[s]))

for i in range(len(df)): 
  row = df.iloc[i]
  x = row["Concepticon_Gloss.xo"]
  y = row["Concepticon_Gloss.yo"]
  amt = int(row["colex.freq"])
  if x in vec_dic and y in vec_dic:
    senseToProbVecSim[x][sense_indices[y]] += amt
    senseToProbVecSim[y][sense_indices[x]] += amt 
for i in senses:
  senseToProbVecSim[i] = np.array(senseToProbVecSim[i])
  senseToProbVecSim[i] = toTensor(senseToProbVecSim[i]/sum(senseToProbVecSim[i])) 

In [10]:
# This is our neural network. 
# This model recieves an input of a vector representing a word or phrase s in word2vec.
# It outputs a vector predicting the probability distributions p(?|s) 
# We output the logs of the probabilities instead of the probabilities since these are used in our KL Divergence loss function.

colex_model_2 = nn.Sequential(
    nn.Linear(embed_length, 250),
    nn.Sigmoid(),
    nn.Linear(250, len(senses)), 
    #nn.Dropout(p=0.1), 
    nn.LogSoftmax()
)
colex_model_2.load_state_dict(torch.load("gdrive/MyDrive/models/ColexificationKLModel.pt"))

<All keys matched successfully>

In [None]:
# In this cell we set up batches of training and test data
needed_amt = 40
input_output_pairs = list([s for s in senses if amts[s] > needed_amt])
train, test = train_test_split(input_output_pairs, test_size = 0.1, random_state = 0)

train_prob_dic = senseToProbVecSim #This is how we're getting the probability distributions for the training data. I split them up in case changing the target for test data leads to better performance on the target for training data.
test_prob_dic = senseToProbVec #This is how we're getting the probability distributions for the test data.

X_train = torch.stack([toTensor(vec_dic[s]) for s in train]) # Tensor of training inputs
y_train = torch.stack([train_prob_dic[s] for s in train]) # Tensor of training outputs
X_test = torch.stack([toTensor(vec_dic[s]) for s in test]) # Tensor of test inputs
y_test = torch.stack([test_prob_dic[s] for s in test]) # Tensor of test outputs

trainDL = DataLoader(TensorDataset(X_train,y_train), batch_size = 512) # Train and test dataloaders
testDL = DataLoader(TensorDataset(X_test,y_test), batch_size = 512)
dls = DataLoaders(trainDL, testDL)


# Learner is a class in FastAI that does all our model training steps for us
# To compare the difference between the probability distributions p(?|a) given by our csv file and the one our model outputs, we use Kullback Leibler divergence
# This measures the "distance" between two probability distributions

learner = Learner(dls, colex_model_2, loss_func = nn.KLDivLoss(reduction = "batchmean"))

In [None]:
# lr_find().valley finds the optimal learning rate
# fit_one_cycle fits according to the one-cycle policy, which uses things such as cosine annealing and momentum

alpha = learner.lr_find().valley
#alpha = 0.0001
learner.fit_one_cycle(1, alpha)
torch.save(colex_model_2.state_dict(), "gdrive/MyDrive/ColexificationKLModel2.pt") 

In [None]:
# saves the model
colex_model_2.eval() # Switches to evaluation mode

In [None]:
# Model 1 is a regression model that predicts colex.freq given sense embeddings. I trained this in a separate notebook.

colex_model_1 = nn.Sequential(
    nn.Linear(2*embed_length, 250),
    nn.Sigmoid(),
    nn.Linear(250, 1)
)
colex_model_1.load_state_dict(torch.load("gdrive/MyDrive/models/ColexificationRegModel.pt")) # Loads already trained model
colex_model_1.eval() # Switches to evaluaiton mode

In [None]:
#@title

reg_preds = {} # Predicted output of regression model for test senses. This returns an array where the ith element of reg_preds[s] corresponds to the output of senses s and the ith sense  
kl_preds = {}  # Predicted output of kl divergence model for test senses
w2v_preds = {} # word2vec similarities with every other sense for test senses
bert_preds = {} # phrase BERT similarities with every other sense for test senses

# Helper function that finds the colexification frequency of (s, a) for each other sense a predicted by our regression model
def getFreqPred(s):
  if s in reg_preds:
    return reg_preds[s]
  dist = []
  for s2 in senses:
    dist.append(colex_model_1(torch.concat((toTensor(vec_dic[s]), toTensor(vec_dic[s2])))).data)
  dist = np.array(dist)
  reg_preds[s] = dist.copy()
  return dist

# Gives a ranking of senses that are predicted to colexify with a given sense the most based on regression model
def getRegRanking(sense):    
  dist = getFreqPred(sense)
  probs_senses = [(math.exp(float(dist[i].data)), senses[i]) for i in range(len(senses))]
  probs_senses.sort(reverse = True)
  return [i[1] for i in probs_senses]

# Gives a ranking of the senses that have highest probability of colexifying with a sense in our conditional model
def getKLRanking(sense):    
  dist = colex_model_2(toTensor(vec_dic[sense]))
  probs_senses = [(math.exp(float(dist[i].data)), senses[i]) for i in range(len(senses))]
  probs_senses.sort(reverse = True)
  return [i[1] for i in probs_senses]

# Gives a ranking of the most similar senses to a given sense in word2vec
def getW2VSimilarityRanking(sense):
  dic = {}
  x = np.array(stringToVec(sense))
  for s in senses:
    try:
      y = np.array(stringToVec(s))
      dic[s] = np.dot(x,y)/(np.linalg.norm(x)*np.linalg.norm(y))
    except:
      dic[s] = 0
  arr = [(dic[s], s) for s in senses]
  arr.sort(reverse = True)
  return [i[1] for i in arr]

# Gives a ranking of the most similar senses to a given sense in Phrase-BERT
def getBERTSimilarityRanking(sense):
  dic = {}
  x = np.array(vec_dic[sense])
  for s in senses:
    try:
      y = np.array(vec_dic[s])
      dic[s] = np.dot(x,y)/(np.linalg.norm(x)*np.linalg.norm(y))
    except:
      dic[s] = 0
  arr = [(dic[s], s) for s in senses]
  arr.sort(reverse = True)
  return [i[1] for i in arr]

# Gives a ranking of senses that actually colexify with a given sense the most
def getActualRanking(sense):
  vec = senseToProbVec[sense]
  arr = [(vec[i], senses[i]) for i in range(len(senses))]
  arr.sort(reverse = True)
  return [i[1] for i in arr]

# Computes the average difference between the indices for which the top n senses appear in an actual ranking of senses and where they appear in a predicted ranking of senses
# Which measures a sort of loss of the predicted ranking
def getRankDiff(actual, pred, n):
  dic_actual = {actual[i]:i for i in range(len(actual))}
  dic_pred = {pred[i]:i for i in range(len(actual))}
  avg_diff = 0
  for i in range(n):
    avg_diff += abs(dic_actual[actual[i]]-dic_pred[actual[i]])
  return avg_diff/n

# Outputs a probability distribution p(?|sense) proportional to predicted colexification frequency in first model
def getRegDistribution(sense):
  freqs = np.array([float(i) for i in list(getFreqPred(sense))])
  return toTensor(np.log(freqs/sum(freqs)))

def getKLDistribution(sense): # Outputs a predicted probability distribution given by second model
  if sense in kl_preds:
    return kl_preds[sense]
  pred = colex_model_2(toTensor(vec_dic[sense])).detach().numpy()
  kl_preds[sense] = pred
  return toTensor(pred)

# Returns a probability distribution p(?|sense) for a given sense that is proportional to softmax of ? and sense's similarity in word2vec
def getW2VDistributionSoftmax(sense):
  sims = []
  x = np.array(stringToVec(sense))
  for s in senses:
    try:
      y = np.array(stringToVec(s))
      sim = np.dot(x,y)/(np.linalg.norm(x)*np.linalg.norm(y))
      sims.append(sim)
    except:
      sims.append(0)
  probs = nn.LogSoftmax()(toTensor(sims))
  return toTensor(probs)

# Returns a probability distribution p(?|sense) for a given sense that is proportional to ? and sense's similarity in word2vec
def getW2VDistribution(sense):
  if sense in w2v_preds:
    sims = w2v_preds[sense]
    return toTensor(np.log(sims/sum(sims)))
  sims = []
  x = np.array(stringToVec(sense))
  for s in senses:
    try:
      y = np.array(stringToVec(s))
      sim = np.dot(x,y)/(np.linalg.norm(x)*np.linalg.norm(y))
      sims.append(sim)
    except:
      sims.append(0)
  sims = [max(i, 0.00000001) for i in sims]
  sims = np.array(sims)
  w2v_preds[sense] = sims
  probs = np.log(sims/sum(sims))
  return toTensor(probs)

# Returns a probability distribution p(?|sense) for a given sense that is proportional to softmax of ? and sense's similarity in Phrase-BERT
def getBERTDistributionSoftmax(sense):
  sims = []
  x = np.array(vec_dic[sense])
  for s in senses:
    try:
      y = np.array(vec_dic[s])
      sim = np.dot(x,y)/(np.linalg.norm(x)*np.linalg.norm(y))
      sims.append(sim)
    except:
      sims.append(0)
  func = nn.LogSoftmax()
  probs = func(toTensor(sims))
  return toTensor(probs)

# Returns a probability distribution p(?|sense) for a given sense that is proportional to ? and sense's similarity in Phrase-BERT
def getBERTDistribution(sense):
  if sense in bert_preds:
    sims = bert_preds[sense]
    return toTensor(np.log(sims/sum(sims)))
  sims = []
  x = np.array(vec_dic[sense])
  for s in senses:
    try:
      y = np.array(vec_dic[s])
      sim = np.dot(x,y)/(np.linalg.norm(x)*np.linalg.norm(y))
      sims.append(sim)
    except:
      sims.append(0)
  sims = [max(i, 0.000001) for i in sims]
  sims = np.array(sims)
  bert_preds[sense] = sims
  probs = np.log(sims/sum(sims))
  return toTensor(probs)

# Returns a random probability distribution of senses where probabilities are chosen uniformly at random
def getRandomProbs(sense="mitochondria is"):
  vec = np.random.rand(len(senses))
  return toTensor(np.log(vec/sum(vec)))

# Returns a random probability distribution of senses where probabilities are chosen based on the softmax of a uniform random distribution
def getRandomProbsSoftmax(sense="powerhouse of the cell"):
  vec = toTensor((np.random.rand(len(senses))-0.5)*10)
  vec = nn.Softmax()(vec).data
  return toTensor(np.log(vec))

In [None]:
#@title
# In this cell, we measure model performance via how well they rank how likely different senses are to colexify with senses in the test set
# More specifically, we measure the average difference between actual and predicted rankings for each sense in the test set using getRankDiff()

top_n = 5 # top_n is the number n that we look at when computing how similar rankings are
total_diffs = np.array([0, 0, 0, 0.0])
rank_funcs = [getActualRanking, getRegRanking, getKLRanking, getW2VSimilarityRanking, getBERTSimilarityRanking]
for s in test:
  ranks = [rank_funcs[i](s) for i in range(len(rank_funcs))]
  diffs = np.array([getRankDiff(ranks[0], ranks[i], top_n) for i in range(1, len(ranks))])
  total_diffs += diffs
total_diffs = total_diffs/len(test)

print("Average difference of ranking of Regression model predictions: ", total_diffs[0])
print("Average difference of ranking of KL model predictions: ", total_diffs[1])
print("Average difference of ranking of W2V similarities: ", total_diffs[2])
print("Average difference of ranking of BERT similarities: ", total_diffs[3])


In [None]:
#@title
# In this cell, we measure model performance via how close their probability distributions are to the actual distribution
# More specifically, we masure the average KL Divergence between the predicted and actual distributions

total_loss = np.array([0, 0, 0, 0, 0, 0, 0, 0.0])
distrib_funcs = [getRegDistribution, getKLDistribution, getBERTDistribution, getBERTDistributionSoftmax, getBERTDistribution, getBERTDistributionSoftmax, getRandomProbs, getRandomProbsSoftmax]
for s in test:
  actual = test_prob_dic[s]
  for i in range(len(total_loss)):
    total_loss[i] += np.array(nn.KLDivLoss()(toTensor(distrib_funcs[i](s)), actual).data)
total_loss = np.array(total_loss)
total_loss = total_loss/len(test)

print("Average loss for Reg model: ", total_loss[0])
print("Average loss for KL model: ", total_loss[1])
print("Average loss for distributions proportional to W2V similarity: ", total_loss[2])
print("Average loss for distributions proportional to softmax of W2V similarity: ", total_loss[3])
print("Average loss for distributions proportional to BERT similarity: ", total_loss[4])
print("Average loss for distributions proportional to softmax of BERT similarity: ", total_loss[5])
print("Average loss for distributions proportional to random vector: ", total_loss[6])
print("Average loss for distributions proportional to softmax of random vector: ", total_loss[7])

In [None]:
#@title
# Here we create matrices keeping track of word2vec similarity, phrase-BERT similarity, colexification frequency, and predicted probabilities p(a | b) by our model
# Each row i and column j keeps track of the similarity, colexification frequency, ect., of senses s_i and s_j
# We then compute the correlations between colexification frequency and each other matrix

xo_arr = []
yo_arr = []
predicted_freq_arr = []
predicted_prob_arr = []
W2V_similarity_arr = []
BERT_similarity_arr = []
freq_arr = []
random_arr = []
for i in range(len(test)):
  for s in senses:
    xo_arr.append(test[i])
    yo_arr.append(s)
    predicted_freq_arr.append(float(reg_preds[test[i]][sense_indices[s]]))
    predicted_prob_arr.append(float(np.exp(kl_preds[test[i]][sense_indices[s]])))
    W2V_similarity_arr.append(float(bert_preds[test[i]][sense_indices[s]]))
    BERT_similarity_arr.append(float(bert_preds[test[i]][sense_indices[s]]))
    freq_arr.append(float(amts[test[i]]*senseToProbVec[test[i]][sense_indices[s]]))
    random_arr.append(r.random())

predictionReg_scorrelation, _ = stats.spearmanr(freq_arr,predicted_freq_arr)
predictionKL_scorrelation, _ = stats.spearmanr(freq_arr,predicted_prob_arr)
W2V_similarity_scorrelation, _ = stats.spearmanr(freq_arr, W2V_similarity_arr)
BERT_similarity_scorrelation, _ = stats.spearmanr(freq_arr,BERT_similarity_arr)
random_scorrelation, _ = stats.spearmanr(freq_arr,random_arr)

print("Spearman correlation of predicted frequencies and colexification frequency (zeros unremoved): ", predictionReg_scorrelation)
print("Spearman correlation of predicted conditional probabilities and colexification frequency (zeros unremoved): ", predictionKL_scorrelation)
print("Spearman correlation of Word2Vec similarity and colexification frequency (zeros unremoved): ", W2V_similarity_scorrelation)
print("Spearman correlation of Phrase-BERT similarity and colexification frequency (zeros unremoved): ", BERT_similarity_scorrelation)
print("Spearman correlation of random variable and colexification frequency (zeros unremoved): ", random_scorrelation)
print("")

xo_arr_removed = []
yo_arr_removed = []
W2V_similarity_arr_removed = []
BERT_similarity_arr_removed = []
freq_arr_removed = []
predicted_freq_arr_removed = []
predicted_prob_arr_removed = []
random_arr_removed = []
for i in range(len(freq_arr)):
  if float(freq_arr[i]) > 0:
    W2V_similarity_arr_removed.append(W2V_similarity_arr[i])
    BERT_similarity_arr_removed.append(BERT_similarity_arr[i])
    freq_arr_removed.append(freq_arr[i])
    random_arr_removed.append(random_arr[i])
    predicted_prob_arr_removed.append(predicted_prob_arr[i])
    predicted_freq_arr_removed.append(predicted_freq_arr[i])
    xo_arr_removed.append(xo_arr[i])
    yo_arr_removed.append(yo_arr[i])

predictionReg_removed_scorrelation, _ = stats.spearmanr(freq_arr_removed,predicted_freq_arr_removed)
predictionKL_removed_scorrelation, _ = stats.spearmanr(freq_arr_removed,predicted_prob_arr_removed)
W2V_similarity_removed_scorrelation, _ = stats.spearmanr(freq_arr_removed, W2V_similarity_arr_removed)
BERT_similarity_removed_scorrelation, _ = stats.spearmanr(freq_arr_removed,BERT_similarity_arr_removed)
random_removed_scorrelation, _ = stats.spearmanr(freq_arr_removed,random_arr_removed)

print("Spearman correlation of predicted frequencies and colexification frequency (zeros removed): ", predictionReg_removed_scorrelation)
print("Spearman correlation of predicted conditional probabilities and colexification frequency (zeros removed): ", predictionKL_removed_scorrelation)
print("Spearman correlation of Word2Vec similarity and colexification frequency (zeros removed): ", W2V_similarity_removed_scorrelation)
print("Spearman correlation of Phrase-BERT similarity and colexification frequency (zeros removed): ", BERT_similarity_removed_scorrelation)
print("Spearman correlation of random variable and colexification frequency (zeros removed): ", random_removed_scorrelation)

xo_arr_removed = []
yo_arr_removed = []
W2V_similarity_arr_removed = []
BERT_similarity_arr_removed = []
freq_arr_removed = []
predicted_freq_arr_removed = []
predicted_prob_arr_removed = []
random_arr_removed = []
for i in range(len(freq_arr)):
  if float(freq_arr[i]) > 1:
    W2V_similarity_arr_removed.append(W2V_similarity_arr[i])
    BERT_similarity_arr_removed.append(BERT_similarity_arr[i])
    freq_arr_removed.append(freq_arr[i])
    random_arr_removed.append(random_arr[i])
    predicted_prob_arr_removed.append(predicted_prob_arr[i])
    predicted_freq_arr_removed.append(predicted_freq_arr[i])
    xo_arr_removed.append(xo_arr[i])
    yo_arr_removed.append(yo_arr[i])

Spearman correlation of predicted frequencies and colexification frequency (zeros unremoved):  0.02007701531708646
Spearman correlation of predicted conditional probabilities and colexification frequency (zeros unremoved):  0.1992679073279547
Spearman correlation of Word2Vec similarity and colexification frequency (zeros unremoved):  0.09805313632176639
Spearman correlation of Phrase-BERT similarity and colexification frequency (zeros unremoved):  0.09805313632176639
Spearman correlation of random variable and colexification frequency (zeros unremoved):  0.001054165604900811

Spearman correlation of predicted frequencies and colexification frequency (zeros removed):  -0.010658352963561166
Spearman correlation of predicted conditional probabilities and colexification frequency (zeros removed):  0.2798600747628028
Spearman correlation of Word2Vec similarity and colexification frequency (zeros removed):  0.23941054022060088
Spearman correlation of Phrase-BERT similarity and colexification

In [None]:
#@title
# This is a dataframe we can use to train a linear regression model. Looking at the coefficients of this model will be a metric for how useful each feature is. Everything is normalized to not affect these coefficients.

dic = {"Concepticon_gloss.xo":xo_arr_removed, "Concepticon_gloss.yo":yo_arr_removed,"w2v_similarity":np.array(W2V_similarity_arr_removed)/sum(W2V_similarity_arr_removed), "phrase-BERT_similarity":np.array(BERT_similarity_arr_removed)/sum(BERT_similarity_arr_removed), "predicted_conditional_prob":np.array(predicted_prob_arr_removed)/sum(predicted_prob_arr_removed),"predicted_freq":np.array(predicted_freq_arr_removed)/sum(predicted_freq_arr_removed), "colex.freq":freq_arr_removed}
df_linreg = pd.DataFrame.from_dict(dic)
df_linreg.head()

Unnamed: 0,Concepticon_gloss.xo,Concepticon_gloss.yo,w2v_similarity,phrase-BERT_similarity,predicted_conditional_prob,predicted_freq,colex.freq
0,PESTLE,CUT DOWN,0.000294,0.000294,2.9e-05,0.000279,2.0
1,PESTLE,LONG,0.000195,0.000195,2e-05,0.000282,2.0
2,PESTLE,BURNING,0.000302,0.000302,7.5e-05,0.00028,2.0
3,PESTLE,PITCHFORK,0.000357,0.000357,0.000146,0.000282,2.0
4,PESTLE,PAIR,0.000196,0.000196,8e-05,0.00028,2.0


In [None]:
#@title
# This is a dataframe we can use to train logistic regression models. We add cases where colexification doesn't occur in a 50-50 ration to those where it does occur, as otherwise there would be too many "FALSE" cases. We say colexification occurs iff it happens in >1 language. 

pairs2 = set()
for i in range(len(xo_arr_removed)):
  pairs2.add(xo_arr_removed[i]+yo_arr_removed[i])
  pairs2.add(yo_arr_removed[i]+xo_arr_removed[i])
dic = {"Concepticon_gloss.xo":xo_arr_removed.copy(), "Concepticon_gloss.yo":yo_arr_removed.copy(),"w2v_similarity":W2V_similarity_arr_removed.copy(), "phrase-BERT_similarity":BERT_similarity_arr_removed.copy(), "predicted_conditional_prob":predicted_prob_arr_removed.copy(), "predicted_freq":predicted_freq_arr_removed.copy(), "colex":[i >1 for i in freq_arr_removed.copy()]}
for i in range(len(dic["colex"])):
  x = test[r.randint(0, len(test)-1)]
  y = senses[r.randint(0, len(senses)-1)]
  while x+y in pairs2:
    x = test[r.randint(0, len(test)-1)]
    y = senses[r.randint(0, len(senses)-1)]
  pairs2.add(x+y)
  dic["Concepticon_gloss.xo"].append(x)
  dic["Concepticon_gloss.yo"].append(y)
  dic["w2v_similarity"].append(getSimilarity(vec_dic[x], vec_dic[y]))#.append(getSimilarity(stringToVec(x), stringToVec(y)))
  dic["phrase-BERT_similarity"].append(getSimilarity(vec_dic[x], vec_dic[y]))
  dic["predicted_conditional_prob"].append(float(np.exp(kl_preds[x][sense_indices[y]])))
  dic["predicted_freq"].append(float(reg_preds[x][sense_indices[y]]))
  dic["colex"].append(False)
df_logreg = pd.DataFrame.from_dict(dic)
df_logreg.head()

Unnamed: 0,Concepticon_gloss.xo,Concepticon_gloss.yo,w2v_similarity,phrase-BERT_similarity,predicted_conditional_prob,predicted_freq,colex
0,PESTLE,CUT DOWN,0.616072,0.616072,0.000475,2.171032,True
1,PESTLE,LONG,0.407906,0.407906,0.000329,2.188267,True
2,PESTLE,BURNING,0.633319,0.633319,0.001233,2.178457,True
3,PESTLE,PITCHFORK,0.747537,0.747537,0.0024,2.189185,True
4,PESTLE,PAIR,0.411604,0.411604,0.001317,2.178319,True


In [None]:
#@title
# Trains linear regression and outputs coefficients for each variable

df_linreg_cont = df_linreg.drop(["Concepticon_gloss.xo", "Concepticon_gloss.yo"], axis = 1)
X_linreg = df_linreg_cont.drop("colex.freq", axis = 1)
y_linreg = df_linreg_cont["colex.freq"]
ridge = Ridge(alpha = 1.12, random_state = 0)
ridge.fit(X_linreg, y_linreg)
print(ridge.coef_)

[8.98889202e-01 8.98889202e-01 1.40463815e+01 2.83258041e-03]


In [None]:
#@title
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Trains logistic regression model to predict whether colexification will occur on each feature and a combined model to see if using multiple pieces of data is very helpful

df_logreg_cont = df_logreg.drop(["Concepticon_gloss.xo", "Concepticon_gloss.yo"], axis = 1)
logtrain, logtest = train_test_split(df_logreg_cont, test_size=0.2)
X_logtrain = logtrain.drop("colex", axis = 1)
y_logtrain = logtrain["colex"]
X_logtest = logtest.drop("colex", axis = 1)
y_logtest = logtest["colex"]

logreg_w2v = LogisticRegression(random_state=0)
logreg_BERT = LogisticRegression(random_state=0)
logreg_kl = LogisticRegression(random_state=0)
logreg_reg = LogisticRegression(random_state=0)
logreg_comb = LogisticRegression(random_state=0)
logreg_w2v.fit(X_logtrain["w2v_similarity"].to_frame(), y_logtrain)
logreg_BERT.fit(X_logtrain["phrase-BERT_similarity"].to_frame(), y_logtrain)
logreg_reg.fit(X_logtrain["predicted_freq"].to_frame(), y_logtrain)
logreg_kl.fit(X_logtrain["predicted_conditional_prob"].to_frame(), y_logtrain)
logreg_comb.fit(X_logtrain, y_logtrain)

# Prints accuracies of these models
w2v_acc = metrics.accuracy_score(y_logtest, logreg_w2v.predict(X_logtest["w2v_similarity"].to_frame()))
BERT_acc = metrics.accuracy_score(y_logtest, logreg_BERT.predict(X_logtest["phrase-BERT_similarity"].to_frame()))
reg_acc = metrics.accuracy_score(y_logtest, logreg_reg.predict(X_logtest["predicted_freq"].to_frame()))
kl_acc = metrics.accuracy_score(y_logtest, logreg_kl.predict(X_logtest["predicted_conditional_prob"].to_frame()))
comb_acc = metrics.accuracy_score(y_logtest, logreg_comb.predict(X_logtest))

print("Accuracy of w2v logistic regression: ", w2v_acc)
print("Accuracy of phrase-BERT logistic regression: ", BERT_acc)
print("Accuracy of colexification frequency prediction model logistic regression: ", reg_acc)
print("Accuracy of conditional probability prediction model logistic regression: ", kl_acc)
print("Accuracy of combined logistic regression: ", comb_acc)

Accuracy of w2v logistic regression:  0.6781206171107994
Accuracy of phrase-BERT logistic regression:  0.6781206171107994
Accuracy of colexification frequency prediction model logistic regression:  0.5518934081346424
Accuracy of conditional probability prediction model logistic regression:  0.6430575035063114
Accuracy of combined logistic regression:  0.6823281907433381


In [None]:
#@title
# Ignore these; they were cells I was using when trying to "smooth out" probability distributions using kernel density estimation. This took forever and didn't work out that well.

#vecs = []
#for i in range(len(df)): 
#  print(i)
#  row = df.iloc[i]
#  x = row["Concepticon_Gloss.xo"]
#  y = row["Concepticon_Gloss.yo"]
#  pair_tens_1 = np.concatenate((vec_dic[x], vec_dic[y]))
#  pair_tens_2 = np.concatenate((vec_dic[y], vec_dic[x]))
#  for j in range(int(row["colex.freq"])):
#    if r.random() < 0.5:
#      vecs.append(pair_tens_1)
#    else:
#      vecs.append(pair_tens_2)
#alldata = np.stack(vecs)

In [None]:
#@title
#from sklearn.neighbors import KernelDensity
#kde_skl = KernelDensity(bandwidth=0.5)
#kde_skl.fit(alldata)

In [None]:
#@title
#senseToProbVecSmooth = {} 
#for x in senses: 
  #senseToProbVecSmooth[x] = []
  #senseToProbVecSmooth[x] = kde_skl.score_samples(np.stack([np.concatenate((vec_dic[x],vec_dic[y])) for y in senses]))
  #senseToProbVecSmooth[x] = np.array(senseToProbVecSmooth[x])
  #senseToProbVecSmooth[x] = nn.Softmax()(senseToProbVecSmooth[x])
  #print(senseToProbVecSmooth[x])