<a href="https://colab.research.google.com/github/o-fugi/FURSPColexification/blob/main/code/Prediction_with_GMM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/MyDrive/ColabFiles/

Mounted at /content/drive/
/content/drive/MyDrive/ColabFiles


In [2]:
import pandas as pd
import re
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.cluster import SpectralClustering
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from functools import reduce
import matplotlib as mpl
import torch

In [3]:
# install sentence-trasnformers
%%capture
! pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
model = SentenceTransformer('whaleloops/phrase-bert')

In [4]:
# import semantic shift dataset

sem_shift_df = pd.read_csv('/content/drive/MyDrive/ColabFiles/Project 10 Datasets/cleaned_dat_sem_shift.csv')
sem_shift_df['meaning1'] = sem_shift_df['meaning1_clean']
sem_shift_df['meaning2'] = sem_shift_df['meaning2_clean']

sem_shift_df.at[697, 'meaning1'] = 'furuncul'
sem_shift_df.at[1521, 'meaning2'] = 'geometrid'

shift_class_df = sem_shift_df

In [5]:
#@title run this cell if you want to load english data

# # english data? 

# shift_class_df = pd.read_csv('/content/drive/MyDrive/ColabFiles/cleaned_classified_df.csv', encoding='ISO-8859-1')

# # remove NaN rows
# shift_class_df['Bleached sense'] = shift_class_df['Bleached sense'].fillna('NaN')
# shift_class_df = shift_class_df[shift_class_df['Bleached sense']!='NaN']

# # remove completely duplicated rows
# shift_class_df = shift_class_df[~shift_class_df.duplicated()]

# # these seem a little outside of our area of study 
# # shift_class_df = shift_class_df[shift_class_df['Type of change']!= 'grammaticalization']
# # shift_class_df = shift_class_df[shift_class_df['Type of change']!= 'synaesthesia']

# allowed_types = ['metaphor', 'narrowing', 'pejoration', 'broadening', 'metonymy', 'amelioration', 'antonymy', 'analogy', 'broadening, metaphor', 'pejoration, homophony']
# shift_class_df = shift_class_df[shift_class_df['Type of change'].isin(allowed_types)]

# shift_class_df = shift_class_df.reset_index()

# # get rid of punctuation in the senses for the English data

# def cleanString(s):
#   s = s.lower()
#   s = s.replace("'", "")
#   s = s.replace(",", "")
#   s = s.replace(";", "")
#   s = s.lower()
#   return s

# shift_class_df['meaning1'] = shift_class_df['meaning1'].apply(cleanString)
# shift_class_df['meaning2'] = shift_class_df['meaning2'].apply(cleanString)

## PCA

In [6]:
# create a dictionary for the embeddings
vec_dic = {} # This will be a dictionary that easily allows us to access the embedding for all of our senses, saving time. 
error_senses = set()  # This represents the set of senses for which there was a problem converting them to embeddings or concreteness values

encoding_len = len(model.encode('yikes'))

for i in range(len(shift_class_df)): # Here we loop through each row of our dataframe, and if we can convert a sense s to an embedding then we set vec_dic[s] = embedding
  row = shift_class_df.iloc[i]
  x = row["meaning1"]
  y = row["meaning2"]

  try:   
    if x not in vec_dic:
      xvec = np.array(model.encode(x))
      vec_dic[x] = xvec
  except:
    error_senses.add(x)

  try:  
    if y not in vec_dic:
      yvec = np.array(model.encode(y))
      vec_dic[y] = yvec
  except: 
    error_senses.add(y)

error_senses = list(error_senses) # List of all senses that could not be converted to embeddings. Should be empty right now with phrase BERT
senses = list(vec_dic.keys()) # List of all concepts

sense_indices = {senses[i]:i for i in range(len(senses))} # sense_indices is a dictionary where its keys are senses and its values are the indices for which the senses appear in our list of senses.

In [7]:
# create dataframe with a pair of embeddings for each shift

all_vars_df = pd.DataFrame()
all_vars_df['meaning1'] = shift_class_df['meaning1']
all_vars_df['meaning2'] = shift_class_df['meaning2']

# #if working with the English database, these are helpful
# all_vars_df['word'] = shift_class_df['Word']
# all_vars_df['type'] = shift_class_df['Type of change']

vec_df = pd.DataFrame.from_dict(vec_dic, orient='index').reset_index().rename(columns={'index':'Word'})
# vec_df_normalized = vec_df[range(encoding_len)].div(np.linalg.norm(vec_df[range(encoding_len)], axis=1), axis=0) # idea from Gemma's paper 
# vec_df_normalized['Word'] = vec_df['Word']
vec_meaning_df = reduce(lambda  left,right: pd.merge(left,right,left_on='meaning1',right_on='Word', how='left'), [all_vars_df, vec_df])
vec_meaning_df = reduce(lambda  left,right: pd.merge(left,right,left_on='meaning2',right_on='Word', how='left'), [vec_meaning_df, vec_df])
vec_meaning_df = vec_meaning_df.drop(['Word_x', 'Word_y'], axis=1)

In [8]:
# create dataframe with one difference embedding for each shift

vec_diff_df = pd.DataFrame()

for i in range(encoding_len):
  vec_diff_df[i] = vec_meaning_df[str(i) + "_y"] - vec_meaning_df[str(i) + "_x"]
  # vec_diff_df[i + encoding_len] = vec_diff_df[i]**2 # idea from Gemma's paper

  


In [9]:
# create dataframe with all shifts and difference vectors 

source_shift_df = reduce(lambda  left,right: pd.merge(left,right,left_index=True,right_index=True, how='left'), [all_vars_df, vec_diff_df])

In [10]:
# perform pca, just so the gaussian mixture will perform a little better
# x = source_shift_df.drop(['word', 'meaning1', 'meaning2', 'type'], axis=1).values # if working with English database
x = source_shift_df.drop(['meaning1', 'meaning2'], axis=1).values
scaler = StandardScaler()
scaler.fit(x)
x_scale = scaler.transform(x)

# do PCA
pca = PCA(n_components=50)
pca.fit(x_scale)
components = pca.transform(x_scale)
components_df = pd.DataFrame(data = components)#.rename(columns={0:'PC_1' , 1:'PC_2', 2:"PC_3", 3:'PC_4'})

# merge back into word data
df = reduce(lambda  left,right: pd.merge(left,right, left_index=True, right_index=True), [all_vars_df, components_df])

In [11]:
#@title running GMM on the whole dataset (not useful for prediction tasks)

# # group the shifts

# n_dimensions = 50
# n_components = 30
# estimator = GaussianMixture(n_components=n_components, covariance_type='full', init_params='kmeans', max_iter=20, random_state=1) # other covariance is "spherical", "diag", "tied"
# estimator.fit(df[range(n_dimensions)])

# label_df = pd.DataFrame(estimator.predict(np.asarray(df[range(50)])))
# label_df = label_df.rename(columns={0:'label'})

# df = reduce(lambda  left,right: pd.merge(left,right,left_index=True,right_index=True, how='left'), [df, label_df])

## testing models

In [17]:
# define prediction models! 

# return the target that's most similar to the source
def get_most_similar(source, potential_targets, similarity='euclidean'):
  dists = []

  for target in potential_targets:
    diff_vec = vec_dic[target] - vec_dic[source]
    dists.append(np.linalg.norm(diff_vec))

  best_target = np.argmin(dists)
  rank = list(np.argsort(dists)).index(0) + 1

  return rank

# return the shift that's closest to all other shifts in the train dataset, on average
def get_dist_avg(source, potential_targets, similarity='euclidean'):
  average_dists = []

  for target in potential_targets:
    diff_vec = vec_dic[target] - vec_dic[source]
    diff_vec_pca = pca.transform(scaler.transform([diff_vec])) # pca transform
    diff_df = pd.DataFrame([diff_vec_pca[0]])
    diff_df = pd.DataFrame(np.repeat(diff_df.values, len(train_df), axis=0), columns=diff_df.columns)

    if similarity=='cosine':
      dists = np.inner(train_df[list(range(50))], diff_vec_pca).T / (np.linalg.norm(train_df[list(range(50))], axis=1) * np.linalg.norm(diff_df, axis=1))
    else:
      dists = np.linalg.norm(train_df[list(range(50))] - diff_df, axis=1)

    average_dists.append(np.average(dists))
  
  best_target = np.argmin(average_dists)
  rank = list(np.argsort(average_dists)).index(0) + 1

  return rank

# return the shift that's the most similar to any other shift (this is the "analogy" model)
def get_dist_closest(source, potential_targets, similarity='euclidean'):
  min_dists = []

  for target in potential_targets:
    diff_vec = vec_dic[target] - vec_dic[source]
    diff_vec_pca = pca.transform(scaler.transform([diff_vec])) # pca transform
    diff_df = pd.DataFrame([diff_vec_pca[0]])
    diff_df = pd.DataFrame(np.repeat(diff_df.values, len(train_df), axis=0), columns=diff_df.columns)

    if similarity=='cosine':
      dists = np.inner(train_df[list(range(50))], diff_vec_pca).T / (np.linalg.norm(train_df[list(range(50))], axis=1) * np.linalg.norm(diff_df, axis=1))
    else:
      dists = np.linalg.norm(train_df[list(range(50))] - diff_df, axis=1)
    # print(source, "->", target, "is similar to", train_df.iloc[np.argmin(dists)][['meaning1', 'meaning2']].values)
    
    min_dists.append(np.min(np.abs(dists)))
  
  best_target = np.argmin(min_dists)
  rank = list(np.argsort(min_dists)).index(0) + 1
  
  return rank

# return the shift that's closest to all other shifts in its GMM cluster, on average
def get_gmm_avg(source, potential_targets, similarity='euclidean'):
  average_dists = []

  for target in potential_targets:
    diff_vec = vec_dic[target] - vec_dic[source]
    diff_vec_pca = pca.transform(scaler.transform([diff_vec])) # pca transform

    # closest cluster, by GMM automatic labelling 
    cluster = estimator.predict(diff_vec_pca)[0]

    # filter train dataset to only include that cluster 
    train_df_source = train_df[train_df['label']==cluster].reset_index()

    diff_df = pd.DataFrame([diff_vec_pca[0]])
    diff_df = pd.DataFrame(np.repeat(diff_df.values, len(train_df_source), axis=0), columns=diff_df.columns)

    if similarity=='cosine':
      dists = np.inner(train_df_source[list(range(50))], diff_vec_pca).T / (np.linalg.norm(train_df_source[list(range(50))], axis=1) * np.linalg.norm(diff_df, axis=1))
    else:
      dists = np.linalg.norm(train_df_source[list(range(50))] - diff_df, axis=1)

    average_dists.append(np.average(dists))
  
  best_target = np.argmin(average_dists)
  rank = list(np.argsort(average_dists)).index(0) + 1

  return rank

# return the shift that's closest to the center of its GMM cluster
def get_gmm_center(source, potential_targets, similarity='euclidean'):
  dists = []

  for target in potential_targets:
    diff_vec = vec_dic[target] - vec_dic[source]
    diff_vec_pca = pca.transform(scaler.transform([diff_vec])) # pca transform

    # closest cluster, by GMM automatic labelling 
    cluster = estimator.predict(diff_vec_pca)[0]

    if similarity=='cosine':
      dist = np.inner(estimator.means_[cluster], diff_vec_pca).T / (np.linalg.norm(estimator.means_[cluster]) * np.linalg.norm(diff_vec_pca))
    else:
      dist = np.linalg.norm(estimator.means_[cluster] - diff_vec_pca)

    dists.append(dist)
  
  best_target = np.argmin(dists)

  #debug
  # if best_target==0:
  #   target = potential_targets[0]
  #   diff_vec = vec_dic[target] - vec_dic[source]
  #   diff_vec_pca = pca.transform(scaler.transform([diff_vec])) # pca transform
  #   cluster = estimator.predict(diff_vec_pca)[0]
  #   print(source, "->", potential_targets[0], "belongs to cluster", cluster, gmm_labels[0][cluster], "->", gmm_labels[1][cluster])

  rank = list(np.argsort(dists)).index(0) + 1

  return rank

# same as above, but is the sum across all clusters, weighted by the probability that the shift is in that cluster
def get_gmm_weighted(source, potential_targets):
  dists = []

  for target in potential_targets:
    diff_vec = vec_dic[target] - vec_dic[source]
    diff_vec_pca = pca.transform(scaler.transform([diff_vec])) # pca transform

    dist = 0

    for center, prob in zip(estimator.means_, estimator.predict_proba(diff_vec_pca)[0]):
      dist += np.linalg.norm(center - diff_vec_pca) * prob

    dists.append(dist)
  
  best_target = np.argmin(dists)

  rank = list(np.argsort(dists)).index(0) + 1

  return rank

# returns the shift with the highest probability of being in any cluster 
def get_gmm_prob(source, potential_targets):
  probs = []

  for target in potential_targets:
    diff_vec = vec_dic[target] - vec_dic[source]
    diff_vec_pca = pca.transform(scaler.transform([diff_vec])) # pca transform

    prob = max(estimator.predict_proba(diff_vec_pca)[0])

    probs.append(prob)
  
  best_target = np.argmax(probs)

  rank = list(np.argsort(probs)[::-1]).index(0) + 1

  return rank

In [13]:
# define the method of getting a list of targets whose similarity to the source is about the same as the actual target
# e.g., if dist("food", "meat") = 2, then this returns a list of targets whose distance from "food" is about 2

def get_potential_targets(source, target):
  potential_targets = []
  potential_targets.append(target) # the ACTUAL target
  df_targets = df[df['meaning1']!=source][['meaning2']] # only take elements of the dataframe that don't have source as meaning1 
  df_targets = df_targets.drop_duplicates()
  df_targets = df_targets[df_targets['meaning2']!=target]

  df_targets = reduce(lambda  left,right: pd.merge(left,right,left_on='meaning2',right_on='Word', how='left'), [df_targets, vec_df])
  dists = np.linalg.norm(df_targets[range(encoding_len)] - vec_dic[source], axis=1) # get list of similarities between source and ALL targets
  dists = abs(dists - np.linalg.norm(vec_dic[target] - vec_dic[source])) # ge

  indices = np.argsort(dists)[0:4]

  potential_targets += list(df_targets.iloc[indices]['meaning2']) # take the 4 targets whose similarity to the source is closest to the actual similarity

  # potential_targets += list(df_targets['meaning2'].sample(n=4))

  return potential_targets

In [14]:
#@title debugging -- get the labels for each of the clusters

# # get list of adjectives from frequency database

# freq_df = pd.read_csv('/content/drive/MyDrive/ColabFiles/COCA_freqs.csv', encoding='ISO-8859-1') # w1, coca_spok
# adjective_df = freq_df[freq_df['c1']=='jj'].sort_values(by='SOAP', ascending=False)
# adjectives = list(adjective_df.head(500)['w1'])

# # get the embedding of those adjectives

# adj_embeddings = model.encode(adjectives, convert_to_tensor=True)

# top_k = 5 # how many adjectives do you want? 

# def toTensor(arr):
#   return torch.tensor(arr, dtype = torch.float32) #Function to easily convert arrays to tensors

# # function to get labels of the gmm clusters ... only necessary for debugging, mostly

# def get_labels(estimator):
#   source_words = []
#   target_words = []

#   for i in range(len(estimator.means_)):
#       # target
#       targets = []

#       query_embedding = toTensor(estimator.means_[i])

#       cos_scores = util.cos_sim(query_embedding, toTensor(pca.transform(adj_embeddings)))[0]
#       top_results = torch.topk(cos_scores, k=top_k)

#       for score, idx in zip(top_results[0], top_results[1]):
#         targets.append(adjectives[idx])

#       # source
#       sources = []

#       query_embedding = -toTensor(estimator.means_[i])

#       cos_scores = util.cos_sim(query_embedding, toTensor(pca.transform(adj_embeddings)))[0]
#       top_results = torch.topk(cos_scores, k=top_k)

#       for score, idx in zip(top_results[0], top_results[1]):
#         sources.append(adjectives[idx])
      
#       source_words.append(sources)
#       target_words.append(targets)

#   return source_words, target_words

In [20]:
# shuffle df for cross-validation

n_components = 75
n_dimensions = 50

df = df.sample(frac=1).reset_index(drop=True)
k = 5

# keep track of accuracies
dist_avg = []
dist_closest = []
gmm_avg = []
gmm_center = []
most_similar = []
gmm_weighted = []
gmm_prob = []

dist_avg_mrr = []
dist_closest_mrr = []
gmm_avg_mrr = []
gmm_center_mrr = []
most_similar_mrr = []
gmm_weighted_mrr = []
gmm_prob_mrr = []

for i in range(k):
  # if i != 0:
  #  continue
  # set up train and test dfs
  test_df = df.loc[(int(len(df)/k)*i):(int(len(df)/k)*(i+1))]
  train_df = df.drop(test_df.index)

  train_df = train_df.reset_index(drop=True)
  test_df = test_df.reset_index(drop=True)

  # train and label GMM data
  estimator = GaussianMixture(n_components=n_components, covariance_type='spherical', init_params='kmeans', max_iter=30, random_state=1) # other covariance is "spherical", "diag", "tied"
  estimator.fit(train_df[range(n_dimensions)])

  label_df = pd.DataFrame(estimator.predict(np.asarray(train_df[range(50)])))
  label_df = label_df.rename(columns={0:'label'})
  train_df = reduce(lambda  left,right: pd.merge(left,right,left_index=True,right_index=True, how='left'), [train_df, label_df])

  # only run this if you've run the "debugging" cell above -- for labelling each of the clusters
  # gmm_labels = get_labels(estimator)

  num_correct_most_similar = 0
  num_correct_dist_avg = 0
  num_correct_dist_closest = 0
  num_correct_gmm_avg = 0
  num_correct_gmm_center = 0
  num_correct_gmm_weighted = 0
  num_correct_gmm_prob = 0

  total_most_similar_mrr = 0
  total_dist_avg_mrr = 0
  total_dist_closest_mrr = 0
  total_gmm_avg_mrr = 0
  total_gmm_center_mrr = 0
  total_gmm_weighted_mrr = 0
  total_gmm_prob_mrr = 0

  # use cosine similarity / euclidean distance

  for i in range(len(test_df)):
    row = test_df.iloc[i]
    source = row['meaning1']
    target = row['meaning2']
    
    # make list of potential targets to choose from
    potential_targets = get_potential_targets(source, target)
    # print(source, potential_targets)

    average_dists = []

    most_similar_result = get_most_similar(source, potential_targets)
    if most_similar_result==1:
      num_correct_most_similar+=1
    total_most_similar_mrr += 1/most_similar_result

    dist_avg_result = get_dist_avg(source, potential_targets)
    if dist_avg_result==1:
      num_correct_dist_avg +=1
    total_dist_avg_mrr += 1/dist_avg_result

    dist_closest_result = get_dist_closest(source, potential_targets)
    if dist_closest_result==1:
      num_correct_dist_closest +=1
    total_dist_closest_mrr += 1/dist_closest_result
    
    gmm_avg_result = get_gmm_avg(source, potential_targets)
    if gmm_avg_result==1:
      num_correct_gmm_avg += 1
    total_gmm_avg_mrr += 1/gmm_avg_result

    gmm_center_result = get_gmm_center(source, potential_targets)
    if gmm_center_result==1:
      num_correct_gmm_center +=1
    total_gmm_center_mrr+= 1/gmm_center_result

    gmm_weighted_result = get_gmm_weighted(source, potential_targets)
    if gmm_weighted_result==1:
      num_correct_gmm_weighted+=1
    total_gmm_weighted_mrr += 1/gmm_weighted_result

    gmm_prob_result = get_gmm_prob(source, potential_targets)
    if gmm_prob_result==1:
      num_correct_gmm_prob+=1
    total_gmm_prob_mrr += 1/gmm_prob_result

  most_similar.append(num_correct_most_similar)
  dist_avg.append(num_correct_dist_avg)
  dist_closest.append(num_correct_dist_closest)
  gmm_avg.append(num_correct_gmm_avg)
  gmm_center.append(num_correct_gmm_center)
  gmm_weighted.append(num_correct_gmm_weighted)
  gmm_prob.append(num_correct_gmm_prob)

  most_similar_mrr.append(total_most_similar_mrr)
  dist_avg_mrr.append(total_dist_avg_mrr)
  dist_closest_mrr.append(total_dist_closest_mrr)
  gmm_avg_mrr.append(total_gmm_avg_mrr)
  gmm_center_mrr.append(total_gmm_center_mrr)
  gmm_weighted_mrr.append(total_gmm_weighted_mrr)
  gmm_prob_mrr.append(total_gmm_prob_mrr)



In [None]:
#@title manually evaluate GMM model

# n = 15
# print(gmm_labels[0][n])
# print(gmm_labels[1][n])
# train_df[train_df['label']==n][['meaning1', 'meaning2']].head(30)

In [21]:
# Summarize accuracies
print("Accuracy")
print("most similar", np.mean(most_similar)/len(test_df))
print("average distance (brute force 1)", np.mean(dist_avg)/len(test_df))
print("analogy (brute force 2)", np.mean(dist_closest)/len(test_df))
print("average distance GMM", np.mean(gmm_avg)/len(test_df))
print("dist to center GMM", np.mean(gmm_center)/len(test_df))
print("dist to center weighted GMM", np.mean(gmm_weighted)/len(test_df))
print("probability of most probable cluster GMM", np.mean(gmm_prob)/len(test_df))
print("\nMRR")
print("most similar", np.mean(most_similar_mrr))
print("average distance (brute force 1)", np.mean(dist_avg_mrr))
print("analogy (brute force 2)", np.mean(dist_closest_mrr))
print("average distance GMM", np.mean(gmm_avg_mrr))
print("dist to center GMM", np.mean(gmm_center_mrr))
print("dist to center weighted GMM", np.mean(gmm_weighted_mrr))
print("probability of most probable cluster GMM", np.mean(gmm_prob_mrr))

Accuracy
most similar 0.09575551782682512
average distance (brute force 1) 0.21765704584040746
analogy (brute force 2) 0.36536502546689303
average distance GMM 0.31646859083191853
dist to center GMM 0.3174872665534805
dist to center weighted GMM 0.31646859083191853
probability of most probable cluster GMM 0.2764006791171477

MRR
most similar 245.5400000000006
average distance (brute force 1) 277.7366666666661
analogy (brute force 2) 344.89999999999935
average distance GMM 324.6966666666661
dist to center GMM 322.93999999999926
dist to center weighted GMM 322.8099999999993
probability of most probable cluster GMM 303.1699999999995
