<a href="https://colab.research.google.com/github/o-fugi/FURSPColexification/blob/main/code/Prediction_with_GMM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/MyDrive/ColabFiles/

Mounted at /content/drive/
/content/drive/MyDrive/ColabFiles


In [2]:
import pandas as pd
import re
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.cluster import SpectralClustering
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from functools import reduce
import matplotlib as mpl
import torch

In [3]:
# install sentence-trasnformers
%%capture
! pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
model = SentenceTransformer('whaleloops/phrase-bert')

In [8]:
# import semantic shift dataset

sem_shift_df = pd.read_csv('/content/drive/MyDrive/ColabFiles/Project 10 Datasets/cleaned_dat_sem_shift.csv')
sem_shift_df['meaning1'] = sem_shift_df['meaning1_clean']
sem_shift_df['meaning2'] = sem_shift_df['meaning2_clean']

sem_shift_df.at[697, 'meaning1'] = 'furuncul'
sem_shift_df.at[1521, 'meaning2'] = 'geometrid'

shift_class_df = sem_shift_df

## PCA

In [9]:
# create a dictionary for the embeddings
vec_dic = {} # This will be a dictionary that easily allows us to access the embedding for all of our senses, saving time. 
error_senses = set()  # This represents the set of senses for which there was a problem converting them to embeddings or concreteness values

for i in range(len(shift_class_df)): # Here we loop through each row of our dataframe, and if we can convert a sense s to an embedding then we set vec_dic[s] = embedding
  row = shift_class_df.iloc[i]
  x = row["meaning1"]
  y = row["meaning2"]

  try:   
    if x not in vec_dic:
      xvec = np.array(model.encode(x))
      vec_dic[x] = xvec
  except:
    error_senses.add(x)

  try:  
    if y not in vec_dic:
      yvec = np.array(model.encode(y))
      vec_dic[y] = yvec
  except: 
    error_senses.add(y)

error_senses = list(error_senses) # List of all senses that could not be converted to embeddings. Should be empty right now with phrase BERT
senses = list(vec_dic.keys()) # List of all concepts

sense_indices = {senses[i]:i for i in range(len(senses))} # sense_indices is a dictionary where its keys are senses and its values are the indices for which the senses appear in our list of senses.

In [10]:
# create dataframe with a pair of embeddings for each shift

all_vars_df = pd.DataFrame()
all_vars_df['meaning1'] = shift_class_df['meaning1']
all_vars_df['meaning2'] = shift_class_df['meaning2']

# #if working with the English database, these are helpful
# all_vars_df['word'] = shift_class_df['Word']
# all_vars_df['type'] = shift_class_df['Type of change']

vec_df = pd.DataFrame.from_dict(vec_dic, orient='index').reset_index().rename(columns={'index':'Word'})
vec_meaning_df = reduce(lambda  left,right: pd.merge(left,right,left_on='meaning1',right_on='Word', how='left'), [all_vars_df, vec_df])
vec_meaning_df = reduce(lambda  left,right: pd.merge(left,right,left_on='meaning2',right_on='Word', how='left'), [vec_meaning_df, vec_df])
vec_meaning_df = vec_meaning_df.drop(['Word_x', 'Word_y'], axis=1)

In [11]:
# create dataframe with one difference embedding for each shift

vec_diff_df = pd.DataFrame()

for i in range(len(model.encode('yikes'))):
  vec_diff_df[i] = vec_meaning_df[str(i) + "_y"] - vec_meaning_df[str(i) + "_x"]

  


In [12]:
# create dataframe with all shifts and difference vectors 

source_shift_df = reduce(lambda  left,right: pd.merge(left,right,left_index=True,right_index=True, how='left'), [all_vars_df, vec_diff_df])

In [259]:
# perform pca, just so the gaussian mixture will perform a little better
# x = source_shift_df.drop(['word', 'meaning1', 'meaning2', 'type'], axis=1).values # if working with English database
x = source_shift_df.drop(['meaning1', 'meaning2'], axis=1).values
scaler = StandardScaler()
scaler.fit(x)
x_scale = scaler.transform(x)

# do PCA
pca = PCA(n_components=50)
pca.fit(x_scale)
components = pca.transform(x_scale)
components_df = pd.DataFrame(data = components)#.rename(columns={0:'PC_1' , 1:'PC_2', 2:"PC_3", 3:'PC_4'})

# merge back into word data
df = reduce(lambda  left,right: pd.merge(left,right, left_index=True, right_index=True), [all_vars_df, components_df])

## GMM 

In [262]:
# group the shifts

n_dimensions = 50
n_components = 17
estimator = GaussianMixture(n_components=n_components, covariance_type='spherical', init_params='kmeans', max_iter=20) # other covariance is "spherical", "diag", "tied"
estimator.fit(df[range(n_dimensions)])



GaussianMixture(covariance_type='spherical', max_iter=20, n_components=17)

In [263]:
label_df = pd.DataFrame(estimator.predict(np.asarray(df[range(50)])))
label_df = label_df.rename(columns={0:'label'})

df = reduce(lambda  left,right: pd.merge(left,right,left_index=True,right_index=True, how='left'), [df, label_df])

## brute force and GMM models

In [277]:
np.inner(train_df[list(range(50))], diff_df)

array([[-80.61279762, -80.61279762, -80.61279762, ..., -80.61279762,
        -80.61279762, -80.61279762],
       [ 88.32587992,  88.32587992,  88.32587992, ...,  88.32587992,
         88.32587992,  88.32587992],
       [-16.05496679, -16.05496679, -16.05496679, ..., -16.05496679,
        -16.05496679, -16.05496679],
       ...,
       [ 16.78483298,  16.78483298,  16.78483298, ...,  16.78483298,
         16.78483298,  16.78483298],
       [ 22.57613166,  22.57613166,  22.57613166, ...,  22.57613166,
         22.57613166,  22.57613166],
       [ 67.31652054,  67.31652054,  67.31652054, ...,  67.31652054,
         67.31652054,  67.31652054]])

In [301]:
np.inner(pd.DataFrame([[1, 2, 3, 4], [3, 4, 5, 6]]), [10, 1, 2, 17])

array([ 86, 146])

In [296]:
(np.linalg.norm(train_df[list(range(50))], axis=1) * np.linalg.norm(diff_df, axis=1))

array([347.27178998, 233.98153196, 257.49929903, ..., 435.21359055,
       452.39203035, 439.78429491])

In [295]:
diff_df = pd.DataFrame([diff_vec_pca[0]])
diff_df = pd.DataFrame(np.repeat(diff_df.values, len(train_df), axis=0), columns=diff_df.columns)

In [314]:
np.inner(train_df[list(range(50))], diff_vec_pca).T / (np.linalg.norm(train_df[list(range(50))], axis=1) * np.linalg.norm(diff_df, axis=1))

array([[-0.23213172,  0.37749082, -0.06234956, ...,  0.03856689,
         0.04990391,  0.15306713]])

In [310]:
(np.linalg.norm(train_df[list(range(50))], axis=1) * np.linalg.norm(diff_df, axis=1))

array([347.27178998, 233.98153196, 257.49929903, ..., 435.21359055,
       452.39203035, 439.78429491])

In [298]:
np.inner(train_df[list(range(50))], diff_df)

array([[-80.61279762, -80.61279762, -80.61279762, ..., -80.61279762,
        -80.61279762, -80.61279762],
       [ 88.32587992,  88.32587992,  88.32587992, ...,  88.32587992,
         88.32587992,  88.32587992],
       [-16.05496679, -16.05496679, -16.05496679, ..., -16.05496679,
        -16.05496679, -16.05496679],
       ...,
       [ 16.78483298,  16.78483298,  16.78483298, ...,  16.78483298,
         16.78483298,  16.78483298],
       [ 22.57613166,  22.57613166,  22.57613166, ...,  22.57613166,
         22.57613166,  22.57613166],
       [ 67.31652054,  67.31652054,  67.31652054, ...,  67.31652054,
         67.31652054,  67.31652054]])

In [303]:
np.inner(train_df[list(range(50))], diff_vec_pca) / (np.linalg.norm(train_df[list(range(50))], axis=1) * np.linalg.norm(diff_df, axis=1))

array([[-0.23213172, -0.34452633, -0.31306026, ..., -0.18522583,
        -0.17819235, -0.18330076],
       [ 0.25434223,  0.37749082,  0.34301406, ...,  0.20294835,
         0.1952419 ,  0.2008391 ],
       [-0.0462317 , -0.06861638, -0.06234956, ..., -0.03688986,
        -0.03548906, -0.03650646],
       ...,
       [ 0.04833342,  0.07173572,  0.06518399, ...,  0.03856689,
         0.03710241,  0.03816606],
       [ 0.06500998,  0.09648681,  0.08767454, ...,  0.05187368,
         0.04990391,  0.05133456],
       [ 0.19384391,  0.28770014,  0.26142409, ...,  0.15467467,
         0.1488013 ,  0.15306713]])

In [374]:
def brute_force(source, potential_targets, similarity='euclidean'):
  average_dists = []

  for target in potential_targets:
    diff_vec = vec_dic[target] - vec_dic[source]
    diff_vec_pca = pca.transform(scaler.transform([diff_vec])) # pca transform
    diff_df = pd.DataFrame([diff_vec_pca[0]])
    diff_df = pd.DataFrame(np.repeat(diff_df.values, len(train_df), axis=0), columns=diff_df.columns)

    if similarity=='cosine':
      dists = np.inner(train_df[list(range(50))], diff_vec_pca).T / (np.linalg.norm(train_df[list(range(50))], axis=1) * np.linalg.norm(diff_df, axis=1))
    else:
      dists = np.linalg.norm(train_df[list(range(50))] - diff_df, axis=1)

    average_dists.append(np.average(dists))
  
  best_target = np.argmin(average_dists)

  return best_target

def dist_from_closest(source, potential_targets, similarity='euclidean'):
  min_dists = []

  for target in potential_targets:
    diff_vec = vec_dic[target] - vec_dic[source]
    diff_vec_pca = pca.transform(scaler.transform([diff_vec])) # pca transform
    diff_df = pd.DataFrame([diff_vec_pca[0]])
    diff_df = pd.DataFrame(np.repeat(diff_df.values, len(train_df), axis=0), columns=diff_df.columns)

    if similarity=='cosine':
      dists = np.inner(train_df[list(range(50))], diff_vec_pca).T / (np.linalg.norm(train_df[list(range(50))], axis=1) * np.linalg.norm(diff_df, axis=1))
    else:
      dists = np.linalg.norm(train_df[list(range(50))] - diff_df, axis=1)
    # print(source, "->", target, "is similar to", train_df.iloc[np.argmin(dists)][['meaning1', 'meaning2']].values)
    
    min_dists.append(np.min(np.abs(dists)))
  
  best_target = np.argmin(min_dists)
  
  # print(min_dists)

  return best_target

def gmm_model(source, potential_targets, similarity='euclidean'):
  average_dists = []

  for target in potential_targets:
    diff_vec = vec_dic[target] - vec_dic[source]
    diff_vec_pca = pca.transform(scaler.transform([diff_vec])) # pca transform

    # closest cluster, by GMM automatic labelling 
    cluster = estimator.predict(diff_vec_pca)[0]

    # filter train dataset to only include that cluster 
    train_df_source = train_df[train_df['label']==cluster].reset_index()

    diff_df = pd.DataFrame([diff_vec_pca[0]])
    diff_df = pd.DataFrame(np.repeat(diff_df.values, len(train_df_source), axis=0), columns=diff_df.columns)

    if similarity=='cosine':
      dists = np.inner(train_df_source[list(range(50))], diff_vec_pca).T / (np.linalg.norm(train_df_source[list(range(50))], axis=1) * np.linalg.norm(diff_df, axis=1))
    else:
      dists = np.linalg.norm(train_df_source[list(range(50))] - diff_df, axis=1)

    average_dists.append(np.average(dists))
  
  best_target = np.argmin(average_dists)

  return best_target

def gmm_dist_from_center(source, potential_targets, similarity='euclidean'):
  dists = []

  for target in potential_targets:
    diff_vec = vec_dic[target] - vec_dic[source]
    diff_vec_pca = pca.transform(scaler.transform([diff_vec])) # pca transform

    # closest cluster, by GMM automatic labelling 
    cluster = estimator.predict(diff_vec_pca)[0]

    if similarity=='cosine':
      dist = np.inner(estimator.means_[cluster], diff_vec_pca).T / (np.linalg.norm(estimator.means_[cluster]) * np.linalg.norm(diff_vec_pca))
    else:
      dist = np.linalg.norm(estimator.means_[cluster] - diff_vec_pca)

    dists.append(dist)
  
  best_target = np.argmin(dists)

  return best_target

In [375]:
# shuffle df for cross-validation

df = df.sample(frac=1).reset_index(drop=True)
k = 5

# keep track of accuracies
dist_avg = []
dist_closest = []
gmm_avg = []
gmm_center = []

for i in range(k):
  if i != 0: 
    continue
  # set up train and test dfs
  test_df = df.loc[(int(len(df)/k)*i):(int(len(df)/k)*(i+1))]
  train_df = df.drop(test_df.index)

  train_df = train_df.reset_index(drop=True)
  test_df = test_df.reset_index(drop=True)

  num_correct_dist_avg = 0
  num_correct_dist_closest = 0
  num_correct_gmm_avg = 0
  num_correct_gmm_center = 0

  # use cosine similarity / euclidean distance

  for i in range(len(test_df)):
    row = test_df.iloc[i]
    source = row['meaning1']
    
    # make list of potential targets to choose from
    potential_targets = []
    potential_targets.append(row['meaning2']) # the ACTUAL target
    df_targets = df[df['meaning1']!=source] # only take elements of the dataframe that don't have source as meaning1 
    potential_targets += list(df_targets['meaning2'].sample(n=4)) # sample 4 random targets

    average_dists = []

    if brute_force(source, potential_targets, similarity='cosine')==0:
      num_correct_dist_avg +=1

    if dist_from_closest(source, potential_targets, similarity='cosine')==0:
      num_correct_dist_closest +=1
    
    if gmm_model(source, potential_targets, similarity='cosine')==0:
      num_correct_gmm_avg += 1

    if gmm_dist_from_center(source, potential_targets, similarity='cosine')==0:
      num_correct_gmm_center +=1

  dist_avg.append(num_correct_dist_avg)
  dist_closest.append(num_correct_dist_closest)
  gmm_avg.append(num_correct_gmm_avg)
  gmm_center.append(num_correct_gmm_center)

In [318]:
print(np.mean(dist_avg)/len(test_df))
print(np.mean(dist_closest)/len(test_df))
print(np.mean(gmm_avg)/len(test_df))
print(np.mean(gmm_center)/len(test_df))

0.5475866757307953
0.5584636301835486
0.5639021074099252
0.5859959211420802


In [321]:
print(np.mean(dist_avg)/len(test_df))
print(np.mean(dist_closest)/len(test_df))
print(np.mean(gmm_avg)/len(test_df))
print(np.mean(gmm_center)/len(test_df))

0.24133242692046228
0.0
0.2688647178789939
0.23487423521414005
