<a href="https://colab.research.google.com/github/o-fugi/FURSPColexification/blob/main/code/Predicting_Semantic_Direction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')
# Switch the working directory to the Drive folder that the later cells read their CSV files from.
%cd /content/drive/MyDrive/ColabFiles/

Mounted at /content/drive/
/content/drive/MyDrive/ColabFiles


In [2]:
import pandas as pd
import re
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from functools import reduce

In [115]:
# Load the per-word factor lexicons. Every frame is reduced to a 'Word' key
# column plus the score column(s) that the later cells look up.

# Valence / arousal / dominance lexicon: tab-separated, read without a header row,
# so the positional columns 0..3 are given names here.
# https://saifmohammad.com/WebPages/nrc-vad.html
val_df = (
    pd.read_csv('/content/drive/MyDrive/ColabFiles/NRC-VAD-Lexicon.txt', sep="\t", header=None)
    .rename(columns={0:'Word', 1:'val', 2:'aro', 3:'dom'})
    .loc[:, ['Word', 'val', 'aro', 'dom']]
)

# Concreteness ratings: the 'Conc.M' column is exposed as 'conc'.
conc_df = (
    pd.read_csv('/content/drive/MyDrive/ColabFiles/brysbaert_concreteness.csv')
    .rename(columns={'Conc.M': 'conc'})
    .loc[:, ['Word', 'conc']]
)

# Frequency table: 'w1' holds the word and 'coca_spok' the count used as 'freq'.
freq_df = (
    pd.read_csv('/content/drive/MyDrive/ColabFiles/COCA_freqs.csv', encoding='ISO-8859-1')
    .rename(columns={'w1':'Word', 'coca_spok':'freq'})
    .loc[:, ['Word', 'freq']]
)

In [116]:
# Load the semantic shift table and copy the cleaned sense columns into
# 'meaning1' / 'meaning2', the names the rest of the notebook reads.
sem_shift_df = pd.read_csv(
    '/content/drive/MyDrive/ColabFiles/Project 10 Datasets/cleaned_dat_sem_shift.csv'
).assign(
    meaning1=lambda frame: frame['meaning1_clean'],
    meaning2=lambda frame: frame['meaning2_clean'],
)

# Manual overrides of two individual senses, addressed by row label.
# NOTE(review): the row labels are hard-coded and presumably only valid for this
# exact version of the CSV -- confirm whenever the file is regenerated.
manual_fixes = [
    (697, 'meaning1', 'furuncul'),
    (1521, 'meaning2', 'geometrid'),
]
for row_label, column, sense in manual_fixes:
    sem_shift_df.at[row_label, column] = sense

In [121]:
# define how we map a sense to concreteness, valence, frequency values

skip_words = {'the', 'an', 'a', 'of', 'in', 'it', 'to', 'for', 'on'} # don't factor these into the score

def senseToValue(s, df, col, average=True, conservative=False):
  """Map a sense string to a single number using per-word values from `df`.

  The sense is lower-cased and split into lookup keys:
    * conservative=False: the characters "(),/" are removed and the sense is
      split on single spaces, so every word is looked up on its own.
    * conservative=True: the sense is split on ' or ' only, so each
      alternative must match a 'Word' entry as a whole phrase.
  Keys listed in `skip_words` are ignored. For every remaining key, the first
  row of `df` whose 'Word' equals the key contributes its `col` value;
  keys without a match, or whose value is NaN or non-numeric, are skipped.

  Args:
    s: the sense text. Anything that is not a string (e.g. a NaN cell) yields NaN.
    df: DataFrame with a 'Word' column and the column named by `col`.
    col: name of the value column to read.
    average: if True return the mean of the matched values, otherwise their sum.
    conservative: selects the splitting strategy described above.

  Returns:
    The mean or sum of the matched values, or np.nan when nothing matched.

  Raises:
    KeyError: if `df` has no 'Word' column or no `col` column. This used to be
      swallowed silently, which made a misspelled column look like "no data".
  """
  if not isinstance(s, str):
    # Blank CSV cells arrive as float NaN; treat them as "no value" instead of crashing on .lower().
    return np.nan

  s = s.lower()
  if conservative:
    phrases = re.split(' or ', s)
  else:
    for i in "(),/":
      s = s.replace(i, "")
    phrases = re.split(' ', s)

  total_val = 0
  val_data = 0  # number of keys that contributed a value
  for p in phrases:
    if p in skip_words:
      continue
    matches = df.loc[df['Word'] == p, col]
    if matches.empty:
      continue  # key is not in the lexicon
    val = matches.iloc[0]
    try:
      if np.isnan(val):
        continue
    except TypeError:
      # Non-numeric cell: np.isnan cannot evaluate it, so it cannot be averaged either.
      continue
    total_val += val
    val_data += 1

  if val_data == 0:
    return np.nan
  return total_val / val_data if average else total_val

In [130]:
# get priors for semshift data

# One lookup table per factor, keyed by sense string. A sense is stored only
# when senseToValue produced a number for it, so later cells can read scores by sense.
conc_dic = {}
freq_dic = {}
val_dic = {}

conservative=True

# (target dict, lexicon frame, score column, average?) for each factor.
# Frequency is summed over the matched entries; concreteness and valence are averaged.
factor_specs = [
  (conc_dic, conc_df, 'conc', True),
  (freq_dic, freq_df, 'freq', False),
  (val_dic, val_df, 'val', True),
]

# Walk the shift pairs row by row and score both senses for every factor.
for x, y in zip(sem_shift_df["meaning1"], sem_shift_df["meaning2"]):
  for cache, lexicon, score_col, use_average in factor_specs:
    for sense in (x, y):
      if sense in cache:
        continue  # already scored for this factor
      score = senseToValue(sense, lexicon, score_col, average=use_average, conservative=conservative)
      if not np.isnan(score):
        cache[sense] = score

In [131]:
# test prior accuracy (concreteness)

# merge concreteness with shift df
conc_sense_df = pd.DataFrame.from_dict(conc_dic, orient='index').reset_index().rename(columns={'index':'Word', 0:'conc'})
# Two left joins attach the score of meaning1 and then of meaning2. Both joins bring in
# a 'conc' column, so pandas suffixes them: conc_x belongs to meaning1, conc_y to meaning2.
shift_conc_df = pd.merge(sem_shift_df, conc_sense_df, left_on='meaning1', right_on='Word', how='left')
shift_conc_df = pd.merge(shift_conc_df, conc_sense_df, left_on='meaning2', right_on='Word', how='left')

# calculate ratio and find accuracy
shift_conc_df['ratio'] = shift_conc_df['conc_x'] / shift_conc_df['conc_y']
# Keep only pairs where both senses have a score; .copy() makes the filtered frame
# independent so that adding a column below does not raise SettingWithCopyWarning.
shift_conc_df = shift_conc_df[shift_conc_df['ratio'].notna()].copy()
# A pair counts as accurate when meaning1 scores strictly higher than meaning2.
shift_conc_df['ratio_accurate'] = shift_conc_df['ratio'] > 1
print(shift_conc_df['ratio_accurate'].value_counts())
# Count via sum() rather than value_counts()[True], which raises KeyError when no pair is accurate.
n_samples = len(shift_conc_df)
n_accurate = int(shift_conc_df['ratio_accurate'].sum())
accuracy = n_accurate / n_samples if n_samples else float('nan')
print("accuracy: ", accuracy, "with ", n_samples , "samples")

True     770
False    278
Name: ratio_accurate, dtype: int64
accuracy:  0.7347328244274809 with  1048 samples


In [132]:
# test prior accuracy (frequency)

# merge frequency with shift df
freq_sense_df = pd.DataFrame.from_dict(freq_dic, orient='index').reset_index().rename(columns={'index':'Word', 0:'freq'})
# Two left joins attach the score of meaning1 and then of meaning2. Both joins bring in
# a 'freq' column, so pandas suffixes them: freq_x belongs to meaning1, freq_y to meaning2.
shift_freq_df = pd.merge(sem_shift_df, freq_sense_df, left_on='meaning1', right_on='Word', how='left')
shift_freq_df = pd.merge(shift_freq_df, freq_sense_df, left_on='meaning2', right_on='Word', how='left')

# calculate ratio and find accuracy
shift_freq_df['ratio'] = shift_freq_df['freq_x'] / shift_freq_df['freq_y']
# Keep only pairs where both senses have a score; .copy() makes the filtered frame
# independent so that adding a column below does not raise SettingWithCopyWarning.
shift_freq_df = shift_freq_df[shift_freq_df['ratio'].notna()].copy()
# A pair counts as accurate when meaning1 scores strictly higher than meaning2.
shift_freq_df['ratio_accurate'] = shift_freq_df['ratio'] > 1
print(shift_freq_df['ratio_accurate'].value_counts())
# Count via sum() rather than value_counts()[True], which raises KeyError when no pair is accurate.
n_samples = len(shift_freq_df)
n_accurate = int(shift_freq_df['ratio_accurate'].sum())
accuracy = n_accurate / n_samples if n_samples else float('nan')
print("accuracy: ", accuracy, "with ", n_samples , "samples")

True     555
False    458
Name: ratio_accurate, dtype: int64
accuracy:  0.5478775913129319 with  1013 samples


In [133]:
# test prior accuracy (valence)

# merge valence with shift df
val_sense_df = pd.DataFrame.from_dict(val_dic, orient='index').reset_index().rename(columns={'index':'Word', 0:'val'})
# Two left joins attach the score of meaning1 and then of meaning2. Both joins bring in
# a 'val' column, so pandas suffixes them: val_x belongs to meaning1, val_y to meaning2.
shift_val_df = pd.merge(sem_shift_df, val_sense_df, left_on='meaning1', right_on='Word', how='left')
shift_val_df = pd.merge(shift_val_df, val_sense_df, left_on='meaning2', right_on='Word', how='left')

# calculate ratio and find accuracy
shift_val_df['ratio'] = shift_val_df['val_x'] / shift_val_df['val_y']
# Keep only pairs where both senses have a score; .copy() makes the filtered frame
# independent so that adding a column below does not raise SettingWithCopyWarning.
shift_val_df = shift_val_df[shift_val_df['ratio'].notna()].copy()
# A pair counts as accurate when meaning1 scores strictly higher than meaning2.
# NOTE(review): a ratio test assumes non-negative valence scores -- confirm the lexicon's scale.
shift_val_df['ratio_accurate'] = shift_val_df['ratio'] > 1
print(shift_val_df['ratio_accurate'].value_counts())
# Count via sum() rather than value_counts()[True], which raises KeyError when no pair is accurate.
n_samples = len(shift_val_df)
n_accurate = int(shift_val_df['ratio_accurate'].sum())
accuracy = n_accurate / n_samples if n_samples else float('nan')
print("accuracy: ", accuracy, "with ", n_samples , "samples")

True     492
False    397
Name: ratio_accurate, dtype: int64
accuracy:  0.5534308211473565 with  889 samples


## accuracy for concreteness, frequency, and valence

#### on cleaned data
NOT conservative (accuracy in %):
* concreteness: 68.5 with 2849 samples
* frequency: 51.6 with 2874 samples
* valence: 50.6 with 2631 samples

conservative (accuracy in %):
* concreteness: 73.5 with 1048 samples
* frequency: 54.8 with 1013 samples
* valence: 55.3 with 889 samples