In [None]:
# Mount Google Drive inside the Colab runtime; the CSV inputs read later live under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')
# IPython magic: switch the working directory to the project folder on the mounted drive.
%cd /content/drive/MyDrive/ColabFiles/

Mounted at /content/drive/
/content/drive/MyDrive/ColabFiles


In [None]:
import pandas as pd
import numpy as np
from scipy.special import softmax
from functools import reduce

In [None]:
# Build one master table, keyed by word, that combines the frequency,
# concreteness, age-of-acquisition and valence norms.

# The sources name their word column differently ('w1', 'WORD', 'Word');
# bring them all to 'Word' right after loading so they can be joined on it.
freqDF = pd.read_csv(
    '/content/drive/MyDrive/ColabFiles/COCA_freqs.csv', encoding='ISO-8859-1'
).rename(columns={'w1': 'Word'})  # column used below: coca_spok
concreteDF = pd.read_csv(
    '/content/drive/MyDrive/ColabFiles/brysbaert_concreteness.csv'
)  # Word, Conc.M
aoaDF = pd.read_csv(
    '/content/drive/MyDrive/ColabFiles/BristolNorms+GilhoolyLogie.csv'
).rename(columns={'WORD': 'Word'})  # column used below: AoA (100-700)
valenceDF = pd.read_csv(
    '/content/drive/MyDrive/ColabFiles/Ratings_Warriner_et_al.csv'
)  # Word, V.Mean.Sum [or A for arousal, D for dominance]

# Successive outer joins on 'Word': a word present in any one source keeps a
# row, so measures missing from the other sources show up as NaN.
allVarsDF = (
    freqDF
    .merge(concreteDF, on='Word', how='outer')
    .merge(aoaDF, on='Word', how='outer')
    .merge(valenceDF, on='Word', how='outer')
    .set_index('Word')
)

# Keep just the four measures used in the following cells, under short names.
filteredVarsDF = allVarsDF[
    ['coca_spok', 'Conc.M', 'AoA (100-700)', 'V.Mean.Sum']
].rename(columns={
    'coca_spok': 'Freq',
    'Conc.M': 'Conc',
    'AoA (100-700)': 'AoA',
    'V.Mean.Sum': 'Val',
})

In [None]:
# Convert valence into an "extremity" score: the distance from the neutral
# midpoint of the rating scale, so strongly negative and strongly positive
# words both score high and neutral words score near 0.
# The Warriner et al. (2013) ratings loaded into 'Val' use a 1-9 scale, so the
# neutral midpoint is 5 (4 would be the midpoint of a 1-7 scale and makes the
# score asymmetric: at most 3 on the negative side but up to 5 on the positive side).
VALENCE_MIDPOINT = 5

filteredVarsDF['ValRate'] = (filteredVarsDF['Val'] - VALENCE_MIDPOINT).abs()

In [None]:
# normalize with softmax

def softmax_with_nan(col):
  """Softmax over `col` in which missing values carry no weight.

  Every NaN in `col` is swapped for -inf before the softmax is taken, so it
  contributes exp(-inf) = 0 to the normalization; the remaining entries are
  normalized among themselves. The output has one weight per input entry
  (0 at the positions that were NaN).
  """
  return softmax(np.nan_to_num(col, nan=float('-inf')))

# Frequency is log-transformed before the softmax: softmax(log f) equals
# f / sum(f), whereas a softmax over the raw counts collapses to [1, 0, 0, 0, ...].
# 'Freq' contains zeros, and log(0) = -inf is exactly what we want here (zero
# weight), so the "divide by zero encountered in log" RuntimeWarning is silenced.
with np.errstate(divide='ignore'):
  log_freq = np.log(filteredVarsDF['Freq'].to_numpy())

# output column -> (raw column that decides missingness, values fed to the softmax)
softmax_inputs = {
    'normalized_freq': ('Freq', log_freq),
    'normalized_conc': ('Conc', filteredVarsDF['Conc']),
    'normalized_aoa': ('AoA', filteredVarsDF['AoA']),
    'normalized_val': ('ValRate', filteredVarsDF['ValRate']),
}

for normalized_col, (raw_col, values) in softmax_inputs.items():
  # softmax_with_nan gives missing entries a weight of 0; put NaN back so that
  # "no rating available" is not confused with "rated, with negligible weight".
  filteredVarsDF[normalized_col] = np.where(
      filteredVarsDF[raw_col].isna(), np.nan, softmax_with_nan(values))

  import sys


In [None]:
# Naive linear combination of the normalized measures.
# The weights below are arbitrary placeholders, not fitted values; revisit them
# once datsemshift data is available.
# Intended order: normalized_freq, normalized_conc, normalized_aoa, normalized_val
beta = [.5, .3, .1, .1]

# Only the frequency and concreteness terms are applied for now; the AoA and
# valence weights (beta[2], beta[3]) are defined but not yet part of the score.
# A NaN in either term makes naive_dist NaN for that word.
filteredVarsDF['naive_dist'] = (
    filteredVarsDF['normalized_freq'].mul(beta[0])
    .add(filteredVarsDF['normalized_conc'].mul(beta[1]))
)

In [None]:
# Quick check of coverage: show only the rows where naive_dist could be
# computed, i.e. the words that have both a frequency and a concreteness value.

filteredVarsDF[filteredVarsDF['naive_dist'].notna()]

Unnamed: 0_level_0,Freq,Conc,AoA,Val,normalized_freq,normalized_conc,normalized_aoa,normalized_val,naive_dist,ValRate
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
the,4433575.0,1.43,,,5.139500e-02,0.000002,,,2.569824e-02,
and,2493266.0,1.52,,,2.890250e-02,0.000003,,,1.445207e-02,
of,2054930.0,1.67,,,2.382121e-02,0.000003,,,1.191155e-02,
a,2045436.0,1.46,,,2.371115e-02,0.000003,,,1.185634e-02,
a,9.0,1.46,,,1.043300e-07,0.000003,,,8.198759e-07,
...,...,...,...,...,...,...,...,...,...,...
perseveration,0.0,2.10,,,0.000000e+00,0.000005,,,1.455949e-06,
answerer,2.0,3.56,,,2.318445e-08,0.000021,,,6.280850e-06,
cufflink,0.0,4.86,,,0.000000e+00,0.000077,,,2.300377e-05,
cornhusk,0.0,4.83,,,0.000000e+00,0.000074,,,2.232390e-05,


In [None]:
# testing some random datsemshift examples

def _naive_dist_of(word):
  """Return the 'naive_dist' score of the first row of filteredVarsDF indexed by `word`."""
  matches = filteredVarsDF.loc[filteredVarsDF.index == word, 'naive_dist']
  if matches.empty:
    # Same exception type as an out-of-bounds .iloc[0], but it names the missing word.
    raise IndexError("word {!r} is not in filteredVarsDF".format(word))
  # The index can hold the same word more than once; use the first occurrence.
  return matches.iloc[0]

# returns p(s1) / p(s2)
def sourceLikely(s1, s2):
  """Ratio of the naive_dist scores of two words: p(s1) / p(s2).

  Args:
    s1: word whose score goes in the numerator.
    s2: word whose score goes in the denominator.

  Returns:
    naive_dist(s1) / naive_dist(s2). The result is NaN when either score is NaN.

  Raises:
    IndexError: if either word has no row in filteredVarsDF.
  """
  return _naive_dist_of(s1) / _naive_dist_of(s2)
  
# A few datsemshift examples, each written as (source word, target word).
datsemshift_pairs = [
    ("moon", "month"),
    ("hear", "obey"),
    ("sun", "day"),  # incorrectly guessing because "day" is very frequent
    ("grasp", "understand"),  # incorrectly guessing because "understand" is very frequent
    ("glass", "mirror"),
]

# One ratio per line: p(source) / p(target).
for source_word, target_word in datsemshift_pairs:
  print(sourceLikely(source_word, target_word))

0.3674488802890861
32.84775249668098
0.08972759072269751
0.046071983754726396
1.0790244592129496
