Overview: All helper functions are defined first. The pipeline is then executed in the following order: 1) load the semantic space, 2) load the data, 3) filter the target words and compute their semantic neighbourhood density, 4) determine the target words' neighbours, 5) compute the different values of OSC and PSC, 6) check the correlation of OSC and PSC with AoA, 7) collect all variables into a final dataframe.

# Import packages

In [1]:
import math
from matplotlib import pyplot 
from tqdm import tqdm
import pickle
import random
from scipy import stats
from old20 import old20
import json
import numpy as np
import pandas as pd
import semspaces
import itertools
import re
import os
import seaborn as sns
import scipy
from jellyfish import levenshtein_distance
import sklearn
import sklearn.metrics.pairwise as smp
from celex.utilities.dictionaries import tokens2ids
from collections import defaultdict
from semspaces.space import SemanticSpace

# Functions to read the data

## AoA

In [2]:
def read_aoa(path):

    """
    Load the Age of Acquisition norms.

    :param path:        the path to the file AoA_ratings_from_all_sources.xlsx from the Age of Acquisition norms
                        collected by Kuperman et al 2012. The exact file can be downloaded here:
                        http://crr.ugent.be/papers/AoA_ratings_from_all_sources.zip
    :return:            a tuple (aoa_words, aoa_norms):
                        - aoa_words: the set of values found in the "Word" column, i.e. the words for which
                          age of acquisition norms were collected
                        - aoa_norms: the DataFrame restricted to the "Word" and "Rating.Mean" columns
    """

    # Only the two columns used downstream are read from the sheet.
    aoa_norms = pd.read_excel(path, usecols=["Word", "Rating.Mean"])

    return set(aoa_norms["Word"]), aoa_norms

## Frequency (SUBTLEX-US)

In [3]:
def read_frequency_subtlex(path):

    """
    Load the SUBTLEX-US frequencies, keyed by lower-cased word.

    :param path:            the path to the csv file containing the SUBTLEX-US frequency values
    :return:                a dict mapping each lower-cased entry of the "Word" column to its "SUBTLWF" value
    """

    table = pd.read_csv(path, header=0)
    lowered_words = table["Word"].str.lower()
    frequencies = table["SUBTLWF"].values

    return pd.Series(frequencies, index=lowered_words).to_dict()


## Concreteness

In [4]:
def read_concreteness_norms(path):

    """
    Load the concreteness norms, keyed by lower-cased word.

    :param path:            the path to the tab-separated .txt file containing the concreteness norms from
                            Brysbaert et al.
    :return:                a dict mapping each lower-cased entry of the "Word" column to its "Conc.M" score
    """

    norms = pd.read_csv(path, sep='\t', header=0)
    keys = norms["Word"].str.lower()
    word2concr = pd.Series(norms['Conc.M'].values, index=keys).to_dict()

    return word2concr


## Valence

In [5]:
def read_valence_norms(path):

    """
    Load the valence norms, keyed by lower-cased word.

    :param path:            the path to the csv file containing the valence values from Brysbaert et al.
    :return:                a dict mapping each lower-cased entry of the "Word" column to its "V.Mean.Sum" score
    """

    ratings = pd.read_csv(path, header=0)
    scores_by_word = pd.Series(
        ratings['V.Mean.Sum'].values,
        index=ratings["Word"].str.lower(),
    )

    return scores_by_word.to_dict()

## OLD20

In [6]:
def read_old20(path):
    """
    Load pre-computed OLD20 values.

    :param path:            the path to the space-separated, header-less file containing the OLD20 values
                            (first column: word, second column: OLD20 value)
    :return:                a dict mapping words to OLD20 values

    If OLD20 csv is missing, use the function compute_old20() which can be found below.
    """

    # header=None: the file has no header row, so columns are addressed by position (0 and 1).
    table = pd.read_csv(path, sep=' ', header=None)
    words, old20_values = table[0], table[1]
    return pd.Series(old20_values.values, index=words).to_dict()


In [7]:
def read_iconicity(path):

    """
    Load the iconicity ratings.

    :param path:            the path to the csv file containing the iconicity ratings
                            (must provide the columns "Word" and "Iconicity")
    :return:                a dict mapping words to iconicity ratings

    NOTE(review): unlike the other readers, the words are NOT lower-cased here - confirm that the
    casing in the ratings file matches the (lower-cased) target words.
    """

    ratings = pd.read_csv(path, header=0)
    word2iconicity = pd.Series(ratings['Iconicity'].values, index=ratings['Word']).to_dict()

    return word2iconicity

# Intersection & Union

In [8]:
def shared_words(*args):

    """
    Intersect an arbitrary number of word collections.

    :param args:    an arbitrary number of sets (other structures are coerced to set, and a message is
                    printed when that happens). Falsy arguments (None, empty collections) are skipped.
    :return:        a new set holding the intersection of all non-skipped input arguments; an empty set
                    when there is nothing to intersect
    """

    # None marks "no argument consumed yet". An empty set cannot serve as that marker: once a running
    # intersection becomes empty it has to stay empty rather than restart from the next argument.
    shared = None
    for i, arg in enumerate(args):
        if not arg:
            continue
        if not isinstance(arg, set):
            print("Input argument {}, originally of type {}, is coerced to set".format(i, type(arg)))
            arg = set(arg)
        # set(arg) copies, so the returned set never aliases a caller-owned set.
        shared = set(arg) if shared is None else shared.intersection(arg)

    return set() if shared is None else shared


def union_words(*args):

    """
    Merge an arbitrary number of word collections.

    :param args:    an arbitrary number of sets (other structures are coerced to set, and a message is
                    printed when that happens). Falsy arguments (None, empty collections) are skipped.
    :return:        the union of the input arguments
    """

    merged = set()
    for position, candidate in enumerate(args):
        if not candidate:
            continue
        if not isinstance(candidate, set):
            print("Input argument {}, originally of type {}, is coerced to set".format(position, type(candidate)))
            candidate = set(candidate)
        # While nothing has been collected yet, the first usable argument is adopted as-is.
        merged = merged.union(candidate) if merged else candidate

    return merged

## Morphological Complexity

In [9]:
def read_mono(path):

    """
    Collect the mono-morphemic words from the MorphoLEX workbook.

    :param path:    str, indicating the path to the morpholex database
    :return:        a set of lower-cased strings indicating the mono-morphemic words found in the morpholex database

    The words are taken from the "MorphoLexSegm" column of the second sheet (sheet_name=1) of the workbook;
    every character other than a-z is removed after lower-casing. Non-string cells (e.g. empty cells, which
    pandas reads as NaN) are skipped explicitly.
    """

    morpholex_df = pd.read_excel(path, sheet_name=1)
    non_letters = re.compile('[^a-z]')

    return {
        non_letters.sub('', segmentation.lower())
        for segmentation in morpholex_df['MorphoLexSegm']
        if isinstance(segmentation, str)
    }


def read_mono_inflected(path):
    
    """
    :param path:    str, indicating the path to the morpholex database
    :return:        a set of lower-cased strings indicating the mono-morphemic words with inflectional morphemes
                    found in the morpholex database

    Only the sheet named '0-1-0' is inspected. For every (Word, MorphoLexSegm) pair whose lower-cased word
    differs from the stripped segmentation ("base"), a list of candidate inflected spellings is generated from
    the base; the word is kept when it appears among those candidates.
    """

    targets = set()
    inflections = ['s', 'ed', 'ing', 'en', "'s", 'er', 'est', 'es', 'ies', 'ings', 'ied']
    # sheet_name=None loads every sheet into a dict {sheet name: DataFrame}.
    morpholex = pd.read_excel(path, sheet_name=None)
    for name, sheet in morpholex.items():
        if name == '0-1-0':
            token2base = pd.Series(sheet['MorphoLexSegm'].values, index=sheet['Word']).to_dict()
            for token, base in token2base.items():
                # Skip numeric cells in the Word column; they have no .lower().
                # NOTE(review): `base` is not type-checked - a non-string segmentation would fail on .strip().
                if type(token) == int or type(token) == float:
                    continue
                # Remove leading/trailing '{', '(', ')' and '}' characters from the segmentation.
                base = base.strip('{{()}}')
                token = token.lower()
                if token != base:
                    # Plain concatenation: base + affix.
                    inflected = [''.join([base, affix]) for affix in inflections]
                    # NOTE(review): these candidates are built from `token` itself plus extra characters, so
                    # none of them can ever equal `token`; doubling of the base's final letter is handled
                    # further down - confirm whether `base` was intended here.
                    reduplicated_final = ''.join([token, token[-1]])
                    inflected.extend([''.join([reduplicated_final, affix]) for affix in inflections])
                    # Trailing 'f'/'fe' of the base replaced by 'v' before adding the affix.
                    if base[-1] == 'f' or token[-2:] == 'fe':
                        f_v_alternation = re.sub('(f|fe)$', 'v', base)
                        inflected.extend([''.join([f_v_alternation, affix]) for affix in inflections])
                    # Final 'e' of the base dropped before adding the affix.
                    if base[-1] == 'e':
                        no_e = re.sub('e$', '', base)
                        inflected.extend([''.join([no_e, affix]) for affix in inflections])
                    # Final 'y' of the base dropped before adding the affix.
                    if base[-1] == 'y':
                        no_y = re.sub('y$', '', base)
                        inflected.extend([''.join([no_y, affix]) for affix in inflections])
                    # Hard-coded exception: 'vertices' is always accepted.
                    if token == 'vertices':
                        inflected.append(token)
                    # Base ending in 'c' gets a 'k' inserted before the affix.
                    if base.endswith('c'):
                        plus_k = base + 'k'
                        inflected.extend([''.join([plus_k, affix]) for affix in inflections])
                    # Final consonant of the base doubled before the affix.
                    if base[-1] in {'b', 'd', 'f', 'g', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'z'}:
                        reduplicated = base + base[-1]
                        inflected.extend([''.join([reduplicated, affix]) for affix in inflections])
                    # Bases ending in 'eau' may take 's' or 'x'.
                    if base.endswith('eau'):
                        inflected.extend([''.join([base, affix]) for affix in ['s', 'x']])

                    # A trailing 'o' followed by one or more of the listed consonants is replaced by 'ou'
                    # before adding the affix (applied unconditionally; a no-op when the pattern is absent).
                    o_to_ou = re.sub('o[bcdfgklmnpqrstvxz]+$', 'ou', base)
                    inflected.extend([''.join([o_to_ou, affix]) for affix in inflections])
                    if token in inflected:
                        targets.add(token)

    return targets


def read_poly(path):
    """
    Collect the poly-morphemic words from the MorphoLEX workbook.

    :param path:    str, indicating the path to the morpholex database
    :return:        a set of lower-cased strings indicating the poly-morphemic words found in the morpholex database

    Sheets are selected by name: the name must consist of exactly three '-'-separated parts, and at least one
    of the first and the last part must differ from '0'. All other sheets are ignored. Words come from the
    "MorphoLexSegm" column; every character other than a-z is removed after lower-casing and non-string cells
    are skipped.

    NOTE(review): sheets such as '0-2-0' (first and last part both '0') are therefore not counted as
    poly-morphemic - confirm this is intended.
    """

    targets = set()
    non_letters = re.compile('[^a-z]')

    # The context manager closes the workbook handle once all sheets have been parsed.
    with pd.ExcelFile(path) as morpholex:
        for name in morpholex.sheet_names:
            parts = name.split('-')
            if len(parts) != 3:
                continue
            first, _middle, last = parts
            if first == '0' and last == '0':
                continue
            sheet = morpholex.parse(name)
            targets.update(
                non_letters.sub('', segmentation.lower())
                for segmentation in sheet['MorphoLexSegm']
                if isinstance(segmentation, str)
            )

    return targets

# Celex 

In [10]:
def get_celex_coverage(words, celex_dict):

    """
    Split the target words according to how many phonological transcriptions CELEX offers for them.

    :param words:       a set of target words to be encoded phonologically
    :param celex_dict:  the dictionary derived from the CELEX database
    :return:    - a set of (word, transcription) tuples for the targets with a unique phonological
                    transcription; hyphens are removed from the transcription
                - a set of (word, tuple of transcriptions) tuples for the targets with more than one
                    phonological transcription
                - a set containing the words among the targets for which no phonological transcription was found
    """

    ids_by_token = tokens2ids(celex_dict)

    unique, ambiguous, missing = set(), set(), set()

    for word in words:
        ids = ids_by_token[word]
        if not ids:
            missing.add(word)
            continue

        transcriptions = {celex_dict['tokens'][token_id]['phon'] for token_id in ids}
        if len(transcriptions) > 1:
            ambiguous.add((word, tuple(transcriptions)))
            continue

        # Exactly one distinct transcription is left at this point.
        (only_transcription,) = transcriptions
        unique.add((word, only_transcription.replace("-", "")))

    return unique, ambiguous, missing

# Function to create morphological complexity variable

In [11]:
def compute_morph_complexity(shared_words, mono_words, mono_inflected_words, poly_words):
    """
    Assign a morphological status to every shared word that MorphoLEX covers.

    :param shared_words:    set (or list), the shared words of all data sets
    :mono_words:            list, monomorphemic words extracted from MorphoLEX
    :mono_inflected_words:  list, monomorphemic words with inflections extracted from MorphoLEX
    :poly_words:            list, polymorphmeic words extracted from MorphoLEX
    :return:                dictionary, words mapped to morphological status (monomorphemic = 0; polymorphemic = 1);
                            words found in none of the three collections are left out. The monomorphemic
                            status takes precedence when a word occurs in both kinds of collection.
    """

    # The collections arrive as lists; converting them once makes each membership test O(1)
    # instead of a linear scan per word.
    monomorphemic = set(mono_words) | set(mono_inflected_words)
    polymorphemic = set(poly_words)

    dd = defaultdict(int)
    for word in shared_words:
        if word in monomorphemic:
            dd[word] = 0
        elif word in polymorphemic:
            dd[word] = 1
    return dd

# Function to compute semantic neighbourhood density

In [12]:
def compute_snd(sem_space, target_words, filter_words, n):
    """
    Compute the semantic neighbourhood density (mean distance to the n closest neighbours).

    :sem_space:      the semantic space
    :target_words:   list, target words
    :filter_words:   list, overlap of word lists of other control variables (see filter_words below)
    :n:              int, number of neighbours to consider
    :return:         dictionary, SND value, higher values refer to sparser semantic neighbourhoods)

    For every target the distances to all words of the space are restricted to the columns listed in
    filter_words, sorted ascending, and the n smallest distances after the first entry are averaged.
    """
    d = defaultdict(float)
    for word in tqdm(target_words):
        distances_df = sem_space.all_distances([word])
        filtered_distances_df = distances_df.loc[:, distances_df.columns.isin(filter_words)]
        sorted_distances = filtered_distances_df.loc[word].sort_values(ascending=True)
        # Position 0 is assumed to be the target itself (distance 0), so it is skipped; this only holds
        # when the target is contained in filter_words.
        # Exactly n neighbours are taken: the slice used to be [1:n+2], which summed n+1 distances
        # while still dividing by n.
        nearest = sorted_distances.iloc[1:n + 1]
        d[word] = sum(nearest) / n
    return d

# Functions to determine neighbours

## Neighbours defined as target-embedded words 

In [13]:
def find_TE_neighbours(targets, reference_words, n_random_phon=15, n_random_orth=10):
    """
    Find, for every target, the reference words that embed it.

    :param targets:         list, target words OR dictionary, target words mapped to phonological represenation
    :param reference_words: list, reference words
    :param n_random_phon:   int, number of randomly drawn reference words used as neighbours when a target has
                            no phonological target-embedded neighbour
    :param n_random_orth:   int, same as n_random_phon for the orthographic case
    :return:                dictionary, target words mapped to neighbours (target-embedded words)

    A reference word is a neighbour when its form (spelling, or phonological transcription when targets is a
    dictionary) contains the target's form as a proper substring. The phonological case relies on the
    module-level `celex` dictionary.

    NOTE(review): the original inline comments gave 19 (phonological) and 13 (orthographic) as the average
    number of neighbours, while the code drew 15 and 10 random words; the defaults keep the values the code
    actually used - confirm which ones are intended.
    """
    dn = defaultdict(list)
    # random.sample needs a sequence; materialise once so sets are accepted as well.
    fallback_pool = list(reference_words)

    if isinstance(targets, dict):
        #create phonological representation of reference words
        ref_phon = {k: v for k, v in get_celex_coverage(reference_words, celex)[0]}
        for target, phon_t in tqdm(targets.items()):
            # ref_phon keys are unique, so no duplicate check is needed. (The former check compared the
            # transcription phon_r against a list of orthographic words.)
            neighbours = [ref for ref, phon_r in ref_phon.items()
                          if phon_t in phon_r and phon_t != phon_r]
            dn[target] = neighbours or random.sample(fallback_pool, n_random_phon)

    else:
        for word in tqdm(targets):
            neighbours = []
            seen = set()
            for ref in reference_words:
                if word in ref and word != ref and ref not in seen:
                    seen.add(ref)
                    neighbours.append(ref)
            dn[word] = neighbours or random.sample(fallback_pool, n_random_orth)

    return dn

## Neighbours defined by Levenshtein edit-distance

In [14]:
def find_LD_neighbours(targets, reference_words, d=None, rn=None, kn=None):
    """
    Find, for every target, the reference words within a Levenshtein-distance threshold.

    :param targets:         list, target words OR dictionary, target words mapped to phonological representation
    :param reference_words: list, reference words
    :param d:               int, maximum edit distance to consider; when given it is used for every target
    :param rn:              int, number of random neighbours to consider if none are found within distance d
                            (currently ignored: the random fallback is disabled)
    :param kn:              int, number of neighbours to consider when d is not predefined; the threshold of a
                            target is then the distance at position kn of its ascending distance list
    :return:                dictionary, target words mapped to a list of (neighbour, distance) tuples with
                            0 < distance <= threshold; targets without any such neighbour are left out
    :raises ValueError:     when neither d nor kn is given

    With kn the threshold is taken at position kn (not kn-1) because the closest entry is assumed to be the
    target itself at distance 0 - this only holds when the target is among the reference words. Ties at
    the threshold are all kept, so a target can end up with more than kn neighbours. The phonological case
    relies on the module-level `celex` dictionary.
    """
    if d is None and kn is None:
        raise ValueError("find_LD_neighbours needs either d or kn")

    if isinstance(targets, dict):
        #create phonological representation of reference words
        ref_forms = {k: v for k, v in get_celex_coverage(reference_words, celex)[0]}
        target_forms = targets
    else:
        ref_forms = {ref: ref for ref in reference_words}
        target_forms = {target: target for target in targets}

    dn = defaultdict(list)
    for target, form_t in tqdm(target_forms.items()):
        # Distances are kept for the current target only; holding them for all targets at once needs
        # len(targets) * len(reference_words) entries.
        distances = {ref: levenshtein_distance(form_t, form_r) for ref, form_r in ref_forms.items()}
        if not distances:
            continue

        if d is not None:
            threshold = d
        else:
            ranked = sorted(distances.values())
            # Clamp so that fewer than kn + 1 reference words do not raise IndexError.
            threshold = ranked[min(kn, len(ranked) - 1)]

        neighbours = [(ref, dis) for ref, dis in distances.items() if 0 < dis <= threshold]
        if neighbours:
            dn[target] = neighbours

    return dn

# Functions to compute OSC & PSC

## OSC & PSC - Target embedded neighbours & frequency normalized (Marelli & Amenta)

In [15]:
def compute_OPSC_ma(tn_dict):
    """
    Frequency-weighted mean of the absolute cosine similarity between a target and its neighbours.

    :param tn_dict:    dictionary, target words mapped to list of target-embedded neighbours
    :return:           dictionary, target words mapped to O/PSC value; NaN when no neighbour contributed
                       any weight (empty neighbour list, no neighbour vector, or zero total frequency)

    Reads the module-level `space` (vectors) and `freq_subtlex` (weights). Duplicate neighbours are counted
    once. A neighbour whose vector lookup or similarity computation raises ValueError is skipped entirely:
    it contributes neither to the numerator nor to the denominator (its frequency used to be added to the
    denominator regardless, which pulled the weighted mean towards 0).
    """
    opsc_dict = defaultdict(float)

    for target in tqdm(tn_dict):
        weighted_sim = 0.0
        total_weight = 0.0
        target_vector = space.get_vector(target)

        for n in set(tn_dict[target]):
            weight = freq_subtlex[n]

            try:
                neighbour_vector = space.get_vector(n)
                sim = sklearn.metrics.pairwise.cosine_similarity(target_vector, neighbour_vector)
            except ValueError:
                continue

            # cosine_similarity returns a 2-D array; a single pair is compared, so take its only element.
            weighted_sim += abs(float(np.asarray(sim).ravel()[0])) * weight
            total_weight += weight

        opsc_dict[target] = weighted_sim / total_weight if total_weight else float("nan")

    return opsc_dict

## OSC & PSC - edit-distance & nr. of neighbours normalized (Hendrix & Sun)

In [16]:
def compute_OPSC_hs(tn_dict):
    """
    Mean of the absolute cosine similarity between a target and its neighbours, each similarity being
    divided by the neighbour's edit distance.

    :param tn_dict:    dictionary, target words mapped to list of (neighbour, distance) tuples
    :return:           dictionary, target words mapped to O/PSC value; NaN when no neighbour could be used

    Reads the module-level `space`. A neighbour whose vector lookup or similarity computation raises
    ValueError is skipped and does not count towards the number of neighbours used for normalisation (it
    used to be counted, which pulled the mean towards 0). Edit distances are expected to be > 0.
    """
    opsc_dict = defaultdict(int)

    for target in tqdm(tn_dict):
        # NOTE(review): "the" is excluded explicitly; the reason is not visible here - confirm.
        if target == "the":
            continue
        total = 0.0
        used = 0
        target_vector = space.get_vector(target)

        for n, ed_dis in tn_dict[target]:
            try:
                neighbour_vector = space.get_vector(n)
                sim = sklearn.metrics.pairwise.cosine_similarity(target_vector, neighbour_vector)
            except ValueError:
                continue

            # cosine_similarity returns a 2-D array; a single pair is compared, so take its only element.
            total += abs(float(np.asarray(sim).ravel()[0])) / ed_dis
            used += 1

        opsc_dict[target] = total / used if used else float("nan")

    return opsc_dict


## Function to check correlation with AoA

In [17]:
def check_correlation(targets, d, aoa_dict):
    """
    Correlate OSC/PSC values with AoA ratings and draw the corresponding scatter plot.

    :param targets:   list, target words
    :param d:         dictionary, target words mapped to OSC or PSC values
    :aoa_dict:        dictionary, target words mapped to AoA ratings
    :return:          tuple (correlation, scatter):
                      - correlation: the result of scipy.stats.pearsonr between the O/PSC values and AoA
                      - scatter: the object returned by pyplot.scatter (O/PSC on the x axis, AoA on the y axis)
    """
    values = [d[target] for target in targets]
    aoa = [aoa_dict[target] for target in targets]
    scatter = pyplot.scatter(values, aoa)
    return stats.pearsonr(values, aoa), scatter

## Function to create final dataframe

In [18]:
def target_values(d_aoa, d_OSC1, d_OSC2, d_PSC1, d_PSC2, d_conc, d_valence, d_freq_subtlex, d_word2old, d_target_phon, d_morph, d_snd, d_iconicity, targets):
    """
    Assemble one row of variables per target word.

    :param d_*:       all dictionaries mapping words to values of variables of interest
    :param targets:   list, sorted target words
    :return:          list of rows, one per target word, laid out as
                      [word, word length, length of the phonological form, aoa, OSC1, OSC2, PSC1, PSC2,
                       concreteness, valence, frequency, old20, snd, iconicity, morph]
    """

    def build_row(word):
        return [
            word,
            len(word),
            len(d_target_phon[word]),
            d_aoa[word],
            d_OSC1[word],
            d_OSC2[word],
            d_PSC1[word],
            d_PSC2[word],
            d_conc[word],
            d_valence[word],
            d_freq_subtlex[word],
            d_word2old[word],
            d_snd[word],
            d_iconicity[word],
            d_morph[word],
        ]

    return [build_row(word) for word in targets]

# Run code

## Load semantic space

In [19]:
# Load the semantic space from 'space.w2v.gz'. `space` is a module-level global that the OSC/PSC
# functions above read directly.
space = SemanticSpace.from_csv('space.w2v.gz', prenorm=True)

## Load the data

In [20]:
#load the celex dictionary (the context manager closes the file handle after parsing)
with open('celex_dict.json', encoding='utf-8') as celex_file:
    celex = json.load(celex_file)

#load AoA ratings
aoa_words, aoa_norms = read_aoa("AoA.xlsx")
aoa_ratings = pd.Series(aoa_norms["Rating.Mean"].values,index=aoa_norms["Word"]).to_dict()

#load concreteness ratings
conc = read_concreteness_norms("concreteness.txt")
conc_words = set(conc.keys())

#load valence ratings
valence = read_valence_norms("valence.csv")
val_words = set(valence.keys())

#get list of words in the embedding space
w2v_words = space.included_words()
w2v_words_list = list(w2v_words)

#load word frequencies
freq_subtlex = read_frequency_subtlex("subtlex.csv")
freq_subtlex_words = set(freq_subtlex.keys())
#drop the NaN key that results from empty cells in the word column
freq_subtlex_words_list = [x for x in freq_subtlex_words if str(x) != 'nan']

#load old20 values
word2old = read_old20("word2old.csv")

#get list of monomorphemic and polymorphemic words
mono = list(read_mono("MorphoLEX_en.xlsx"))
poly = list(read_poly("MorphoLEX_en.xlsx"))
mono_inflected = list(read_mono_inflected("MorphoLEX_en.xlsx"))

#load iconicity ratings
iconicity = read_iconicity("iconicity_ratings.csv")

In [21]:
#flatten all control-variable word collections into one list (duplicates kept) for later filtering
filter_words = [*aoa_words, *conc_words, *val_words, *mono, *poly, *mono_inflected]

## Filter to create list of target words

In [22]:
#find shared words
shared = shared_words(aoa_words,conc_words,val_words,w2v_words_list,freq_subtlex_words, word2old)

#create morphological complexity variable for shared words
word2morph = compute_morph_complexity(shared, mono, mono_inflected, poly)

#only consider words that have a phonological encoding in CELEX
target_words_phon = {k: v for k, v in get_celex_coverage(word2morph.keys(), celex)[0]}

#create list of target words
target_words = sorted(target_words_phon)

#create dictionary mapping words to AoA ratings
#(membership is tested against the dict rather than the sorted list: O(1) per lookup instead of a linear scan)
target_words_aoa = {k: v for k, v in aoa_ratings.items() if k in target_words_phon}

Input argument 3, originally of type <class 'list'>, is coerced to set
Input argument 5, originally of type <class 'dict'>, is coerced to set


In [24]:
#compute semantic neighbourhood density over the 20 nearest neighbours of every target word,
#restricted to the words listed in filter_words
snd = compute_snd(space, target_words, filter_words, 20)

100%|██████████| 2032/2032 [20:13<00:00,  1.67it/s]


In [39]:
# NOTE: pickle can execute arbitrary code while loading; only load cache files that you created yourself.

### target-embedded neighbours (Marelli & Amenta)
# To recompute instead of loading the cache:
#   target_neighbours_TE_orth = find_TE_neighbours(target_words, freq_subtlex_words_list)
with open("target_neighbours_TE_orth.pkl", "rb") as f:
    target_neighbours_TE_orth = pickle.load(f)

# To recompute instead of loading the cache:
#   target_neighbours_TE_phon = find_TE_neighbours(target_words_phon, freq_subtlex_words_list)
with open("target_neighbours_TE_phon.pkl", "rb") as f:
    target_neighbours_TE_phon = pickle.load(f)

# To refresh the caches after recomputing:
#   with open("target_neighbours_TE_orth.pkl", "wb") as f:
#       pickle.dump(target_neighbours_TE_orth, f)
#   with open("target_neighbours_TE_phon.pkl", "wb") as f:
#       pickle.dump(target_neighbours_TE_phon, f)

### edit-distance neighbours (Hendrix & Sun)
# To recompute instead of loading the cache:
#   target_neighbours_LD5_orth = find_LD_neighbours(target_words, freq_subtlex_words_list, kn=5)
with open("target_neighbours_LD5_orth.pkl", "rb") as f:
    target_neighbours_LD5_orth = pickle.load(f)

# To recompute instead of loading the cache:
#   target_neighbours_LD5_phon = find_LD_neighbours(target_words_phon, freq_subtlex_words_list, kn=5)
with open("target_neighbours_LD5_phon.pkl", "rb") as f:
    target_neighbours_LD5_phon = pickle.load(f)

# To refresh the caches after recomputing:
#   with open("target_neighbours_LD5_orth.pkl", "wb") as f:
#       pickle.dump(target_neighbours_LD5_orth, f)
#   with open("target_neighbours_LD5_phon.pkl", "wb") as f:
#       pickle.dump(target_neighbours_LD5_phon, f)

'\nf = open("target_neighbours_LD5_orth.pkl","wb")\npickle.dump(target_neighbours_LD5_orth,f)\nf.close()\n         \nf = open("target_neighbours_LD5_phon.pkl","wb")\npickle.dump(target_neighbours_LD5_phon,f)\nf.close()\n'

In [26]:
# NOTE: pickle can execute arbitrary code while loading; only load cache files that you created yourself.
# Every value below can be recomputed with the call given in the comment above it and written back with
#   with open("<name>.pkl", "wb") as f:
#       pickle.dump(<variable>, f)

### compute OSC values

# Recompute with: OSC_marelli = compute_OPSC_ma(target_neighbours_TE_orth)
with open("OSC_marelli.pkl", "rb") as f:
    OSC_marelli = pickle.load(f)

# Recompute with: OSC_hendrix = compute_OPSC_hs(target_neighbours_LD5_orth)
with open("OSC_hendrix.pkl", "rb") as f:
    OSC_hendrix = pickle.load(f)

### compute PSC values

# Recompute with: PSC_marelli = compute_OPSC_ma(target_neighbours_TE_phon)
with open("PSC_marelli.pkl", "rb") as f:
    PSC_marelli = pickle.load(f)

# Recompute with: PSC_hendrix = compute_OPSC_hs(target_neighbours_LD5_phon)
with open("PSC_hendrix.pkl", "rb") as f:
    PSC_hendrix = pickle.load(f)

'\nf = open("PSC_hendrix.pkl","wb")\npickle.dump(PSC_hendrix,f)\nf.close()\n'

## Preliminary data analysis - check correlation with AoA

In [None]:
### OSC Marelli & Amenta
# Prints the (Pearson result, scatter object) tuple returned by check_correlation.
print(check_correlation(target_words, OSC_marelli,target_words_aoa))

In [None]:
### OSC Hendrix & Sun
# Prints the (Pearson result, scatter object) tuple returned by check_correlation.
print(check_correlation(target_words, OSC_hendrix,target_words_aoa))

In [None]:
### PSC Marelli & Amenta
# Prints the (Pearson result, scatter object) tuple returned by check_correlation.
print(check_correlation(target_words, PSC_marelli,target_words_aoa))

In [None]:
### PSC Hendrix & Sun 
# Prints the (Pearson result, scatter object) tuple returned by check_correlation.
print(check_correlation(target_words, PSC_hendrix,target_words_aoa))

## Create final dataframe to export as csv

In [28]:
# The target words were not filtered on the availability of an iconicity rating, so words without a
# rating are given NaN instead of raising KeyError inside target_values.
iconicity_or_nan = defaultdict(lambda: float("nan"), iconicity)

# target_values expects the iconicity dictionary between snd and the target list; it was missing from
# this call, which raised a TypeError for the missing positional argument.
values = target_values(aoa_ratings, OSC_marelli, OSC_hendrix, PSC_marelli, PSC_hendrix, conc, valence, freq_subtlex, word2old, target_words_phon, word2morph, snd, iconicity_or_nan, target_words)

# One column name per element of the rows built by target_values (15 in total, iconicity included).
final_df = pd.DataFrame(data=values, columns = ["word", "word length", "nr_phon", "aoa", "OSC_m", "OSC_h", "PSC_m", "PSC_h", "conc", "val", "freq_subtlex", "old20", "snd", "iconicity", "morph"])

final_df.to_csv("thesis_data.csv")

# Alternative OSC and PSC definitions 

## OSC & PSC 2 - target embedded & nr. of neighbours normalized

In [None]:
def compute_OPSC2(tn_dict):
    """
    Mean of space.pair_distance between a target and its target-embedded neighbours.

    :param tn_dict:    dictionary mapping target words to list of target-embedded neighbours
    :return:           dictionary mapping target words to OPSC value; NaN when no neighbour could be used

    Reads the module-level `space`. A neighbour for which pair_distance raises ValueError is skipped and
    does not count towards the number of neighbours used for normalisation (it used to be counted, which
    pulled the mean towards 0).

    NOTE(review): this averages pair_distance, whereas compute_OPSC_ma / compute_OPSC_hs average cosine
    similarities - confirm which quantity is intended.
    """
    opsc_dict = defaultdict(int)

    for target in tqdm(tn_dict):
        total = 0
        used = 0
        for n in tn_dict[target]:
            try:
                total += space.pair_distance(target, n)
            except ValueError:
                continue
            used += 1

        opsc_dict[target] = total / used if used else float("nan")

    return opsc_dict

## OSC & PSC 3 - LD neighbours & frequency normalized

In [None]:
def compute_OPSC3(tn_dict):
    """
    Frequency-normalised sum of space.pair_distance between a target and its edit-distance neighbours,
    each term being weighted by the neighbour's frequency and divided by its edit distance.

    :param tn_dict:    dictionary mapping target keys to list of (neighbour, distance) tuples
    :return:           dictionary mapping target words to OPSC value; NaN when no neighbour contributed
                       any frequency

    Reads the module-level `space` and `freq_subtlex`. A neighbour for which pair_distance raises ValueError
    is skipped entirely: it contributes neither to the numerator nor to the denominator (its frequency used
    to be added to the denominator regardless).

    NOTE(review): the numerator is additionally scaled by 1/distance while the denominator sums plain
    frequencies, and pair_distance is used rather than a similarity - confirm both are intended.
    """
    opsc_dict = defaultdict(int)

    for target in tqdm(tn_dict):
        nom = 0
        denom = 0
        for n, dis in tn_dict[target]:
            weight = freq_subtlex[n]
            try:
                distance = space.pair_distance(target, n)
            except ValueError:
                continue
            nom += distance * weight * 1/dis
            denom += weight

        opsc_dict[target] = nom/denom if denom else float("nan")

    return opsc_dict

# Random Baseline Robustness Check

In [59]:
# Matrix of the vectors of every word included in the semantic space.
true_word_vectors = space.word_vectors_matrix(space.included_words())

In [60]:
# Shuffled copy of the vector matrix (np.random.permutation shuffles along the first axis only).
# presumably rows correspond to words, so this reassigns vectors to words at random - TODO confirm.
# NOTE(review): no random seed is set, so the baseline differs between runs.
random_vectors = np.random.permutation(true_word_vectors)