In [None]:
import sys
import os
import random
import string
import math
import pickle
import itertools
from collections import Counter, defaultdict

#import torch
#from transformers import *

import numpy as np

import scipy.interpolate.interpnd
from nltk.tokenize import sent_tokenize
from scipy.spatial.distance import cosine
from scipy import stats

In [3]:
def make_grammar(sent,target_words):
    morph_properties = {w: {} for w in target_words}
    syntax_properties = {w: {} for w in target_words}
    for line in sent:
        if not line.strip():
            continue
        if line.startswith("# "):
            continue
        (
            identifier,
            form,
            lemma,
            pos,
            xpos,
            feats,
            head,
            rel,
            enh,
            misc,
        ) = line.strip().split("\t")
        if lemma in target_words:
            if target_words[lemma]:
                if pos != target_words[lemma]:
                    continue
            if feats not in morph_properties[lemma]:
                morph_properties[lemma][feats] = 0
            morph_properties[lemma][feats] += 1
            if rel not in syntax_properties[lemma]:
                syntax_properties[lemma][rel] = 0
            syntax_properties[lemma][rel] += 1
    morph_properties = [{key:val} for key, val in morph_properties.items() if val]
    syntax_properties = [{key:val} for key, val in syntax_properties.items() if val]

    return(morph_properties, syntax_properties)

In [4]:
def get_sent_profiles(file, target_words):
    scores = []
    scores_file = open(file, 'r', encoding='utf8')
    line_list = list(scores_file.readlines())
    i = 0
    while i < len(line_list):
        scores.append(line_list[i])
        i += 1
    sents=[]
    for i,line in enumerate(scores):
        if not line.strip():
            continue

        if line.startswith("# ") and line.find('sent_id')!=-1:
            sent_id = line.split()[-1]
            sents.append(i)
    scores_arr  = []
    idx = 0
    while idx+1 < len(sents):
        cur_val = sents[idx]
        next_val = sents[idx+1]
        sent = scores[cur_val+2:next_val]
        scores_arr.append(sent)
        idx+=1
    sent_profiles = {}
    for i,sent in enumerate(scores_arr):
        morph_properties, syntax_properties = make_grammar(sent, target_words)
        if morph_properties:
            sent_profiles['1.'+str(i+1)] = morph_properties
    word_stat = {w: {'sents':[],'features':{}} for w in target_words}
    for idx, num in enumerate(sent_profiles):
        profile = sent_profiles[num]
        for prof in profile:
                for key, value in prof.items():
                    for val in value:
                        if word_stat[key]['features'].get(val):
                            word_stat[key]['features'][val]+=1
                            word_stat[key]['sents'].append(num)
                        else:

                            word_stat[key]['features'][val]=1
                            word_stat[key]['sents'].append(num)
    sent_data = {w:word_stat[w]['sents'] for w in word_stat}
    return sent_data, sent_profiles

In [5]:
old_sent_data, old_sent_profiles = get_sent_profiles('kk_parsed_old_ES.conllu', target_words)
modern_sent_data, modern_sent_profiles = get_sent_profiles('kk_parsed_modern_ES.conllu', target_words)

In [6]:
bad_chars = set(string.punctuation) - set("_")

def clean(s) :
    global bad_chars

    #s = s.replace('_nn','').replace('_vb','')

    if "'" in s :
        return "" # ignore contractions
    if "’" in s :
        return "" # ignore contractions
    #if "-" in s :
    #    return "" # ignore hyphenated words
    if s != 'f5' :
        for n in string.digits :
            if n in s :
                return "" # ignore anything with a number
    return ''.join([ c if c not in bad_chars else '' for c in s ])

# returns [(string,[w0,w1,...wN]),(string,[w0,w1,...wN]),...]
def read_data(fname) :
    sentences = []
    count = 0
    rejected = 0
    stop = False
    print("reading {} ...".format(fname))
    with open(fname, encoding="utf8") as f :
        for line in f :
            line = line.strip()
            if not line : continue
            #for sent in sent_tokenize(line) :
            if 1 :
                sent = line
                words = sent.split()
                #if len(words) < 5 :
                #    rejected += 1
                #    continue
                words = [ clean(w) for w in words ]
                words = set([ w for w in words if (w != '') ])
                sentences.append((sent,words))
                count += 1
    print("  - read {} sentences (rejected {})".format(count, rejected))
    return sentences


# In[3]:


def process_sentences(corpus, tokenizer, word_list) :
    for text,words in corpus :
        if not any([ w in words for w in word_list]) :
            continue

        text = text.replace('_nn', '').replace('_vb', '')

        tokens = tokenizer.tokenize("[CLS] " + text + " [SEP]")
        tokens = [ t for t in tokens if t != '[UNK]' ] # emoji are [UNK]
        if len(tokens) > 512 :
            continue
        yield tokens,tokenizer.convert_tokens_to_ids(tokens)


# In[4]:


def find_tokens2(tokens, words_list) :
    tmp = []
    word = ""
    offset = 1 # we need to see a word without '##', so offset is always at least 1

    for i in range(len(tokens)) :
        current = tokens[i]
        if current.startswith("##") :
            word += current.replace("##", "")
            offset += 1
        else :
            if word and (word in words_list) :
                tmp.append((i-offset, word))
            offset = 1
            word = current

    if word and (word in words_list) :
        tmp.append((i-offset, word))

    return tmp


# In[5]:


def mk_batch(tokens_list) :
    mx = max([ len(t) for t in tokens_list ])
    tokens = np.array([ t + ([0] * (mx-len(t))) for t in tokens_list ])
    mask = np.where(tokens != 0, 1, 0)
    return torch.LongTensor(tokens), torch.LongTensor(mask)


# In[6]:


def process_batch(token_batch, ids_batch, model, tokenizer, words_counters) :
    tokens,segments = mk_batch(ids_batch)
    tokens = tokens.to("cuda")      # gpu
    segments = segments.to("cuda")  # gpu

    with torch.no_grad() :
        _, _, hidden_states = model(tokens, segments)

    hidden_states = torch.stack(hidden_states, dim=0)
    hidden_states = hidden_states.permute(1,2,0,3)

    embeddings = []
    # sum the last 4 hidden layers
    # to extract sub-word embeddings
    for index,sentence in enumerate(hidden_states) :
        for i,w in  find_tokens2(token_batch[index], words_counters) :
            wt = tokenizer.tokenize(w)

            sum_vec = torch.sum(sentence[i][-4:], dim=0)

            for j in range(1, len(wt)) :
                sum_vec += torch.sum(sentence[i+j][-4:], dim=0)

            sum_vec /= len(wt)
            sum_vec = sum_vec.cpu().numpy()

            embeddings.append((w, sum_vec))
            words_counters[w] -= 1

    return embeddings


# In[7]:


def avg_embeddings(emb) :
    return np.sum(emb, axis=0) / len(emb)


# In[8]:


def get_wordcount(sentences) :
    c = Counter()
    for sentence in sentences :
        text,words = sentence
        c.update(words)
    return c


# In[9]:


def get_wordlist(fname) : #wc1, wc2) :
    #return list(set(wc1).intersection(set(wc2)))
    tmp = []
    with open(fname) as f :
        f.readline() # header
        for line in f :
            line = line.strip().split(',')[1]
            tmp.append(line)
    return tmp

# In[10]:


def extract_embeddings(sentences, word_list, max_word, emb_fname, model, tokenizer, max_batch) :
    random.shuffle(sentences)
    words_counters = dict([ (w.split('_')[0], max_word) for w in word_list ])
    token_batch = []
    ids_batch = []
    embeddings = []
    count = 0
    total_words = len(word_list)
    
    for tokens,ids in process_sentences(sentences, tokenizer, word_list) :
        token_batch.append(tokens)
        ids_batch.append(ids)
        
        if len(token_batch) < max_batch :
            continue
        
        embeddings.extend(process_batch(token_batch, ids_batch, model, tokenizer, words_counters))
        count += 1
        token_batch = []
        ids_batch = []
        
        to_delete = []
        for w in words_counters :
            if words_counters[w] <= 0 :
                to_delete.append(w)
        for w in to_delete :
            del words_counters[w]
        print("\rbatch {} - progress={}/{}".format(count, len(words_counters), total_words), end="", flush=True)
        
    if len(token_batch) > 0 :
        embeddings.extend(process_batch(token_batch, ids_batch, model, tokenizer, words_counters))
    
    print("done!")

    print("\nsyncing to disk...")
    with open(emb_fname, 'wb') as f :
        pickle.dump(embeddings, f)
        
    print("done!")

    return embeddings

# In[11]:


def load_embeddings(fname) :
    with open(fname, 'rb') as f :
        tmp = pickle.load(f)
        
    tmp2 = defaultdict(list)
    for word,vector in tmp :
        tmp2[word].append(np.array(vector))
        
    print("read {} embeddings for {} words ({})".format(len(tmp), len(tmp2), fname))
    
    for word,vectors in tmp2.items() :
        tmp2[word] = np.array(vectors)
        
    return tmp2


# In[12]:


def num_combinations(n, r) :
    try :
        return math.factorial(n) / (math.factorial(r) * math.factorial(n - r))
    except :
        return 1e6

In [7]:
wordlist_fname = 'target_words_evaluation_phase2.txt'

#results_fname = '3_28.03_permutation_results.txt' # 'shift_simulation_6_results.txt'

In [8]:
words = open('target_words_evaluation_phase2.txt','r', encoding = 'utf-8').read().split('\n')
target_words = {w:[] for w in words}
target_words

{'': [],
 'actitud': [],
 'ataque': [],
 'atrás': [],
 'ausencia': [],
 'avance': [],
 'banco': [],
 'canal': [],
 'capital': [],
 'cobrar': [],
 'colaborar': [],
 'compasión': [],
 'copiar': [],
 'corriente': [],
 'cólera': [],
 'declinar': [],
 'demá': [],
 'diligencia': [],
 'disco': [],
 'distribuir': [],
 'educado': [],
 'elocuente': [],
 'encargado': [],
 'enterar': [],
 'especulación': [],
 'fallar': [],
 'fallecimiento': [],
 'historia': [],
 'historiador': [],
 'impulso': [],
 'indicativo': [],
 'juguete': [],
 'maduro': [],
 'maravilloso': [],
 'marco': [],
 'matiz': [],
 'metal': [],
 'metro': [],
 'modificado': [],
 'médula': [],
 'nombrar': [],
 'pendiente': [],
 'pila': [],
 'planta': [],
 'prima': [],
 'propiamente': [],
 'próximo': [],
 'recomendar': [],
 'recordar': [],
 'retroceder': [],
 'satélite': [],
 'socialista': [],
 'solicitud': [],
 'susceptible': [],
 'tarea': [],
 'trato': [],
 'tropical': [],
 'variedad': [],
 'viernes': [],
 'visita': [],
 'vuestro': []}

In [9]:
def collect_word_properties(properties):
    props = defaultdict(lambda: defaultdict(int))
    for features, count in properties.items():
        separate_features = features.split("|")
        for feat in separate_features:
            try:
                k, v = feat.split("=")
            except ValueError:
                continue
            else:
                props[k][v] += count
    return props

In [10]:
def compute_distance_from_common_features(p1, p2, threshold, distance_type):
    features = find_features(p1, p2, threshold)
    vector_1, vector_2 = make_vectors(features, p1, p2)
    return compute_distance(vector_1, vector_2, distance_type)

In [11]:
def make_vectors(features, p1, p2):
    vector_1 = np.zeros(len(features))
    vector_2 = np.zeros(len(features))

    for nr, feature in enumerate(features):
        vector_1[nr] = p1.get(feature, 0)
        vector_2[nr] = p2.get(feature, 0)

    return vector_1, vector_2

In [12]:
def compute_distance(vector_1, vector_2, distance_type):
    if distance_type == "cos":
        dist = cosine(vector_1, vector_2)
        if np.isnan(dist):
            return 0.0
        else:
            return dist
    elif distance_type == "jsd":
        return jensenshannon(vector_1, vector_2)
    else:
        raise NotImplementedError(f"Unknown distance: {distance_type}")

In [13]:
def find_features(p1, p2, threshold):
    features = list(p1.keys() | p2.keys())
    prop_count = {k: p1.get(k, 0) + p2.get(k, 0) for k in features}
    total = sum(prop_count.values())
    return [f for f in features if prop_count[f] / total * 100 > threshold]

In [14]:
properties_1 = json.load(open('kk_old_ES_morph.json', "r", encoding = 'utf-8'))
properties_2 = json.load(open('kk_modern_ES_morph.json', "r", encoding = 'utf-8'))

In [15]:
def permutation_test(g1, g2, dist,word) :
    top_half = len(g1)
    g = np.concatenate((g1, g2), axis=0)
    tmp = []
    if num_combinations(len(g), top_half) < 1000 :
        return dist,exact_pval(g1,g2,dist,word)

    else :
        for _ in range(1000) :
            
            np.random.shuffle(g)
            perm1 = g[: top_half ]
            perm2 = g[top_half : ]
            prof1 =get_new_stats(perm1,word)
            prof2 = get_new_stats(perm2,word)
            tmp.append(find_dist(prof1,prof2))

        pval = sum([ 1 for i in tmp if i >= dist ]) / len(tmp)
        if pval > 0.05 :
            return dist,pval

        if num_combinations(len(g), top_half) < 10000 :
            return dist,exact_pval(g1,g2,dist,word) 
        for _ in range(9000) :
            
            np.random.shuffle(g)
            perm1 = g[: top_half ]
            perm2 = g[top_half : ]
            prof1 =get_new_stats(perm1,word)
            prof2 = get_new_stats(perm2,word)
            tmp.append(find_dist(prof1,prof2))

        pval = sum([ 1 for i in tmp if i >= dist ]) / len(tmp)
        if pval > 0.005 :
            return dist,pval


        if num_combinations(len(g), top_half) < 100000 :
            return dist,exact_pval(g1,g2,dist,word)
        for _ in range(90000) :
            np.random.shuffle(g)
            perm1 = g[: top_half ]
            perm2 = g[top_half : ]
            prof1 =get_new_stats(perm1,word)
            prof2 = get_new_stats(perm2,word)
            tmp.append(find_dist(prof1,prof2))

        pval = sum([ 1 for i in tmp if i >= dist ]) / len(tmp)
        if pval == 0.0 :
            pval = 1 / 100000.0
        return dist,pval 


In [16]:
def exact_pval(g1, g2, dist,word) :
    #print('exact_pval')
    top_half = len(g1)
    g = np.concatenate((g1, g2), axis=0)
    tmp = []
    #print('num itertools: ',len(list(itertools.combinations(range(len(g)), top_half))))
    #print('comb_example: ', list(itertools.combinations(range(len(g)), top_half))[0])
    for ordering in itertools.combinations(range(len(g)), top_half) :
        perm1 = np.take(g, ordering, axis=0)
        perm2 = np.delete(g, ordering, axis=0)
        prof1 =get_new_stats(perm1,word)
        prof2 = get_new_stats(perm2,word)
        tmp.append(find_dist(prof1,prof2)) 
    pval = sum([ 1 for i in tmp if i >= dist ]) / len(tmp)
    #print('pval: ', pval)
    return pval

In [17]:
def feature_separation(word_properties):
    properties = defaultdict(int)
    for el in word_properties:
        for feat in el.split("|"):
            properties[feat] += word_properties[el]

    return properties

In [18]:
def get_new_stats(perm2,word):
    #print('get_new_stats for ', perm2)
    feature_arr = []
    for sent in perm2:
        profile_dic = {}
        corp = sent.split('.')[0]
        if corp == '1':
            sentence = old_sent_profiles[sent]
            for w in sentence:
                if word in w.keys():
                    features = w[word]
                    feature_arr.append(features)
        if corp == '2':
            sentence = modern_sent_profiles['1.'+ sent.split('.')[1]]
            for w in sentence:
                if word in w.keys():
                    features = w[word]
                    feature_arr.append(features)
    #print('feature_arr', feature_arr)
                
    sep_arr = []
    for f in feature_arr:
        f_sep = feature_separation(f)
        sep_arr.append(f_sep)
    #print('sep_arr ', sep_arr)
    common_features = []
    for s in sep_arr:
        common_features.extend(list(s.keys()))
    common_features = set(common_features)
    #print('common_features ', common_features)
    new_dic = {}
    for f in common_features:
        res = 0
        for sep in sep_arr:
            res+=sep.get(f,0)
        new_dic[f] = res
    #print('\n\n\n',new_dic,'\n\n\n')
    return new_dic


In [19]:
def find_dist(p1,p2):
    distance = {}
    p1 = collect_word_properties(p1)
    p2 = collect_word_properties(p2)
    #print('p1', p1)
    #print('p2', p2)
    #print('\n')
    feature_classes = list(p1.keys() | p2.keys())
    
    for f_class in feature_classes:
        #print('f_class ',f_class)
        #print('\n')
        distance[f_class] = \
            compute_distance_from_common_features(p1[f_class], p2[f_class], 0, 'cos')
        #print(distance[f_class])
        #print('\n')
    distance = [d for d in distance.values() if not np.isnan(d)]
    avg_dist = np.mean(distance)
    
    return avg_dist

In [None]:
count = 1
data = {}
print("calculating statistics ...")
for idx,w in enumerate(words):   
    distance = {}
    total_words = len(words)
    print("  - {} ({} / {})          ".format(w, idx, total_words))
    
    p1 = collect_word_properties(properties_1[w])
    p2 = collect_word_properties(properties_2[w])
    #print('p1:',p1,'\np2:',p2)
    feature_classes = list(p1.keys() | p2.keys())
    #print(feature_classes)
    for f_class in feature_classes:
        distance[f_class] = \
            compute_distance_from_common_features(p1[f_class],
                                                  p2[f_class],
                                                  0,
                                                  'cos')
        #print('dist',distance[f_class] )
    distance = [d for d in distance.values() if not np.isnan(d)]
    avg_dist = np.mean(distance)
    #print('avg dist: ', avg_dist)
    sents1 = old_sent_data[w]
    sents2 = modern_sent_data[w]
    sents2 =['2.'+i.split('.')[1] for i in sents2]

    permutation_dist, permutation_pval = permutation_test(sents1, sents2, avg_dist,w)

    print(w, len(sents1), len(sents2), permutation_dist, permutation_pval)
    data[w] = (permutation_dist, permutation_pval)
    count += 1

calculating statistics ...
  - actitud (0 / 61)          
actitud 906 1860 0.007469774058661005 1e-05
  - ataque (1 / 61)          
ataque 627 2857 0.11245526773715697 1e-05
  - atrás (2 / 61)          


  dist = 1.0 - uv / np.sqrt(uu * vv)


atrás 1365 1324 0.0 1.0
  - ausencia (3 / 61)          
ausencia 660 935 0.002962396133941003 1e-05
  - avance (4 / 61)          
avance 86 3509 0.043536073863454296 1e-05
  - banco (5 / 61)          


In [5]:
import pandas as pd
df = pd.DataFrame(data, columns=['word', 'dist', 'pval'])

In [6]:
from statsmodels.stats.multitest import multipletests

pvalues = [float(d[2]) for d in data]

_,fdr,alpha,_ = multipletests(pvalues, method='fdr_bh')
fdr = fdr.tolist()
df['fdr'] = fdr
print(alpha, fdr)

0.0011144501557778241 [0.255875, 0.0053728, 1.0, 0.0031625, 1.0, 0.0, 0.27321212121212124, 1.0, 0.3246285714285714, 0.009907692307692308, 0.054518518518518515, 0.00012, 0.09528571428571428, 2.090909090909091e-05, 0.1426, 0.255875, 0.282764705882353, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.1426]


In [7]:
df['dist'] = df['dist'].astype('float')
df['pval'] = df['pval'].astype('float')

In [8]:
graded = {}
with open("graded_de_true.txt", encoding = 'utf-8') as f :
    f.readline()
    for line in f :
        line = line.strip()
        if not line : continue
        word, shift = line.split()
        graded[word] = float(shift)
arr = []
for w in df.word:
    arr.append(graded[w])
df['shift'] = arr

In [9]:
binary = {}
with open("binary_de_true.txt", encoding = 'utf-8') as f :
    f.readline()
    for line in f :
        line = line.strip()
        if not line : continue
        word, shift = line.split()
        binary[word] = float(shift)
arr = []
for w in df.word:
    arr.append(binary[w])
df['bin'] = arr

In [13]:
df['mix'] = df.apply(lambda x: x.dist*x.fdr, axis = 1)

In [71]:
df.sort_values(by='dist')

Unnamed: 0,word,dist,pval,fdr,shift,bin,mix
26,Pachtzins,0.0,1.0,1.0,0.0,0.0,1.0
7,ausspannen,0.0,1.0,1.0,0.70669,1.0,1.0
31,Sensation,0.00164,1.0,1.0,0.406144,1.0,0.99836
40,vergönnen,0.002014,1e-05,2.1e-05,0.071197,0.0,1.0
15,Frechheit,0.004143,0.131,0.192625,0.070839,0.0,0.999202
42,vorliegen,0.005621,0.974,1.0,0.190266,0.0,0.994379
25,Ohrwurm,0.008051,1e-05,2.1e-05,0.832451,1.0,1.0
38,Unentschlossenheit,0.008196,1.0,1.0,0.0,0.0,0.991804
34,Titel,0.008204,0.994,1.0,0.393045,0.0,0.991796
16,Fuß,0.012119,0.217,0.293588,0.564633,0.0,0.996442


In [None]:
df.corr(method="spearman")