# Word prediction in historical linguistics

## Prerequisites

### Application and data loading

In [1]:
from util import init
from dataset import data
from util.config import config
import pandas as pd
import sys

# Predefined language families: short family name -> list of language codes.
lang_family_dict = {
    "slav": ["ces", "bul", "rus", "bel", "ukr", "pol", "slk", "slv", "hrv"],
    "ger": ["swe", "isl", "eng", "nld", "deu", "dan", "nor"],
}

# User settings: either list separate languages, or name a language family.
languages = ["nld", "deu"]
lang_family = None

# A (non-empty) family name overrides the explicit language list above.
languages = lang_family_dict[lang_family] if lang_family else languages

options, distances_path, baselines_path = init.initialize_program()

# load_data returns all paths, data splits and encoding information in one
# tuple; the names below are reused by the later cells of this notebook.
(
    results_path,
    output_path_cognates_train,
    output_path_cognates_valtest,
    context_vectors_path,
    subs_sp_path,
    subs_st_path,
    lang_pairs,
    train,
    val,
    test,
    max_len,
    conversion_key,
    voc_size,
    feature_matrix_phon,
) = data.load_data(
    train_corpus="northeuralex",
    valtest_corpus="northeuralex",
    languages=languages,
    input_type="asjp",
    options=options,
)

2020-04-06 19:23:55,912 [INFO] Successfully changed parameters.


Initializing program...
Loading phonetic feature matrix...
Generating all language pairs...
Training corpus:
 - Loading dataset and performing necessary conversion/tokenization.
Using existing wordlist file, nothing is generated.
 - Detect cognates in entire dataset using LexStat.
Using existing cognates file output/northeuralex-asjp-cognates.tsv, nothing is generated.
Train corpus is valtest corpus.
Creating feature matrix for this specific language pair...
Converting training corpus TSV file to data matrix...
Converting val/test corpus TSV file to data matrix...
USE TRAIN M/S
Dividing into training, validation and test set...
Train/val/test sizes: 711|0|0
Train/val/test sizes: 0|236|236
Filtering val/test sets on cognates...
Val/test sizes after cognate filtering: 159|139
Done loading data.


### Visualize encoding
Visualize the representation of phonemes in the embedding encoding, both as a PCA plot and as a hierarchically clustered tree. Compare them to the phonetic feature matrix from Brown (2008).

In [None]:
from dataset import data
from visualize import visualize
from tree import cluster
from util.config import config

tree_style = config["ete_tree_style"]

# --- Reference: phonetic feature matrix from Brown (2008) ---
print("Phonetic matrix from Brown (2008):")
# PCA on the phonetic feature matrix, followed by a plot of the reduced matrix
phon_matrix_red, phon_phonemes = visualize.dim_reduction(feature_matrix_phon)
visualize.visualize_encoding(phon_matrix_red, phon_phonemes, "phonetic-pca")
# Hierarchical clustering of the phonemes in the phonetic feature matrix
tree = cluster.cluster_phonemes_encoding(feature_matrix_phon, phon_phonemes, "phonetic")
display(tree.render("%%inline", tree_style=tree_style))

# --- Embedding encoding: one embedding per first language of each pair ---
for lang_pair in lang_pairs:
    lang_a = lang_pair[0]
    print(f"Embedding for {lang_a}:")

    cognate_files = [output_path_cognates_train, output_path_cognates_valtest]
    emb_matrix = data.create_embedding(lang_a, cognate_files)

    # Same two views as for the phonetic matrix: PCA plot and clustered tree
    emb_label = f"embedding-{lang_a}"
    emb_matrix_red, emb_phonemes = visualize.dim_reduction(emb_matrix)
    visualize.visualize_encoding(emb_matrix_red, emb_phonemes, f"{emb_label}-pca")

    tree = cluster.cluster_phonemes_encoding(emb_matrix, emb_phonemes, emb_label)
    display(tree.render("%%inline", tree_style=tree_style))
    

#### Close-up: contrast occurrences of phonemes in data
Interesting patterns in the phoneme encoding visualizations can be looked up in the data. In the Dutch embedding encoding, we saw that *t* and *d*, two closely related phonemes, are quite far apart in the embedding space. What do the Dutch words containing *t* and *d*, on which the embedding encoding is based, look like?

In [None]:

lang = "nld"
# Read in TSV file with data. skipfooter (dropping the last 3 lines of the
# file) is only supported by pandas' python parser engine.
df = pd.read_csv(output_path_cognates_train, sep="\t", engine="python", skipfooter=3, index_col=False)
df_lang = df[df["DOCULECT"] == lang]

for phoneme in ["d", "t"]:
    print(phoneme)
    # Literal (non-regex) match; rows with a missing ASJP form count as no match
    has_phoneme = df_lang["ASJP"].str.contains(phoneme, regex=False, na=False)
    words_with_phoneme = df_lang[has_phoneme]
    total = len(words_with_phoneme)
    print(f"Total number of occurrences: {total}")
    if total == 0:
        # No word contains this phoneme: there are no locations to tabulate
        print("")
        continue

    # Index of the first occurrence of the phoneme in each matching word
    locations = words_with_phoneme["ASJP"].str.find(phoneme)
    # Compute relative frequencies
    locations_relfreq = locations.value_counts(normalize=True)
    print("Relative frequencies of locations:")
    print(locations_relfreq)

    # Look up words with most frequent location. mode() returns a sorted
    # Series (several values when locations tie), so take its first entry
    # explicitly instead of calling int() on the whole Series.
    most_freq_location = int(locations.mode().iloc[0])
    words_in_most_freq_loc = words_with_phoneme[locations == most_freq_location]
    print(f"Words with {phoneme} where {phoneme} has most frequent location in word ({most_freq_location}):")
    print(words_in_most_freq_loc)
    print("")




### Show number of cognates in training data
Show the number of cognate word pairs per language pair in the training data, and calculate cliques of languages with a minimum of 100 shared cognates. These cliques can later be used as groups of languages that share a large number of cognates, on which to perform prediction.

In [None]:
# Count cognates in the training data and find cliques of languages that
# share at least `cognates_threshold` cognates.
cog_per_lang, cliques = data.compute_n_cognates(
    lang_pairs,
    output_path_cognates_train,
    langs=languages,
    cognates_threshold=100,
)

print("Cognates per language: ")
print(cog_per_lang)

print("Cliques: ")
for clique in cliques:
    print(clique)

## Pairwise word prediction

### Word prediction using structured perceptron

In [None]:
from prediction import prediction

# Run structured perceptron word prediction for every language pair.
for lang_pair in lang_pairs:
    lang_a, lang_b = lang_pair
    print(f"Performing structured perceptron word prediction for pair ({lang_a}, {lang_b})")
    prediction.word_prediction_seq(
        lang_a,
        lang_b,
        train[lang_pair],
        val[lang_pair],
        test[lang_pair],
        conversion_key[lang_pair],
        results_path[lang_pair],
        distances_path + ".txt",
        config,
    )
        

### Word prediction using encoder-decoder

In [2]:
from prediction import prediction

# Run encoder-decoder (RNN) word prediction for every language pair.
for lang_pair in lang_pairs:
    lang_a, lang_b = lang_pair
    print(f"Performing RNN word prediction for pair ({lang_a}, {lang_b})")

    # Maximum word lengths for the input and the target language
    pair_max_len = (max_len[lang_a], max_len[lang_b])
    prediction.word_prediction(
        lang_a,
        lang_b,
        pair_max_len,
        train[lang_pair],
        val[lang_pair],
        test[lang_pair],
        conversion_key[lang_pair],
        voc_size,
        results_path[lang_pair],
        distances_path + ".txt",
        context_vectors_path[lang_pair] + ".p",
        config["output_encoding"],
        config,
    )


Performing RNN word prediction for pair (nld, deu)
Create RNN instance.
Building network ...
Creating loss function...
Computing updates ...
All params:
[enc_bw.W_in_to_updategate, enc_bw.W_hid_to_updategate, enc_bw.b_updategate, enc_bw.W_in_to_resetgate, enc_bw.W_hid_to_resetgate, enc_bw.b_resetgate, enc_bw.W_in_to_hidden_update, enc_bw.W_hid_to_hidden_update, enc_bw.b_hidden_update, enc_bw.hid_init, enc_fw.W_in_to_updategate, enc_fw.W_hid_to_updategate, enc_fw.b_updategate, enc_fw.W_in_to_resetgate, enc_fw.W_hid_to_resetgate, enc_fw.b_resetgate, enc_fw.W_in_to_hidden_update, enc_fw.W_hid_to_hidden_update, enc_fw.b_hidden_update, enc_fw.hid_init, dense_comb.W, dense_comb.b, dec.W_in_to_updategate, dec.W_hid_to_updategate, dec.b_updategate, dec.W_in_to_resetgate, dec.W_hid_to_resetgate, dec.b_resetgate, dec.W_in_to_hidden_update, dec.W_hid_to_hidden_update, dec.b_hidden_update, W, b]
Compiling functions ...
Training ...
0.9992592339914663 3.149856669439072 10.356941389421538 6.58824504

0.993688364451468 5.297912590987167 10.356941389421538 6.5882450499154555
0.003125619250381583 16.1219337305236 10.356941389421538 6.5882450499154555
0.9979534496128472 4.167390395654662 10.356941389421538 6.5882450499154555
0.9992602526572159 3.1484795517761075 10.356941389421538 6.5882450499154555
0.9936056969046846 5.311008397256621 10.356941389421538 6.5882450499154555
0.9998385807126424 1.6255975128264954 10.356941389421538 6.5882450499154555
0.9854812819981575 6.139249970029935 10.356941389421538 6.5882450499154555
0.8122365705665314 8.89233256499013 10.356941389421538 6.5882450499154555
0.9995515286460105 2.6477242121602798 10.356941389421538 6.5882450499154555
0.9927041500746578 5.443814388358862 10.356941389421538 6.5882450499154555
0.9979298869682636 4.178861579606862 10.356941389421538 6.5882450499154555
0.033696514573546534 13.713024934509777 10.356941389421538 6.5882450499154555
0.09440637010577138 12.617923511909513 10.356941389421538 6.5882450499154555
0.9987635849972077

0.9988008813673973 3.536378513165234 10.26134713989127 6.2841755817026455
0.9985739138725229 3.709952682983713 10.26134713989127 6.2841755817026455
0.9847309175272009 6.094808746724785 10.26134713989127 6.2841755817026455
0.9995359195487424 2.586358693624391 10.26134713989127 6.2841755817026455
0.9705492835559604 6.766203194231308 10.26134713989127 6.2841755817026455
0.9990644592841917 3.2878972283459 10.26134713989127 6.2841755817026455
0.9993311749880827 2.9520280899750406 10.26134713989127 6.2841755817026455
0.0062934103781645415 15.32328600723885 10.26134713989127 6.2841755817026455
0.07136653218812058 12.827232233384596 10.26134713989127 6.2841755817026455
0.006386605207757082 15.308492471224955 10.26134713989127 6.2841755817026455
0.9994652752184647 2.7281236372210627 10.26134713989127 6.2841755817026455
0.9974519997757404 4.291451938534551 10.26134713989127 6.2841755817026455
0.9992291122602787 3.0941505264411178 10.26134713989127 6.2841755817026455
0.999334515397405 2.947017789

### Baseline for prediction

In [None]:
from models import baseline

# Only ASJP
for lang_pair in lang_pairs:
    lang_a, lang_b = lang_pair
    conv = conversion_key[lang_pair]
    # Values of the input-side and output-side conversion keys
    sounds = tuple(list(conv[side].values()) for side in (0, 1))

    in_enc = config["input_encoding"]
    out_enc = config["output_encoding"]
    training_frame = train[lang_pair].get_dataframe(conv, in_enc, out_enc)
    testing_frame = test[lang_pair].get_dataframe(conv, in_enc, out_enc)

    baseline.compute_baseline(
        lang_a,
        lang_b,
        sounds,
        training_frame,
        testing_frame,
        baselines_path + ".txt",
    )

### Applications

#### Identify sound correspondences
##### Based on output substitutions
This shows the substitutions table in the notebook and outputs it as a LaTeX table to `RESULTS_DIR/subs.tex`.

In [None]:
from visualize import visualize

# Show the output substitutions of every language pair.
for lang_pair in lang_pairs:
    visualize.show_output_substitutions(
        results_path[lang_pair],
        subs_st_path[lang_pair],
        subs_sp_path[lang_pair],
    )

##### Based on context vector weights
This works when word prediction with encoder-decoder has been run, which exports context vector weights.

In [3]:
from visualize import visualize

# Visualize the exported context vector weights of every language pair.
for lang_pair in lang_pairs:
    visualize.visualize_weights(
        context_vectors_path[lang_pair],
        lang_pair,
        config["input_encoding"],
        config["output_encoding"],
        config["results_dir"],
        sample=None,
    )


vectors(139, 1, 400)
input_raw(139, 1, 14, 28)
target_raw(139, 1, 14, 29)
After flatten
vectors(139, 400)
input_raw(139, 392)
target_raw(139, 406)
Number of words: 139
Affinity propagation-0.2
vectors_dist
 tant3 nakt nat tal last
 til3 stErkt3 sam3 sx3lt ski spul3 strom3 stot3 slik3 spel3 sprek3
 vErkiz3 rex3 rEn3 xev3 lex rur3 brad3
 kid3 morx3 bind3l mid3 Eiz3r mEis morx3 drom
 krEis sid3r3 krEip3 b3tal3 Eil3 r3is3
 brur vex3 buzEm vremt vrau uv3r vuts3l vur3 tuvux3 vex3 vol rup3 xrun
 vErxan drain ovErwin3 fert3x hoNerix b3drix3 rEip
 klim3 tek3 lEid3 kis3 kok3 blaz3 tudEk3 hErkEn3
 link3r blint lerar lint
 won3 wExan
 brand3 wrEiv3 t3r3x b3hers3 bEix3 x3borEnword3 vErbet3r3 b3xin3 vErxet3 vErbrand3 reparer3 vErbind3
 zikzEin zixvErzam3l3 yini sxEin3 afsnEid3 anzin zwErm
 hElft xEf hErfst
 zond3 zid3 x3zin z3
 hoN3r hart ton handuk hondErt
 zom3r mar mart arm
 drEk der dri dir
 hals hEmt hev3l hem3l hEld3r
 pat part
 x3wixt x3lEit x3wer inwik3l3 aNelExEnhEit
 spits inslap3 stan sto

NameError: name 'cosine' is not defined

#### Phylogenetic tree reconstruction

##### Clustering based on word prediction

In [None]:
from tree import cluster

# Build a language tree from the word prediction distances
print("WP TREE:\n")
cluster.cluster_languages(
    lang_pairs,
    distances_path,
    output_path=distances_path,
)

##### Clustering based on baseline

In [None]:
from tree import cluster

# One tree per baseline: (heading, output path suffix, distance column).
# Column 2 is used for the source prediction baseline, column 3 for the
# PMI-based baseline.
for heading, suffix, column in (
    ("\nSOURCE BASELINE TREE", "_source", 2),
    ("\nPMI BASELINE TREE", "_pmi", 3),
):
    print(heading)
    cluster.cluster_languages(
        lang_pairs,
        baselines_path,
        output_path=baselines_path + suffix,
        distance_col=column,
    )

##### Draw tree from existing newick string (no distance calculation)

In [None]:
####Improvised code to re-generate trees
from ete3 import Tree, TreeStyle, NodeStyle, TextFace
from scipy.spatial.distance import pdist, squareform
from util.config import config
newick_string1 = "((bul:0.15,(slv:0.11,hrv:0.11):0.04):0.04,((rus:0.11,(bel:0.1,ukr:0.1):0.01):0.07,(pol:0.15,(ces:0.08,slk:0.08):0.07):0.03):0.01);"
newick_string2 = "(((bel:0.08,ukr:0.12):0.01,rus:0.12):0.03,(((slv:0.1,hrv:0.11):0.01,bul:0.18):0.07,(pol:0.17,(ces:0.1,slk:0.07):0.05):0.04):0.03);"
newick_string3 = "((bul:0.29,((ces:0.24,slk:0.24):0.04,(slv:0.25,hrv:0.25):0.03):0.01):0.01,(pol:0.28,(rus:0.24,(bel:0.22,ukr:0.22):0.02):0.04):0.01);"
newick_string4 = "((bel,rus,ukr),((hrv,slv),bul),((ces,slk),pol));"


def _make_tree_style(scale):
    """Return a TreeStyle with the settings shared by all trees and the given scale."""
    style = TreeStyle()
    style.show_scale = False
    style.show_leaf_name = False
    style.force_topology = False
    style.show_border = False
    style.margin_top = style.margin_bottom = style.margin_right = style.margin_left = 5
    style.scale = scale
    style.branch_vertical_margin = 10
    return style


ts = _make_tree_style(500)

all_newick_strings = [newick_string1, newick_string2, newick_string3, newick_string4]
for tree_no, newick_string in enumerate(all_newick_strings, start=1):
    if tree_no == 4:
        # last newick string without lengths, should be corrected:
        # it is drawn with a smaller scale than the other trees
        ts = _make_tree_style(50)

    # Load newick string into ete3 Tree object
    tree = Tree(newick_string)
    for node in tree.traverse():
        node.set_style(config["ete_node_style"])
        if not node.is_leaf():
            continue
        # Add bit of extra space between leaf branch and leaf label
        name_face = TextFace(f" {node.name}", fgcolor="black", fsize=10)
        node.add_face(name_face, column=0, position='branch-right')

    # Write the tree to a PDF file and also show it inline in the notebook
    pdf_path = f"output/tree{tree_no}.pdf"
    print(pdf_path)
    tree.render(pdf_path, tree_style=ts)
    display(tree.render("%%inline", tree_style=ts))

#### Cognate detection

In [None]:
from cognatedetection import cd

### TODO: this part from previous code should not be executed:
# print("Filter val/test sets on cognates.")
# Use only cognate pairs for validation and test
# val[lang_pair] = val[lang_pair].filter_cognates()
# test[lang_pair] = test[lang_pair].filter_cognates()
# print("Val/test sizes after cognate filtering: " + str(val[lang_pair].get_size()) + "|" + str(test[lang_pair].get_size()))


# Cognate detection by clustering, using the word prediction distances
print("Performing WP cognate detection using clustering...")
results_table = cd.cognate_detection_cluster(
    lang_pairs,
    config["results_dir"],
    options,
    use_distance="prediction",
)
print(results_table)

## Phylogenetic word prediction (experimental)

In [None]:
from util import utility
from prediction import prediction


# Phylogenetic word prediction uses ONE feature matrix for all languages, and
# therefore one shared conversion key and one shared vocabulary size, instead
# of one per language pair. The shared objects are built first and only then
# assigned to conversion_key / voc_size.
print("Create feature matrix for all language pairs.")
features_lp = {}
used_tokens = [[], []]
for lang_pair in lang_pairs:
    # NOTE(review): the original cell read tsv_cognates_path_train + ".tsv" and
    # tsv_cognates_path_valtest + ".tsv", but neither name is defined in any
    # visible cell. The cognate file paths returned by data.load_data are used
    # instead, presumably the same files -- confirm.
    features_lp[lang_pair], max_len[lang_pair[0]], max_len[lang_pair[1]], _, _ = data.get_corpus_info(
        [output_path_cognates_train, output_path_cognates_valtest],
        lang_pair=lang_pair,
        input_encoding=config["input_encoding"],
        output_encoding=config["output_encoding"],
        feature_matrix_phon=feature_matrix_phon,
    )
    # Collect the tokens used on the input side (0) and the output side (1)
    used_tokens[0] += list(features_lp[lang_pair][0].index)
    used_tokens[1] += list(features_lp[lang_pair][1].index)

# Unique tokens per side
tokens_set = [list(set(used_tokens[0])), list(set(used_tokens[1]))]

features = [None, None]
if config["input_encoding"] == "character":
    features[0] = data.create_one_hot_matrix(tokens_set[0])
elif config["input_encoding"] == "phonetic":
    features[0] = feature_matrix_phon.loc[tokens_set[0]]
else:
    # A bare `return` is a SyntaxError at the top level of a cell; abort the
    # cell with an exception instead.
    raise ValueError("Embedding encoding not possible in phylogenetic tree prediction.")
# Output encoding is always character
features[1] = data.create_one_hot_matrix(tokens_set[1])
voc_size_general = [features[0].shape[1], features[1].shape[1]]
conversion_key_general = data.create_conversion_key(features)

# Every language pair shares the general conversion key and vocabulary size
for lang_pair in lang_pairs:
    conversion_key[lang_pair] = conversion_key_general
voc_size = voc_size_general

plot_path_phyl = utility.create_path(config["results_dir"], options, prefix="plot_")

config["export_weights"] = False  # Turn off export of weights
tree_string = "((nld,deu),eng)"  # unused at the moment
if len(config["languages"]) >= 3:
    results_path_proto = utility.create_path(config["results_dir"], options, prefix="proto_")  # lang-pair independent path
    prediction.word_prediction_phyl(
        config["languages"],
        lang_pairs,
        tree_string,
        max_len,
        train,
        val,
        test,
        conversion_key_general,
        voc_size,
        results_path,
        results_path_proto,
        distances_path + ".txt",
        context_vectors_path,
        plot_path_phyl,
        config["output_encoding"],
        config,
    )
else:
    print("Please supply 3 languages, the first 2 being more closely related than the last.")