# Word prediction in historical linguistics

## Prerequisites

### Application and data loading

In [None]:
from util import init
from dataset import data
from util.config import config
from prediction import prediction
import sys

# Predefined language families, each mapped to its three-letter language codes.
lang_family_dict = {
    "slav": ["ces", "bul", "rus", "bel", "ukr", "pol", "slk", "slv", "hrv"],
    "ger": ["swe", "isl", "eng", "nld", "deu", "dan", "nor"],
}

# User settings: either list individual languages in `languages`, or set
# `lang_family` to a key of `lang_family_dict` to use that whole family.
languages = ["nld", "deu"]
lang_family = None

# A configured family takes precedence over the explicit language list.
if lang_family:
    languages = lang_family_dict[lang_family]

options, distances_path, baselines_path = init.initialize_program()

# Load training and validation/test data for the selected languages; the
# names unpacked here are used by the cells further down in the notebook.
(
    results_path,
    output_path_cognates_train,
    output_path_cognates_valtest,
    context_vectors_path,
    lang_pairs,
    train,
    val,
    test,
    max_len,
    conversion_key,
    voc_size,
) = data.load_data(
    train_corpus="northeuralex",
    valtest_corpus="northeuralex",
    languages=languages,
    input_type="asjp",
    options=options,
)




### Show number of cognates in training data
Show the number of cognate word pairs per language pair in the training data, and compute cliques of languages that share at least 100 cognates. These cliques can later be used as groups of languages with a large number of shared cognates on which to perform prediction.

In [None]:
# Count cognates in the training data and derive cliques using a threshold
# of 100 cognates.
cog_per_lang, cliques = data.compute_n_cognates(
    lang_pairs,
    output_path_cognates_train,
    langs=languages,
    cognates_threshold=100,
)

print("Cognates per language: ")
print(cog_per_lang)

# Print every clique on its own line.
print("Cliques: ")
for clique in cliques:
    print(clique)

## Pairwise word prediction

### Word prediction using structured perceptron

In [None]:
# Run structured perceptron word prediction for every language pair.
for lang_pair in lang_pairs:
    # Unpack the pair here: without this the cell raises NameError on a fresh
    # kernel, or silently reuses lang_a/lang_b left over from another cell.
    lang_a, lang_b = lang_pair
    print(f"Performing structured perceptron word prediction for pair ({lang_a}, {lang_b})")
    prediction.word_prediction_seq(
        lang_a,
        lang_b,
        train[lang_pair],
        val[lang_pair],
        test[lang_pair],
        conversion_key[lang_pair],
        results_path[lang_pair],
        distances_path + ".txt",
        config,
    )
        

### Word prediction using encoder-decoder

In [None]:
# Run encoder-decoder (RNN) word prediction for every language pair.
for lang_pair in lang_pairs:
    lang_a, lang_b = lang_pair
    print(f"Performing RNN word prediction for pair ({lang_a}, {lang_b})")
    prediction.word_prediction(
        lang_a,
        lang_b,
        (max_len[lang_a], max_len[lang_b]),
        train[lang_pair],
        val[lang_pair],
        test[lang_pair],
        conversion_key[lang_pair],
        voc_size,
        results_path[lang_pair],
        distances_path + ".txt",
        context_vectors_path[lang_pair] + ".p",
        config["output_encoding"],
        config,
    )


### Baseline for prediction

In [None]:
# Only ASJP
# NOTE(review): `features` and `baseline` are not defined or imported in any
# visible cell of this notebook -- confirm where they should come from before
# running this cell.
for lang_pair in lang_pairs:
    # Unpack the pair here: without this the cell raises NameError on a fresh
    # kernel, or reports every baseline under a stale lang_a/lang_b.
    lang_a, lang_b = lang_pair
    sounds = (list(features[0].index), list(features[1].index))
    training_frame = train[lang_pair].get_dataframe(
        conversion_key[lang_pair],
        config["input_encoding"],
        config["output_encoding"],
    )
    testing_frame = test[lang_pair].get_dataframe(
        conversion_key[lang_pair],
        config["input_encoding"],
        config["output_encoding"],
    )
    baseline.compute_baseline(lang_a, lang_b, sounds, training_frame, testing_frame, baselines_path + ".txt")

### Visualize encoding

In [None]:
from dataset import data
from visualize import visualize

# NOTE(review): `feature_matrix_phon` is not defined in any visible cell --
# confirm where it is created.
for lang_pair in lang_pairs:
    # Build the embedding for the first language of the pair, using both the
    # training and the validation/test cognate paths.
    emb_matrix = data.create_embedding(
        lang_pair[0],
        [output_path_cognates_train, output_path_cognates_valtest],
    )
    visualize.visualize_encoding(
        emb_matrix,
        feature_matrix_phon,
        lang_pair,
        config["results_dir"],
    )

### Applications

#### Identify sound correspondences

In [None]:
from visualize import visualize

# NOTE(review): `subs_st_path` and `subs_sp_path` are not defined in any
# visible cell -- confirm where they are created.
for lang_pair in lang_pairs:
    # Show the substitutions found in this pair's prediction results.
    visualize.show_output_substitutions(
        results_path[lang_pair],
        subs_st_path[lang_pair],
        subs_sp_path[lang_pair],
    )
    # Visualize the weights stored at this pair's context vectors path.
    visualize.visualize_weights(
        context_vectors_path[lang_pair],
        lang_pair,
        config["input_encoding"],
        config["output_encoding"],
        config["results_dir"],
        sample=None,
    )


#### Phylogenetic tree reconstruction

##### Clustering based on word prediction

In [None]:
from tree import cluster

# Cluster the languages using the word prediction distances; the distances
# path doubles as the output path.
print("WP TREE:\n")
cluster.cluster_languages(
    lang_pairs,
    distances_path,
    output_path=distances_path,
)

##### Baselines

In [None]:
from tree import cluster

# One clustering run per baseline, all reading from `baselines_path`:
# (heading to print, output path suffix, value passed as distance_col).
baseline_trees = (
    ("\nSOURCE BASELINE TREE", "_source", 2),  # Source prediction baseline
    ("\nPMI BASELINE TREE", "_pmi", 3),  # PMI-based baseline
)

for heading, suffix, column in baseline_trees:
    print(heading)
    cluster.cluster_languages(
        lang_pairs,
        baselines_path,
        output_path=baselines_path + suffix,
        distance_col=column,
    )

#### Cognate detection

In [None]:
from cognatedetection import cd

# TODO: the following step from the previous code should not be executed
# here; it is kept commented out for reference only:
# print("Filter val/test sets on cognates.")
# Use only cognate pairs for validation and test
# val[lang_pair] = val[lang_pair].filter_cognates()
# test[lang_pair] = test[lang_pair].filter_cognates()
# print("Val/test sizes after cognate filtering: " + str(val[lang_pair].get_size()) + "|" + str(test[lang_pair].get_size()))

# Cluster-based cognate detection using the word prediction distances.
print("Performing WP cognate detection using clustering...")
results_table = cd.cognate_detection_cluster(
    lang_pairs,
    config["results_dir"],
    options,
    use_distance="prediction",
)
print(results_table)

## Phylogenetic word prediction (experimental)

In [None]:
from util import utility
from prediction import prediction


# Phylogenetic word prediction uses ONE feature matrix for all languages,
# and therefore one shared conversion key and one shared vocabulary size.
#
# NOTE(review): `tsv_cognates_path_train`, `tsv_cognates_path_valtest` and
# `feature_matrix_phon` are not defined in any visible cell -- confirm where
# they are created before running this cell.
print("Create feature matrix for all language pairs.")
features_lp = {}  # feature matrices per language pair, from get_corpus_info
used_tokens = [set(), set()]  # [input tokens, output tokens] seen in any pair
for lang_pair in lang_pairs:
    # Collect the per-pair features and update the maximum word lengths.
    features_lp[lang_pair], max_len[lang_pair[0]], max_len[lang_pair[1]], _, _ = data.get_corpus_info(
        [tsv_cognates_path_train + ".tsv", tsv_cognates_path_valtest + ".tsv"],
        lang_pair=lang_pair,
        input_encoding=config["input_encoding"],
        output_encoding=config["output_encoding"],
        feature_matrix_phon=feature_matrix_phon,
    )
    used_tokens[0].update(features_lp[lang_pair][0].index)
    used_tokens[1].update(features_lp[lang_pair][1].index)

# Unique tokens over all language pairs, for input and output side.
tokens_set = [list(used_tokens[0]), list(used_tokens[1])]

# Build the shared feature matrices: index 0 is input, index 1 is output.
features = [None, None]
if config["input_encoding"] == "character":
    features[0] = data.create_one_hot_matrix(tokens_set[0])
elif config["input_encoding"] == "phonetic":
    features[0] = feature_matrix_phon.loc[tokens_set[0]]
else:
    # A bare `return` is a SyntaxError at cell/module level, so abort the
    # cell with an exception carrying the same message instead.
    raise ValueError("Embedding encoding not possible in phylogenetic tree prediction.")
# Output encoding is always character
features[1] = data.create_one_hot_matrix(tokens_set[1])

voc_size_general = [features[0].shape[1], features[1].shape[1]]
conversion_key_general = data.create_conversion_key(features)

# Only now that the shared key and vocabulary size exist can they replace
# the per-pair values (the original read them before they were computed).
for lang_pair in lang_pairs:
    conversion_key[lang_pair] = conversion_key_general
voc_size = voc_size_general

plot_path_phyl = utility.create_path(config["results_dir"], options, prefix="plot_")

config["export_weights"] = False  # Turn off export of weights
tree_string = "((nld,deu),eng)"  # unused at the moment
# NOTE(review): this reads config["languages"], not the `languages` list set
# in the first cell -- confirm that the two agree.
if len(config["languages"]) >= 3:
    results_path_proto = utility.create_path(config["results_dir"], options, prefix="proto_")  # lang-pair independent path
    prediction.word_prediction_phyl(
        config["languages"],
        lang_pairs,
        tree_string,
        max_len,
        train,
        val,
        test,
        conversion_key_general,
        voc_size,
        results_path,
        results_path_proto,
        distances_path + ".txt",
        context_vectors_path,
        plot_path_phyl,
        config["output_encoding"],
        config,
    )
else:
    print("Please supply 3 languages, the first 2 being more closely related than the last.")