# Word prediction in historical linguistics

## Prerequisites

### Application and data loading

In [5]:
from util import init
from dataset import data
from util.config import config

# Set up the program options first; they are reused by the path helpers in later cells.
options = init.initialize_program()

# load_data() yields four values for the configured training corpus:
# three output/base paths and the list of language pairs to process.
(
    intersection_path,
    distances_path,
    baselines_path,
    lang_pairs,
) = data.load_data(config["train_corpus"])

2019-08-01 16:05:47,501 [INFO] Successfully changed parameters.


NameError: name 'config' is not defined

### Show number of cognates

In [None]:
# Report the per-language cognate counts and the clique count returned by
# data.compute_n_cognates for the configured languages.
print("Show number of cognates per language")
cog_per_lang, cliques = data.compute_n_cognates(
    lang_pairs,
    tsv_cognates_path_train,
    langs=config["languages"],
    cognates_threshold=100,
)
print("Cognates per language: " + str(cog_per_lang))
print("Number of cliques: " + str(cliques))

## Pairwise word prediction

### Perform word prediction algorithm

In [None]:
# Pairwise pipeline: for every language pair, build the output paths, obtain the
# train/val/test sets (from a pickle cache when one exists, otherwise from the TSV
# corpora) and then run whichever stages are switched on in `config`.
#
# NOTE(review): nothing earlier in the visible notebook imports `os`, `pickle`,
# `utility`, `prediction`, `baseline` or `visualize` (`utility`, `prediction` and
# `visualize` are only imported in later cells), and nothing visible creates the
# containers written to here (`context_vectors_path`, `results_path`,
# `subs_st_path`, `subs_sp_path`, `train`, `val`, `test`, `conversion_key`,
# `max_len`, `voc_size`) or the inputs `tsv_cognates_path_train`,
# `tsv_cognates_path_valtest`, `feature_matrix_phon`, `excluded_concepts_training`,
# `conversion_key_general`, `voc_size_general`. They presumably come from a setup
# cell that is not shown -- confirm before relying on Restart & Run All.
for lang_pair in lang_pairs:
    lang_a, lang_b = lang_pair
    # Per-pair output locations; each path is derived from the results dir and the current options
    context_vectors_path[lang_pair] = utility.create_path(config["results_dir"], options, prefix="context_vectors_", lang_a=lang_a, lang_b=lang_b)
    # Create export path, containing all options
    # This is used to output a prediction results file, which can then be used for visualization and cognate detection
    results_path[lang_pair] = utility.get_results_path(lang_a, lang_b, config["results_dir"], options)
    subs_st_path[lang_pair] = utility.create_path(config["results_dir"], options, prefix="subs_st_", lang_a=lang_a, lang_b=lang_b)
    subs_sp_path[lang_pair] = utility.create_path(config["results_dir"], options, prefix="subs_sp_", lang_a=lang_a, lang_b=lang_b)

    # Data sets are only needed when prediction or the baseline will run
    if config["prediction"] or config["baseline"]:
        # If data in pickle, load pickle
        data_pickle = results_path[lang_pair] + "-data.p"
        if os.path.exists(data_pickle):
            # NOTE: unpickling executes code embedded in the file; only load caches written by this notebook.
            with open(data_pickle, "rb") as f:
                print("Loading train/val/test sets from pickle, nothing generated.")
                # Tuple layout must match the pickle.dump call at the end of the else-branch below.
                # voc_size is indexed by position (0/1), not by language pair, so each pair overwrites it.
                train[lang_pair], val[lang_pair], test[lang_pair], conversion_key[lang_pair], max_len[lang_pair[0]], max_len[lang_pair[1]], voc_size[0], voc_size[1] = pickle.load(f)
        else:
            # For phylogenetic word prediction, we have a language-independent feature matrix
            if not config["phyl"]:
                print("Create feature matrix for this specific language pair.")
                # Feature matrix, maximum word lengths and vocabulary sizes are derived from both corpora
                features, max_len[lang_pair[0]], max_len[lang_pair[1]], voc_size[0], voc_size[1] = data.get_corpus_info([tsv_cognates_path_train + ".tsv", tsv_cognates_path_valtest + ".tsv"], lang_pair=lang_pair, input_encoding=config["input_encoding"], output_encoding=config["output_encoding"], feature_matrix_phon=feature_matrix_phon)
                conversion_key[lang_pair] = data.create_conversion_key(features)
            else:
                # In phylogenetic mode, we created one feature matrix for all languages
                conversion_key[lang_pair] = conversion_key_general
                # Rebinds the name `voc_size` itself, so it stays bound to voc_size_general for later iterations
                voc_size = voc_size_general
                # NOTE(review): `features` is not assigned on this branch but is passed to
                # create_data_matrix below -- it must already exist in the kernel; confirm.

            print("Convert training corpus TSV file to data matrix")
            # The training mean/std returned here are reused for the val/test matrix below
            dataset_train, train_mean, train_std = data.create_data_matrix(tsv_path=tsv_cognates_path_train + ".tsv", lang_pair=(lang_a, lang_b), features=features, max_len=(max_len[lang_pair[0]], max_len[lang_pair[1]]), voc_size=voc_size, batch_size=config["batch_size"], mean_subtraction=config["mean_subtraction"], feature_standardization=not config["no_standardization"], excluded_concepts=excluded_concepts_training, cognate_detection=config["cognate_detection"])

            print("Convert val/test corpus TSV file to data matrix")
            dataset_valtest, _, _ = data.create_data_matrix(tsv_path=tsv_cognates_path_valtest + ".tsv", lang_pair=(lang_a, lang_b), features=features, max_len=(max_len[lang_pair[0]], max_len[lang_pair[1]]), voc_size=voc_size, batch_size=config["batch_size"], mean_subtraction=config["mean_subtraction"], feature_standardization=not config["no_standardization"], cognate_detection=config["cognate_detection"], valtest=True, train_mean=train_mean, train_std=train_std)

            t_set_size = dataset_train.get_size()
            vt_set_size = dataset_valtest.get_size()

            if config["valtest_corpus"] == config["train_corpus"]:
                # If train and valtest corpus the same, divide one corpus into parts
                # Both matrices come from the same corpus here, so their sizes are expected to agree
                assert t_set_size == vt_set_size
                n_train, n_val, n_test = dataset_train.compute_subset_sizes(t_set_size)
            else:
                # If train and valtest corpus different, use full train corpus as train and
                # full valtest corpus for validation and testing
                # TODO: In fact this is not needed, we can directly take set size.
                n_train, _, _ = dataset_train.compute_subset_sizes(t_set_size, only_train=True)
                _, n_val, n_test = dataset_valtest.compute_subset_sizes(vt_set_size, only_valtest=True)

            print("Divide into training, validation and test set.")
            # Even if train and valtest corpus are the same, we do this separately,
            # because valtest corpus is filtered on cognates and train corpus is not
            # Use train corpus only for train set
            train[lang_pair], _, _ = dataset_train.divide_subsets(n_train, 0, 0)
            # Use val/test corpus for validation and test set
            _, val[lang_pair], test[lang_pair] = dataset_valtest.divide_subsets(0, n_val, n_test)

            if not config["cognate_detection"]:
                print("Filter val/test sets on cognates.")
                # Use only cognate pairs for validation and test
                val[lang_pair] = val[lang_pair].filter_cognates()
                test[lang_pair] = test[lang_pair].filter_cognates()
                print("Val/test sizes after cognate filtering: " + str(val[lang_pair].get_size()) + "|" + str(test[lang_pair].get_size()))

            # Pickle train/val/test/sets
            # Cached under the per-pair results path so the next run takes the load branch above.
            # `features` is not part of the cached tuple.
            with open(data_pickle, "wb") as f:
                pickle.dump((train[lang_pair], val[lang_pair], test[lang_pair], conversion_key[lang_pair], max_len[lang_pair[0]], max_len[lang_pair[1]], voc_size[0], voc_size[1]), f)

    # Stage 1a: pairwise prediction with the default (non-sequence) model
    if config["prediction"] and not config["seq"] and not config["phyl"]:
        print("Performing word prediction for pair (" + lang_a + ", " + lang_b + ")")
        prediction.word_prediction(lang_a, lang_b, (max_len[lang_pair[0]], max_len[lang_pair[1]]), train[lang_pair], val[lang_pair], test[lang_pair], conversion_key[lang_pair], voc_size, results_path[lang_pair], distances_path + ".txt", context_vectors_path[lang_pair] + ".p", config["output_encoding"], config)
    # Stage 1b: pairwise prediction with the sequence model
    if config["prediction"] and config["seq"] and not config["phyl"]:
        print("Performing SeqModel word prediction for pair (" + lang_a + ", " + lang_b + ")")
        prediction.word_prediction_seq(lang_a, lang_b, train[lang_pair], val[lang_pair], test[lang_pair], conversion_key[lang_pair], results_path[lang_pair], distances_path + ".txt", config)
    # Stage 2: baselines, only run for ASJP input
    if config["baseline"] and config["input_type"] == "asjp":
        print("Performing baseline results for pair(" + lang_a + ", " + lang_b + ")")
        # NOTE(review): within this cell `features` is only assigned when the data was freshly
        # generated in non-phyl mode; after a pickle load it is either unbound or left over
        # from a previous pair -- confirm this is intended.
        sounds = (list(features[0].index), list(features[1].index))
        training_frame = train[lang_pair].get_dataframe(conversion_key[lang_pair], config["input_encoding"], config["output_encoding"])
        testing_frame = test[lang_pair].get_dataframe(conversion_key[lang_pair], config["input_encoding"], config["output_encoding"])
        baseline.compute_baseline(lang_a, lang_b, sounds, training_frame, testing_frame, baselines_path + ".txt")
    # Stage 3: visualizations based on the files written to the per-pair paths above
    if config["visualize"]:
        print("Inferring sound correspondences...")
        visualize.show_output_substitutions(results_path[lang_pair], subs_st_path[lang_pair], subs_sp_path[lang_pair])
    if config["visualize_weights"]:
        # Note: the path is passed without the ".p" suffix that word_prediction receives above
        visualize.visualize_weights(context_vectors_path[lang_pair], lang_pair, config["input_encoding"], config["output_encoding"], config["results_dir"], sample=None)


### Applications

#### Visualize encoding

In [3]:
from dataset import data
from visualize import visualize

for lang_pair in lang_pairs:
    # Build the embedding for the first language of the pair from the
    # training and val/test corpus files, then plot it.
    emb_matrix = data.create_embedding(
        lang_pair[0],
        [tsv_cognates_path_train + ".tsv", tsv_cognates_path_valtest + ".tsv"],
    )
    visualize.visualize_encoding(
        emb_matrix,
        feature_matrix_phon,
        lang_pair,
        config["results_dir"],
    )

NameError: name 'lang_pairs' is not defined

#### Phylogenetic tree reconstruction

##### Clustering based on word prediction

In [None]:
from tree import cluster

# Cluster the languages on the word prediction distances; the distances
# path is also used as the output path.
print("WP TREE:\n")
cluster.cluster_languages(
    lang_pairs,
    distances_path,
    output_path=distances_path,
)

##### Baselines

In [None]:
from tree import cluster

# Both baselines are clustered from the same baselines path; they differ only in
# the column selected via distance_col and in the suffix of the output path.
for heading, suffix, column in (
    ("\nSOURCE BASELINE TREE", "_source", 2),  # Source prediction baseline
    ("\nPMI BASELINE TREE", "_pmi", 3),  # PMI-based baseline
):
    print(heading)
    cluster.cluster_languages(lang_pairs, baselines_path, output_path=baselines_path + suffix, distance_col=column)

#### Cognate detection

In [2]:
from cognatedetection import cd

# Cluster-based cognate detection, using the "prediction" distance.
print("Performing WP cognate detection using clustering...")
results_table = cd.cognate_detection_cluster(
    lang_pairs,
    config["results_dir"],
    options,
    use_distance="prediction",
)
print(results_table)

Performing WP cognate detection using clustering...


NameError: name 'cd' is not defined

## Phylogenetic word prediction (experimental)

In [None]:
from util import utility
from prediction import prediction

# Turn off export of weights for the phylogenetic run
config["export_weights"] = False
# Marked as unused at the moment, but still handed to word_prediction_phyl below
tree_string = "((nld,deu),eng)"

if len(config["languages"]) < 3:
    # Guard: the phylogenetic model is only started with at least three languages
    print("Please supply 3 languages, the first 2 being more closely related than the last.")
else:
    # Lang-pair independent path
    results_path_proto = utility.create_path(config["results_dir"], options, prefix="proto_")
    prediction.word_prediction_phyl(
        config["languages"],
        lang_pairs,
        tree_string,
        max_len,
        train,
        val,
        test,
        conversion_key_general,
        voc_size,
        results_path,
        results_path_proto,
        distances_path + ".txt",
        context_vectors_path,
        plot_path_phyl,
        config["output_encoding"],
        config,
    )