## Load data

In [None]:
import data

# Load the dataset; the loader returns three objects, kept here as the
# forms, cognates and lects frames used by the cells below.
forms_df, cognates_df, lects_df = data.load_romance_dataset()
display(forms_df)

# Number of distinct Language_ID values (missing values, if any, count as one),
# shown as the cell's output.
forms_df["Language_ID"].nunique(dropna=False)


## Filter data

In [None]:
# Run the project's "empty / multi-cognate" filter over the forms table and show the result.
# NOTE(review): going by the helper's name, this presumably keeps only forms that
# belong to exactly one cognate set — confirm against data.py.
forms_df_1cognate = data.filter_romance_empty_multicog(forms_df)
display(forms_df_1cognate)

## Filter on Latin inflection classes

In [None]:
# Combine the filtered forms with the cognate table via the project's
# inflection filter, and report how many rows remain.
latin_conjugation_df = data.filter_romance_inflections(forms_df_1cognate, cognates_df)
print(len(latin_conjugation_df))

# Non-null counts per language, keeping the ten languages with the highest
# count in the "ID_x" column.
grouped_by_lang = (
    latin_conjugation_df
    .groupby("Language_ID")
    .count()
    .sort_values(by="ID_x", ascending=False)
    .head(10)
)
display(grouped_by_lang)
#grouped_by_lang["Form"].to_csv("langs.tex", sep="&", line_terminator = "\\\\\n")


## Show type and token frequency of inflection classes for French

In [None]:
import pandas as pd
# Keep only the rows whose language is Modern Standard French.
is_french = latin_conjugation_df["Language_ID"] == "French_Modern_Standard"
french_df = latin_conjugation_df[is_french]

# Token frequency: every row with a non-null Form counts towards its class.
print("Token count")
display(french_df.groupby("Latin_Conjugation")["Form"].count())

# Type frequency: same count after keeping one row per Cognateset_ID_first value.
print("Type count")
french_types = french_df.drop_duplicates(subset="Cognateset_ID_first")
display(french_types.groupby("Latin_Conjugation")["Form"].count())

## Create dataset per language

In [None]:
import evaluation
import numpy as np

# Build the French dataset: one-hot forms, one-hot inflection classes and the
# matching raw forms, inflection labels and cognate ids.
(
    forms_onehot,
    inflections_onehot,
    forms,
    inflections,
    cogids,
) = data.create_language_dataset(
    latin_conjugation_df,
    "French_Modern_Standard",
    empty_symbol=True,
)

# Same-shaped all-zero stand-in for the inflection block.
inflections_empty = np.zeros(inflections_onehot.shape)

# Two input variants, both with the inflection columns appended on the right:
# one carrying the real inflection one-hots, one carrying only zeros.
forms_inflections_onehot = np.hstack((forms_onehot, inflections_onehot))
forms_empty_inflections_onehot = np.hstack((forms_onehot, inflections_empty))



## Plot forms without inflection class

In [None]:
# Plot the forms whose inflection columns are all zero, passing the gold
# inflection classes as clusters and the cognate ids as micro-clusters.
# The value returned by plot_data is reported below as the silhouette score.
score = evaluation.plot_data(
    forms_empty_inflections_onehot,
    labels=None,
    clusters=inflections,
    micro_clusters=cogids,
    file_label="data",
)
print(f"Silhouette score: {score}")

## Plot forms with inflection class

In [None]:
# Plot the forms with their real inflection one-hot columns attached, passing
# the gold inflection classes as clusters and the cognate ids as micro-clusters.
# The value returned by plot_data is reported below as the silhouette score.
score = evaluation.plot_data(
    forms_inflections_onehot,
    labels=None,
    clusters=inflections,
    micro_clusters=cogids,
    file_label="data-inflections",
)
print(f"Silhouette score: {score}")

## ART 1 run

In [None]:
from conf import ART_VIGILANCE, ART_LEARNING_RATE, INFLECTION_CLASSES, N_INFLECTION_CLASSES
from neupy.algorithms import ART1
from sklearn.metrics import silhouette_score, rand_score, adjusted_rand_score
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import data



def art_one(data_onehot, inflections_gold, cogids, vigilances=[ART_VIGILANCE], data_plot=False):
    """Run one non-iterated ART pass over the whole dataset.

    Thin wrapper around ``art_iterated``: a single run with a single timestep,
    no batching (``batch_size=None``) and ``iterated=False``. The remaining
    arguments are forwarded unchanged. Nothing is returned.
    """
    art_iterated(
        data_onehot,
        n_runs=1,
        n_timesteps=1,
        batch_size=None,
        inflections_gold=inflections_gold,
        cogids=cogids,
        vigilances=vigilances,
        iterated=False,
        data_plot=data_plot,
    )

def art_iterated(data_onehot, n_runs, n_timesteps, batch_size, inflections_gold, cogids, vigilances=[ART_VIGILANCE], iterated=True, data_plot=False):
    """Cluster one-hot data with ART1 and score the result against gold classes.

    For every vigilance value and every run, a fresh ``ART1`` network is built
    at each timestep and asked to cluster the current input. Scores and cluster
    sizes are collected and written to files in the working directory.

    In iterated mode (``iterated=True``) each timestep draws a random batch of
    ``batch_size`` rows without replacement; the inferred clusters are then
    written back, one-hot encoded, into the last ``N_INFLECTION_CLASSES``
    columns of the run's working copy so later timesteps see them. In
    non-iterated mode the whole dataset is clustered at every timestep.

    Relies on ``pd`` and ``evaluation`` having been imported by earlier
    notebook cells.

    Parameters
    ----------
    data_onehot : array
        Input matrix, one row per form. It is copied per run and never modified.
    n_runs : int
        Number of independent runs per vigilance value.
    n_timesteps : int
        Number of ART passes per run.
    batch_size : int or None
        Rows sampled per timestep in iterated mode; unused otherwise.
    inflections_gold : sequence
        Gold labels, aligned with the rows of ``data_onehot``.
    cogids : sequence or None
        Cognate ids aligned with the rows; only used for the optional data plot.
    vigilances : iterable of float
        ART vigilance (``rho``) values to try. The default list is never mutated.
    iterated : bool
        Selects iterated or non-iterated mode as described above.
    data_plot : bool
        If true, call ``evaluation.plot_data`` once per run with the clustering
        of the last timestep.

    Output files
    ------------
    Non-iterated: ``scores-art-end.tex/.pdf`` and ``clusters-art-end.tex/.pdf``.
    Iterated: ``scores-art-course-batch{batch_size}.pdf`` and
    ``clusters-art-course-batch{batch_size}.pdf``.

    Notes
    -----
    The silhouette score is recorded as NaN when the clustering is degenerate
    (a single cluster, or one cluster per sample), because scikit-learn cannot
    compute it in that case.
    """
    inflections_gold = np.array(inflections_gold)
    if cogids is not None:
        cogids = np.array(cogids)
    records_end_scores = []
    records_end_clusters = []
    records_course_scores = []
    records_course_clusters = []
    for vig in vigilances:
        print(f" - Vigilance: {vig}")
        for r in range(n_runs):
            print(f" -- Run: {r}")
            # Initialize original data for new run
            input_next_gen = data_onehot.copy()
            for i in range(n_timesteps):
                if iterated:
                    batch = np.random.choice(len(input_next_gen), batch_size, replace=False)
                else:
                    # Use whole dataset
                    batch = np.arange(len(input_next_gen))

                artnet = ART1(
                    step=ART_LEARNING_RATE,
                    rho=vig,
                    n_clusters=N_INFLECTION_CLASSES,
                )
                clusters_art = artnet.predict(input_next_gen[batch])

                # Calculate scores. The silhouette is always measured on the
                # original data, not on the working copy carrying inferred classes.
                silhouette = _silhouette_or_nan(data_onehot[batch], clusters_art)
                rand = rand_score(inflections_gold[batch], clusters_art)
                adj_rand = adjusted_rand_score(inflections_gold[batch], clusters_art)
                cluster_sizes = np.bincount(np.array(clusters_art, dtype=int))
                min_cluster_size = np.min(cluster_sizes)
                max_cluster_size = np.max(cluster_sizes)

                if iterated:
                    # Transfer information to next generation
                    clusters_art_onehot, _ = data.create_onehot_inflections(clusters_art)
                    # Replace last columns, representing inflection class, by one-hot vector of inferred inflection classes
                    input_next_gen[batch, -N_INFLECTION_CLASSES:] = clusters_art_onehot
                    records_course_scores.append({"run": r, "timestep": i, "vigilance": vig, "metric": "silhouette", "score": silhouette})
                    #records_course_scores.append({"run": r, "timestep": i, "vigilance": vig, "metric": "rand", "score": rand})
                    records_course_scores.append({"run": r, "timestep": i, "vigilance": vig, "metric": "adj_rand", "score": adj_rand})
                    records_course_clusters.append({"run": r, "timestep": i, "vigilance": vig, "metric": "min_cluster_size", "n_forms": min_cluster_size})
                    records_course_clusters.append({"run": r, "timestep": i, "vigilance": vig, "metric": "max_cluster_size", "n_forms": max_cluster_size})
            if data_plot:
                # cogids is optional: only subscript it when it was supplied.
                # NOTE(review): assumes plot_data accepts micro_clusters=None — confirm in evaluation.py.
                micro_clusters = cogids[batch] if cogids is not None else None
                evaluation.plot_data(data_onehot[batch], labels=None, clusters=clusters_art, micro_clusters=micro_clusters, file_label=f"data-art-{n_timesteps}-end-vig{vig}")
            if not iterated:
                records_end_scores.append({"vigilance": vig, "metric": "silhouette", "score": silhouette})
                records_end_scores.append({"vigilance": vig, "metric": "rand", "score": rand})
                records_end_scores.append({"vigilance": vig, "metric": "adj_rand", "score": adj_rand})
                records_end_clusters.append({"vigilance": vig, "metric": "min_cluster_size", "n_forms": min_cluster_size})
                records_end_clusters.append({"vigilance": vig, "metric": "max_cluster_size", "n_forms": max_cluster_size})
    # Plot results
    if not iterated:
        df_end_scores = pd.DataFrame.from_records(records_end_scores)
        _write_tex_rows(df_end_scores.pivot(index="vigilance", columns="metric", values="score"), "scores-art-end.tex")
        sns.lineplot(data=df_end_scores, x="vigilance", y="score", hue="metric")
        plt.savefig("scores-art-end.pdf")
        plt.clf()

        df_end_clusters = pd.DataFrame.from_records(records_end_clusters)
        _write_tex_rows(df_end_clusters.pivot(index="vigilance", columns="metric", values="n_forms"), "clusters-art-end.tex")
        sns.lineplot(data=df_end_clusters, x="vigilance", y="n_forms", hue="metric")
        plt.savefig("clusters-art-end.pdf")
        plt.clf()
    if iterated:
        print("Plotting graphs.")
        df_course_scores = pd.DataFrame.from_records(records_course_scores)
        sns.lineplot(data=df_course_scores, x="timestep", y="score", hue="metric", style="vigilance")
        plt.savefig(f"scores-art-course-batch{batch_size}.pdf")
        plt.clf()

        df_course_clusters = pd.DataFrame.from_records(records_course_clusters)
        sns.lineplot(data=df_course_clusters, x="timestep", y="n_forms", hue="metric", style="vigilance")
        plt.savefig(f"clusters-art-course-batch{batch_size}.pdf")
        plt.clf()
        print("Done plotting.")


def _silhouette_or_nan(X, labels):
    """Hamming silhouette score of ``labels`` on ``X``, or NaN if undefined.

    scikit-learn raises ``ValueError`` unless the number of distinct labels is
    between 2 and ``n_samples - 1``; returning NaN instead keeps a vigilance
    sweep running when one setting yields a degenerate clustering.
    """
    n_labels = len(np.unique(labels))
    if n_labels < 2 or n_labels >= len(labels):
        return float("nan")
    return silhouette_score(X=X, labels=labels, metric="hamming")


def _write_tex_rows(table, path):
    """Write ``table`` to ``path`` as '&'-separated rows ending in a LaTeX row break."""
    row_end = "\\\\\n"
    try:
        table.to_csv(path, sep="&", lineterminator=row_end)
    except TypeError:
        # pandas < 1.5 only knows the keyword under its old name.
        table.to_csv(path, sep="&", line_terminator=row_end)

# Single-pass ART on the forms with zeroed inflection columns, sweeping the
# vigilance over np.arange(0, 1.05, 0.05) and plotting the clustering for
# each value.
art_one(
    forms_empty_inflections_onehot,
    inflections,
    cogids,
    vigilances=np.arange(0, 1.05, 0.05),
    data_plot=True,
)



## ART iterated, for different batch sizes

In [None]:
# Iterated ART for a range of batch sizes: 20 runs of 500 timesteps each, at
# three vigilance levels. Batches are drawn with np.random.choice inside
# art_iterated and no seed is set here, so results vary between executions.
for bs in (10, 20, 50, 100, 200, 500):
    print(f"Batch size: {bs}")
    art_iterated(
        forms_empty_inflections_onehot,
        n_runs=20,
        n_timesteps=500,
        batch_size=bs,
        inflections_gold=inflections,
        cogids=cogids,
        vigilances=[0.25, 0.5, 0.75],
    )


In [None]:
# TODO:
# Create iterated-learning() method, with repeated ART
# Evaluate generalization parameter
# Evaluate clustering using B-Cubed, or a method corresponding to the sklearn clustering section. Evaluation per inflection class
# Multiple runs, statistics

In [None]:
## Iff Language_of_the_etymon is Latin, it has inflection classes
# print(cognates_df[~cognates_df["Latin_Conjugation"].isnull()]["Language_of_the_etymon"].unique()) # only if
# print(cognates_df[cognates_df["Latin_Conjugation"].isnull()]["Language_of_the_etymon"].unique()) # if

## Get lemma for every form in cognateset id list
#cognates_df_lookup = cognates_df.set_index("ID")
#forms_df["inflection_classes"] = forms_df["Cognateset_ID"].apply(lambda cids: [cognates_df_lookup.loc[cid]["Latin_Conjugation"] for cid in cids])